mac_sched.c revision 10491:8893b747ecdf
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/callb.h>
#include <sys/sdt.h>
#include <sys/strsubr.h>
#include <sys/strsun.h>
#include <sys/vlan.h>
#include <inet/ipsec_impl.h>
#include <inet/ip_impl.h>
#include <inet/sadb.h>
#include <inet/ipsecesp.h>
#include <inet/ipsecah.h>
#include <inet/ip6.h>

#include <sys/mac_impl.h>
#include <sys/mac_client_impl.h>
#include <sys/mac_client_priv.h>
#include <sys/mac_soft_ring.h>
#include <sys/mac_flow_impl.h>

static mac_tx_cookie_t mac_tx_single_ring_mode(mac_soft_ring_set_t *, mblk_t *,
    uintptr_t, uint16_t, mblk_t **);
static mac_tx_cookie_t mac_tx_serializer_mode(mac_soft_ring_set_t *, mblk_t *,
    uintptr_t, uint16_t, mblk_t **);
static mac_tx_cookie_t mac_tx_fanout_mode(mac_soft_ring_set_t *, mblk_t *,
    uintptr_t, uint16_t, mblk_t **);
static mac_tx_cookie_t mac_tx_bw_mode(mac_soft_ring_set_t *, mblk_t *,
    uintptr_t, uint16_t, mblk_t **);

typedef struct mac_tx_mode_s {
	mac_tx_srs_mode_t	mac_tx_mode;
	mac_tx_func_t		mac_tx_func;
} mac_tx_mode_t;

/*
 * There are five modes of operation on the Tx side. These modes get set
 * in mac_tx_srs_setup(). Except for the experimental TX_SERIALIZE mode,
 * none of the other modes are user configurable. They get selected by
 * the system depending upon whether the link (or flow) has multiple Tx
 * rings or a bandwidth configured, etc.
 */
mac_tx_mode_t mac_tx_mode_list[] = {
	{SRS_TX_DEFAULT,	mac_tx_single_ring_mode},
	{SRS_TX_SERIALIZE,	mac_tx_serializer_mode},
	{SRS_TX_FANOUT,		mac_tx_fanout_mode},
	{SRS_TX_BW,		mac_tx_bw_mode},
	{SRS_TX_BW_FANOUT,	mac_tx_bw_mode}
};

/*
 * Soft Ring Set (SRS) - The run time code that deals with
 * dynamic polling from the hardware, bandwidth enforcement,
 * fanout etc.
 *
 * We try to use H/W classification on the NIC and assign traffic for
 * a MAC address to a particular Rx ring or ring group. There is a
 * 1-1 mapping between an SRS and an Rx ring. The SRS dynamically
 * switches the underlying Rx ring between interrupt and
 * polling mode and enforces any specified B/W control.
 *
 * There is always an SRS created and tied to each H/W and S/W rule.
 * Whenever we create a H/W rule, we always add the same rule to the
 * S/W classifier and tie an SRS to it.
 *
 * In case a B/W control is specified, it is broken into bytes
 * per tick and as soon as the quota for a tick is exhausted,
 * the underlying Rx ring is forced into poll mode for the remainder
 * of the tick. The SRS poll thread only polls for bytes that are
 * allowed to come into the SRS. We typically let 4x the configured
 * B/W worth of packets come into the SRS (to prevent unnecessary
 * drops due to bursts) but only process the specified amount.
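 *
 * As a worked example (illustrative numbers only): with a 100 Mbps
 * limit and hz = 100, the per tick quota is 100000000 / 8 / 100 =
 * 125000 bytes, so up to about 4 * 125000 = 500000 bytes worth of
 * backlog may sit in the SRS before the poll thread stops pulling
 * packets from the H/W for that tick.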
 *
 * A MAC client (e.g. a VNIC or aggr) can have 1 or more
 * Rx rings (and corresponding SRSs) assigned to it. The SRS
 * in turn can have softrings to do protocol level fanout or
 * softrings to do S/W based fanout or both. In case the NIC
 * has no Rx rings, we do S/W classification to the respective SRS.
 * The S/W classification rule is always setup and ready. This
 * allows the MAC layer to reassign Rx rings whenever needed
 * but packets still continue to flow via the default path and
 * get S/W classified to the correct SRS.
 *
 * The SRSs are used on both the Tx and Rx side. They use the same
 * data structure but the processing routines have slightly different
 * semantics due to the fact that the Rx side needs to do dynamic
 * polling etc.
 *
 * Dynamic Polling Notes
 * =====================
 *
 * Each soft ring set is capable of switching its Rx ring between
 * interrupt and poll mode and actively 'polls' for packets in
 * poll mode. If the SRS is implementing a B/W limit, it makes
 * sure that only the max allowed packets are pulled in poll mode
 * and goes to poll mode as soon as the B/W limit is exceeded. As
 * such, there are no overheads to implement B/W limits.
 *
 * In poll mode, it's better to keep the pipeline going where the
 * SRS worker thread keeps processing packets and the poll thread
 * keeps bringing more packets (especially if they get to run
 * on different CPUs). This also prevents the overheads associated
 * with excessive signalling (on NUMA machines, this can be
 * pretty devastating). The exception is the latency optimized case
 * where the worker thread does no work and the interrupt and poll
 * threads are allowed to do their own drain.
 *
 * We use the following policy to control Dynamic Polling:
 * 1) We switch to poll mode anytime the processing
 *    thread causes a backlog to build up in the SRS and
 *    its associated Soft Rings (sr_poll_pkt_cnt > 0).
 * 2) As long as the backlog stays under the low water
 *    mark (sr_lowat), we poll the H/W for more packets.
 * 3) If the backlog (sr_poll_pkt_cnt) exceeds the low
 *    water mark, we stay in poll mode but don't poll
 *    the H/W for more packets.
 * 4) Anytime in polling mode, if we poll the H/W for
 *    packets and find nothing plus we have an existing
 *    backlog (sr_poll_pkt_cnt > 0), we stay in polling
 *    mode but don't poll the H/W for packets anymore
 *    (let the polling thread go to sleep).
 * 5) Once the backlog is relieved (packets are processed)
 *    we reenable polling (by signalling the poll thread)
 *    only when the backlog dips below sr_poll_thres.
 * 6) sr_hiwat is used exclusively when we are not
 *    polling capable and is used to decide when to
 *    drop packets so the SRS queue length doesn't grow
 *    infinitely.
 *
 * NOTE: Also see the block level comment on top of mac_soft_ring.c
 */
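
/*
 * The policy above condenses into the checks the routines below make
 * (an illustrative sketch only; the authoritative logic lives in
 * mac_rx_srs_poll_ring() and mac_rx_srs_drain()):
 *
 *	if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat)
 *		MAC_SRS_POLL_RING(mac_srs);
 *	else
 *		stay in poll mode but let the poll thread sleep; the
 *		soft rings signal it again once sr_poll_pkt_cnt drops
 *		below sr_poll_thres.
 */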

/*
 * mac_latency_optimize
 *
 * Controls whether the poll thread can process the packets inline
 * or should let the SRS worker thread do the processing. This applies
 * if the SRS was not being processed. For latency sensitive traffic,
 * this needs to be true to allow inline processing. For throughput
 * under load, this should be false.
 *
 * This tunable (and other similar ones) should be rolled into a link
 * or flow specific workload hint that can be set using dladm
 * linkprop (instead of multiple such tunables).
 */
boolean_t mac_latency_optimize = B_TRUE;
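
/*
 * For experimentation, such a tunable can be changed on a live system
 * with mdb, or set at boot via /etc/system (illustrative usage only;
 * the variable names are the ones declared in this file):
 *
 *	# echo 'mac_latency_optimize/W 0' | mdb -kw
 *
 *	set mac:mac_latency_optimize = 0
 */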

/*
 * MAC_RX_SRS_ENQUEUE_CHAIN and MAC_TX_SRS_ENQUEUE_CHAIN
 *
 * Queue an mp or chain in the soft ring set and increment the
 * local count (srs_count) for the SRS and the shared counter
 * (srs_poll_pkt_cnt - shared between the SRS and its soft rings
 * to track the total unprocessed packets for polling to work
 * correctly).
 *
 * The size (total bytes queued) counters are incremented only
 * if we are doing B/W control.
 */
#define	MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) {		\
	ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock));			\
	if ((mac_srs)->srs_last != NULL)				\
		(mac_srs)->srs_last->b_next = (head);			\
	else								\
		(mac_srs)->srs_first = (head);				\
	(mac_srs)->srs_last = (tail);					\
	(mac_srs)->srs_count += count;					\
}

#define	MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) {	\
	mac_srs_rx_t	*srs_rx = &(mac_srs)->srs_rx;			\
									\
	MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz);		\
	srs_rx->sr_poll_pkt_cnt += count;				\
	ASSERT(srs_rx->sr_poll_pkt_cnt > 0);				\
	if ((mac_srs)->srs_type & SRST_BW_CONTROL) {			\
		(mac_srs)->srs_size += (sz);				\
		mutex_enter(&(mac_srs)->srs_bw->mac_bw_lock);		\
		(mac_srs)->srs_bw->mac_bw_sz += (sz);			\
		mutex_exit(&(mac_srs)->srs_bw->mac_bw_lock);		\
	}								\
}

#define	MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) {	\
	mac_srs->srs_state |= SRS_ENQUEUED;				\
	MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz);		\
	if ((mac_srs)->srs_type & SRST_BW_CONTROL) {			\
		(mac_srs)->srs_size += (sz);				\
		(mac_srs)->srs_bw->mac_bw_sz += (sz);			\
	}								\
}

/*
 * Turn polling on routines
 */
#define	MAC_SRS_POLLING_ON(mac_srs) {					\
	ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock));			\
	if (((mac_srs)->srs_state &					\
	    (SRS_POLLING_CAPAB|SRS_POLLING)) == SRS_POLLING_CAPAB) {	\
		(mac_srs)->srs_state |= SRS_POLLING;			\
		(void) mac_hwring_disable_intr((mac_ring_handle_t)	\
		    (mac_srs)->srs_ring);				\
		(mac_srs)->srs_rx.sr_poll_on++;				\
	}								\
}

#define	MAC_SRS_WORKER_POLLING_ON(mac_srs) {				\
	ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock));			\
	if (((mac_srs)->srs_state &					\
	    (SRS_POLLING_CAPAB|SRS_WORKER|SRS_POLLING)) ==		\
	    (SRS_POLLING_CAPAB|SRS_WORKER)) {				\
		(mac_srs)->srs_state |= SRS_POLLING;			\
		(void) mac_hwring_disable_intr((mac_ring_handle_t)	\
		    (mac_srs)->srs_ring);				\
		(mac_srs)->srs_rx.sr_worker_poll_on++;			\
	}								\
}

/*
 * MAC_SRS_POLL_RING
 *
 * Signal the SRS poll thread to poll the underlying H/W ring
 * provided it wasn't already polling (SRS_GET_PKTS was set).
 *
 * The poll thread gets to run only from mac_rx_srs_drain() and only
 * if the drain was being done by the worker thread.
 */
#define	MAC_SRS_POLL_RING(mac_srs) {					\
	mac_srs_rx_t	*srs_rx = &(mac_srs)->srs_rx;			\
									\
	ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock));			\
	srs_rx->sr_poll_thr_sig++;					\
	if (((mac_srs)->srs_state &					\
	    (SRS_POLLING_CAPAB|SRS_WORKER|SRS_GET_PKTS)) ==		\
		(SRS_WORKER|SRS_POLLING_CAPAB)) {			\
		(mac_srs)->srs_state |= SRS_GET_PKTS;			\
		cv_signal(&(mac_srs)->srs_cv);				\
	} else {							\
		srs_rx->sr_poll_thr_busy++;				\
	}								\
}

/*
 * MAC_SRS_CHECK_BW_CONTROL
 *
 * Check to see if the next tick has started so we can reset the
 * SRS_BW_ENFORCED flag and allow more packets to come into the
 * system.
 */
#define	MAC_SRS_CHECK_BW_CONTROL(mac_srs) {				\
	ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock));			\
	ASSERT(((mac_srs)->srs_type & SRST_TX) ||			\
	    MUTEX_HELD(&(mac_srs)->srs_bw->mac_bw_lock));		\
	if ((mac_srs)->srs_bw->mac_bw_curr_time != lbolt) {		\
		(mac_srs)->srs_bw->mac_bw_curr_time = lbolt;		\
		(mac_srs)->srs_bw->mac_bw_used = 0;			\
		if ((mac_srs)->srs_bw->mac_bw_state & SRS_BW_ENFORCED)	\
			(mac_srs)->srs_bw->mac_bw_state &= ~SRS_BW_ENFORCED; \
	}								\
}

/*
 * MAC_SRS_WORKER_WAKEUP
 *
 * Wake up the SRS worker thread to process the queue as long as
 * no one else is processing the queue. If we are optimizing for
 * latency, we wake up the worker thread immediately or else we
 * wait mac_srs_worker_wakeup_ticks before the worker thread gets
 * woken up.
 */
int mac_srs_worker_wakeup_ticks = 0;
#define	MAC_SRS_WORKER_WAKEUP(mac_srs) {				\
	ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock));			\
	if (!((mac_srs)->srs_state & SRS_PROC) &&			\
		(mac_srs)->srs_tid == NULL) {				\
		if (((mac_srs)->srs_state & SRS_LATENCY_OPT) ||		\
			(mac_srs_worker_wakeup_ticks == 0))		\
			cv_signal(&(mac_srs)->srs_async);		\
		else							\
			(mac_srs)->srs_tid =				\
				timeout(mac_srs_fire, (mac_srs),	\
					mac_srs_worker_wakeup_ticks);	\
	}								\
}

#define	TX_SINGLE_RING_MODE(mac_srs)				\
	((mac_srs)->srs_tx.st_mode == SRS_TX_DEFAULT ||		\
	    (mac_srs)->srs_tx.st_mode == SRS_TX_SERIALIZE ||	\
	    (mac_srs)->srs_tx.st_mode == SRS_TX_BW)

#define	TX_BANDWIDTH_MODE(mac_srs)				\
	((mac_srs)->srs_tx.st_mode == SRS_TX_BW ||		\
	    (mac_srs)->srs_tx.st_mode == SRS_TX_BW_FANOUT)

#define	TX_SRS_TO_SOFT_RING(mac_srs, head, hint) {			\
	uint_t hash, indx;						\
	hash = HASH_HINT(hint);						\
	indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count);	\
	softring = mac_srs->srs_oth_soft_rings[indx];			\
	(void) (mac_tx_soft_ring_process(softring, head, 0, NULL));	\
}

/*
 * MAC_TX_SRS_BLOCK
 *
 * Always called from the mac_tx_srs_drain() function. SRS_TX_BLOCKED
 * will be set only if srs_tx_woken_up is FALSE. If
 * srs_tx_woken_up is TRUE, it indicates that the wakeup arrived
 * before we grabbed srs_lock to set SRS_TX_BLOCKED. We need to
 * attempt to transmit again, and not setting SRS_TX_BLOCKED does
 * that.
 */
#define	MAC_TX_SRS_BLOCK(srs, mp)	{			\
	ASSERT(MUTEX_HELD(&(srs)->srs_lock));			\
	if ((srs)->srs_tx.st_woken_up) {			\
		(srs)->srs_tx.st_woken_up = B_FALSE;		\
	} else {						\
		ASSERT(!((srs)->srs_state & SRS_TX_BLOCKED));	\
		(srs)->srs_state |= SRS_TX_BLOCKED;		\
		(srs)->srs_tx.st_blocked_cnt++;			\
	}							\
}

/*
 * MAC_TX_SRS_TEST_HIWAT
 *
 * Called before queueing a packet onto a Tx SRS to test and set
 * SRS_TX_HIWAT if srs_count exceeds srs_tx_hiwat.
 */
#define	MAC_TX_SRS_TEST_HIWAT(srs, mp, tail, cnt, sz, cookie) {		\
	boolean_t enqueue = 1;						\
									\
	if ((srs)->srs_count > (srs)->srs_tx.st_hiwat) {		\
		/*							\
		 * Flow-controlled. Store srs in cookie so that it	\
		 * can be returned as mac_tx_cookie_t to the client.	\
		 */							\
		(srs)->srs_state |= SRS_TX_HIWAT;			\
		cookie = (mac_tx_cookie_t)srs;				\
		(srs)->srs_tx.st_hiwat_cnt++;				\
		if ((srs)->srs_count > (srs)->srs_tx.st_max_q_cnt) {	\
			/* increment freed stats */			\
			(srs)->srs_tx.st_drop_count += cnt;		\
			/*						\
			 * b_prev may be set to the fanout hint		\
			 * hence can't use freemsg directly.		\
			 */						\
			mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE);	\
			DTRACE_PROBE1(tx_queued_hiwat,			\
			    mac_soft_ring_set_t *, srs);		\
			enqueue = 0;					\
		}							\
	}								\
	if (enqueue)							\
		MAC_TX_SRS_ENQUEUE_CHAIN(srs, mp, tail, cnt, sz);	\
}

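/*
 * Caller-side sketch (an assumption-laden illustration, not the
 * authoritative contract; see mac_tx() in mac_client.c): a non-NULL
 * cookie coming back from the Tx path means the SRS is flow
 * controlled and the client should hold further transmits until its
 * Tx notify callback fires.
 *
 *	mac_tx_cookie_t	cookie;
 *	mblk_t		*ret_mp = NULL;
 *
 *	cookie = mac_tx(mch, mp_chain, fanout_hint, 0, &ret_mp);
 *	if (cookie != NULL) {
 *		back off and wait for the wakeup before sending more
 *	}
 */
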
/* Some utility macros */
#define	MAC_SRS_BW_LOCK(srs)						\
	if (!(srs->srs_type & SRST_TX))					\
		mutex_enter(&srs->srs_bw->mac_bw_lock);

#define	MAC_SRS_BW_UNLOCK(srs)						\
	if (!(srs->srs_type & SRST_TX))					\
		mutex_exit(&srs->srs_bw->mac_bw_lock);

#define	MAC_TX_SRS_DROP_MESSAGE(srs, mp, cookie) {		\
	mac_pkt_drop(NULL, NULL, mp, B_FALSE);			\
	/* increment freed stats */				\
	mac_srs->srs_tx.st_drop_count++;			\
	cookie = (mac_tx_cookie_t)srs;				\
}

#define	MAC_TX_SET_NO_ENQUEUE(srs, mp_chain, ret_mp, cookie) {		\
	mac_srs->srs_state |= SRS_TX_WAKEUP_CLIENT;			\
	cookie = (mac_tx_cookie_t)srs;					\
	*ret_mp = mp_chain;						\
}

/*
 * Drop the Rx packet and advance to the next one in the chain.
 */
static void
mac_rx_drop_pkt(mac_soft_ring_set_t *srs, mblk_t *mp)
{
	mac_srs_rx_t	*srs_rx = &srs->srs_rx;

	ASSERT(mp->b_next == NULL);
	mutex_enter(&srs->srs_lock);
	MAC_UPDATE_SRS_COUNT_LOCKED(srs, 1);
	MAC_UPDATE_SRS_SIZE_LOCKED(srs, msgdsize(mp));
	mutex_exit(&srs->srs_lock);

	srs_rx->sr_drop_count++;
	freemsg(mp);
}

/* DATAPATH RUNTIME ROUTINES */

/*
 * mac_srs_fire
 *
 * Timer callback routine for waking up the SRS worker thread.
 */
static void
mac_srs_fire(void *arg)
{
	mac_soft_ring_set_t *mac_srs = (mac_soft_ring_set_t *)arg;

	mutex_enter(&mac_srs->srs_lock);
	if (mac_srs->srs_tid == 0) {
		mutex_exit(&mac_srs->srs_lock);
		return;
	}

	mac_srs->srs_tid = 0;
	if (!(mac_srs->srs_state & SRS_PROC))
		cv_signal(&mac_srs->srs_async);

	mutex_exit(&mac_srs->srs_lock);
}

/*
 * 'hint' is the fanout_hint (of type uint64_t) given by the TCP/IP
 * stack, and it is used on the Tx path.
 */
#define	HASH_HINT(hint)	(((hint) << 17) | ((hint) >> 16))

/*
 * Hash based on the src address and the port information.
 */
#define	HASH_ADDR(src, ports)					\
	(ntohl((src)) ^ ((ports) >> 24) ^ ((ports) >> 16) ^	\
	((ports) >> 8) ^ (ports))

#define	COMPUTE_INDEX(key, sz)	(key % sz)

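/*
 * Illustrative use of the two hashes above (mirrors the fanout code
 * later in this file): packets of one TCP connection always hash to
 * the same index, so per-connection ordering is preserved across the
 * soft rings.
 *
 *	hash = HASH_ADDR(ipha->ipha_src,
 *	    *(uint32_t *)(mp->b_rptr + ports_offset));
 *	indx = COMPUTE_INDEX(hash, mac_srs->srs_tcp_ring_count);
 *	softring = mac_srs->srs_tcp_soft_rings[indx];
 */
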
#define	FANOUT_ENQUEUE_MP(head, tail, cnt, bw_ctl, sz, sz0, mp) {	\
	if ((tail) != NULL) {						\
		ASSERT((tail)->b_next == NULL);				\
		(tail)->b_next = (mp);					\
	} else {							\
		ASSERT((head) == NULL);					\
		(head) = (mp);						\
	}								\
	(tail) = (mp);							\
	(cnt)++;							\
	if ((bw_ctl))							\
		(sz) += (sz0);						\
}

#define	MAC_FANOUT_DEFAULT	0
#define	MAC_FANOUT_RND_ROBIN	1
int mac_fanout_type = MAC_FANOUT_DEFAULT;

#define	MAX_SR_TYPES	3
/* fanout types for port based hashing */
enum pkt_type {
	V4_TCP = 0,
	V4_UDP,
	OTH,
	UNDEF
};

/*
 * In general we do port based hashing to spread traffic over the
 * different softrings. The tunable below allows overriding that
 * behavior. Setting it to B_TRUE makes the fanout be based on the
 * src IPv6 address. Src based fanout is also used for IPv6 packets
 * carrying multiple optional headers and other uncommon packet
 * types.
 */
boolean_t mac_src_ipv6_fanout = B_FALSE;
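
/*
 * With src based fanout, the index computation reduces to the
 * following (mirrors src_based_fanout in mac_rx_srs_long_fanout()),
 * i.e. all flows from one source land on the same soft ring:
 *
 *	hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src), (uint32_t)0);
 *	indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count);
 */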

/*
 * Pair of local and remote ports in the transport header
 */
#define	PORTS_SIZE 4

/*
 * mac_rx_srs_proto_fanout
 *
 * This routine delivers packets destined for an SRS into one of the
 * protocol soft rings.
 *
 * Given a chain of packets we need to split it up into multiple sub
 * chains destined for the TCP, UDP or OTH soft rings. Instead of
 * entering the soft ring one packet at a time, we want to enter it in
 * the form of a chain; otherwise we get start/stop behaviour where the
 * worker thread goes to sleep and then the next packet comes in,
 * forcing it to wake up, etc.
 */
static void
mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
{
	struct ether_header		*ehp;
	struct ether_vlan_header	*evhp;
	uint32_t			sap;
	ipha_t				*ipha;
	uint8_t				*dstaddr;
	size_t				hdrsize;
	mblk_t				*mp;
	mblk_t				*headmp[MAX_SR_TYPES];
	mblk_t				*tailmp[MAX_SR_TYPES];
	int				cnt[MAX_SR_TYPES];
	size_t				sz[MAX_SR_TYPES];
	size_t				sz1;
	boolean_t			bw_ctl;
	boolean_t			hw_classified;
	boolean_t			dls_bypass;
	boolean_t			is_ether;
	boolean_t			is_unicast;
	enum pkt_type			type;
	mac_client_impl_t		*mcip = mac_srs->srs_mcip;

	is_ether = (mcip->mci_mip->mi_info.mi_nativemedia == DL_ETHER);
	bw_ctl = ((mac_srs->srs_type & SRST_BW_CONTROL) != 0);

	/*
	 * If we don't have an Rx ring, S/W classification would have done
	 * its job and it's a packet meant for us. If we were polling on
	 * the default ring (i.e. there was a ring assigned to this SRS),
	 * then we need to make sure that the mac address really belongs
	 * to us.
	 */
	hw_classified = mac_srs->srs_ring != NULL &&
	    mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER;

	/*
	 * Special clients (e.g. VLAN, non-ether, etc) need DLS
	 * processing in the Rx path. SRST_DLS_BYPASS will be clear for
	 * such SRSs.
	 */
	dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0);

	bzero(headmp, MAX_SR_TYPES * sizeof (mblk_t *));
	bzero(tailmp, MAX_SR_TYPES * sizeof (mblk_t *));
	bzero(cnt, MAX_SR_TYPES * sizeof (int));
	bzero(sz, MAX_SR_TYPES * sizeof (size_t));

	/*
	 * We got a chain from the SRS that we need to send to the soft
	 * rings. Since squeues for TCP & IPv4 sap poll their soft rings
	 * (for performance reasons), we need to separate out v4_tcp,
	 * v4_udp and the rest goes in other.
	 */
	while (head != NULL) {
		mp = head;
		head = head->b_next;
		mp->b_next = NULL;

		type = OTH;
		sz1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp);

		if (is_ether) {
			/*
			 * At this point we can be sure the packet at least
			 * has an ether header.
			 */
			if (sz1 < sizeof (struct ether_header)) {
				mac_rx_drop_pkt(mac_srs, mp);
				continue;
			}
			ehp = (struct ether_header *)mp->b_rptr;

			/*
			 * Determine if this is a VLAN or non-VLAN packet.
			 */
			if ((sap = ntohs(ehp->ether_type)) == VLAN_TPID) {
				evhp = (struct ether_vlan_header *)mp->b_rptr;
				sap = ntohs(evhp->ether_type);
				hdrsize = sizeof (struct ether_vlan_header);
				/*
				 * Check if the VID of the packet, if any,
				 * belongs to this client.
				 */
				if (!mac_client_check_flow_vid(mcip,
				    VLAN_ID(ntohs(evhp->ether_tci)))) {
					mac_rx_drop_pkt(mac_srs, mp);
					continue;
				}
			} else {
				hdrsize = sizeof (struct ether_header);
			}
			is_unicast =
			    ((((uint8_t *)&ehp->ether_dhost)[0] & 0x01) == 0);
			dstaddr = (uint8_t *)&ehp->ether_dhost;
		} else {
			mac_header_info_t		mhi;

			if (mac_header_info((mac_handle_t)mcip->mci_mip,
			    mp, &mhi) != 0) {
				mac_rx_drop_pkt(mac_srs, mp);
				continue;
			}
			hdrsize = mhi.mhi_hdrsize;
			sap = mhi.mhi_bindsap;
			is_unicast = (mhi.mhi_dsttype == MAC_ADDRTYPE_UNICAST);
			dstaddr = (uint8_t *)mhi.mhi_daddr;
		}

		if (!dls_bypass) {
			FANOUT_ENQUEUE_MP(headmp[type], tailmp[type],
			    cnt[type], bw_ctl, sz[type], sz1, mp);
			continue;
		}

		if (sap == ETHERTYPE_IP) {
			/*
			 * If we are H/W classified, but we have promisc
			 * on, then we need to check for the unicast address.
			 */
			if (hw_classified && mcip->mci_promisc_list != NULL) {
				mac_address_t		*map;

				rw_enter(&mcip->mci_rw_lock, RW_READER);
				map = mcip->mci_unicast;
				if (bcmp(dstaddr, map->ma_addr,
				    map->ma_len) == 0)
					type = UNDEF;
				rw_exit(&mcip->mci_rw_lock);
			} else if (is_unicast) {
				type = UNDEF;
			}
		}

		/*
		 * This needs to become a contract with the driver for
		 * the fast path.
		 *
		 * In the normal case the packet will have at least the L2
		 * header and the IP + transport header in the same mblk.
		 * This is usually the case when the NIC driver sends up
		 * the packet. This is also true when the stack generates
		 * a packet that is looped back and when the stack uses the
		 * fastpath mechanism. The normal case is optimized for
		 * performance and may bypass DLS. All other cases go through
		 * the 'OTH' type path without DLS bypass.
		 */

		ipha = (ipha_t *)(mp->b_rptr + hdrsize);
		if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha))
			type = OTH;

		if (type == OTH) {
			FANOUT_ENQUEUE_MP(headmp[type], tailmp[type],
			    cnt[type], bw_ctl, sz[type], sz1, mp);
			continue;
		}

		ASSERT(type == UNDEF);
		/*
		 * We look for at least 4 bytes past the IP header to get
		 * the port information. If we get an IP fragment, we don't
		 * have the port information, and we use just the protocol
		 * information.
		 */
		switch (ipha->ipha_protocol) {
		case IPPROTO_TCP:
			type = V4_TCP;
			mp->b_rptr += hdrsize;
			break;
		case IPPROTO_UDP:
			type = V4_UDP;
			mp->b_rptr += hdrsize;
			break;
		default:
			type = OTH;
			break;
		}

		FANOUT_ENQUEUE_MP(headmp[type], tailmp[type], cnt[type],
		    bw_ctl, sz[type], sz1, mp);
	}

	for (type = V4_TCP; type < UNDEF; type++) {
		if (headmp[type] != NULL) {
			mac_soft_ring_t			*softring;

			ASSERT(tailmp[type]->b_next == NULL);
			switch (type) {
			case V4_TCP:
				softring = mac_srs->srs_tcp_soft_rings[0];
				break;
			case V4_UDP:
				softring = mac_srs->srs_udp_soft_rings[0];
				break;
			case OTH:
				softring = mac_srs->srs_oth_soft_rings[0];
			}
			mac_rx_soft_ring_process(mcip, softring,
			    headmp[type], tailmp[type], cnt[type], sz[type]);
		}
	}
}

int	fanout_unalligned = 0;

/*
 * mac_rx_srs_long_fanout
 *
 * The fanout routine for IPv6
 */
static int
mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp,
    uint32_t sap, size_t hdrsize, enum pkt_type *type, uint_t *indx)
{
	ip6_t		*ip6h;
	uint8_t		*whereptr;
	uint_t		hash;
	uint16_t	remlen;
	uint8_t		nexthdr;
	uint16_t	hdr_len;

	if (sap == ETHERTYPE_IPV6) {
		boolean_t	modifiable = B_TRUE;

		ASSERT(MBLKL(mp) >= hdrsize);

		ip6h = (ip6_t *)(mp->b_rptr + hdrsize);
		if ((unsigned char *)ip6h == mp->b_wptr) {
			/*
			 * The first mblk_t only includes the mac header.
			 * Note that it is safe to change the mp pointer here,
			 * as the subsequent operation does not assume mp
			 * points to the start of the mac header.
			 */
			mp = mp->b_cont;

			/*
			 * Make sure ip6h holds the full ip6_t structure.
			 */
			if (mp == NULL)
				return (-1);

			if (MBLKL(mp) < IPV6_HDR_LEN) {
				modifiable = (DB_REF(mp) == 1);

				if (modifiable &&
				    !pullupmsg(mp, IPV6_HDR_LEN)) {
					return (-1);
				}
			}

			ip6h = (ip6_t *)mp->b_rptr;
		}

		if (!modifiable || !(OK_32PTR((char *)ip6h)) ||
		    ((unsigned char *)ip6h + IPV6_HDR_LEN > mp->b_wptr)) {
			/*
			 * If either ip6h is not aligned, or ip6h does not
			 * hold the complete ip6_t structure (a pullupmsg()
			 * is not an option since it would result in an
			 * unaligned ip6h), fanout to the default ring. Note
			 * that this may cause packet reordering.
			 */
			*indx = 0;
			*type = OTH;
			fanout_unalligned++;
			return (0);
		}

		remlen = ntohs(ip6h->ip6_plen);
		nexthdr = ip6h->ip6_nxt;

		if (remlen < MIN_EHDR_LEN)
			return (-1);
		/*
		 * Do src based fanout if the tunable below is set to B_TRUE
		 * or when mac_ip_hdr_length_v6() fails because of malformed
		 * packets or because mblks need to be concatenated using
		 * pullupmsg().
		 */
		if (mac_src_ipv6_fanout || !mac_ip_hdr_length_v6(mp, ip6h,
		    &hdr_len, &nexthdr)) {
			goto src_based_fanout;
		}
		whereptr = (uint8_t *)ip6h + hdr_len;

		/* If the transport is one of the below, do port based fanout */
		switch (nexthdr) {
		case IPPROTO_TCP:
		case IPPROTO_UDP:
		case IPPROTO_SCTP:
		case IPPROTO_ESP:
			/*
			 * If the ports in the transport header are not part
			 * of the mblk, do src_based_fanout, instead of
			 * calling pullupmsg().
			 */
			if (mp->b_cont != NULL &&
			    whereptr + PORTS_SIZE > mp->b_wptr) {
				goto src_based_fanout;
			}
			break;
		default:
			break;
		}

		switch (nexthdr) {
		case IPPROTO_TCP:
			hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src),
			    *(uint32_t *)whereptr);
			*indx = COMPUTE_INDEX(hash,
			    mac_srs->srs_tcp_ring_count);
			*type = OTH;
			break;

		case IPPROTO_UDP:
		case IPPROTO_SCTP:
		case IPPROTO_ESP:
			if (mac_fanout_type == MAC_FANOUT_DEFAULT) {
				hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src),
				    *(uint32_t *)whereptr);
				*indx = COMPUTE_INDEX(hash,
				    mac_srs->srs_udp_ring_count);
			} else {
				*indx = mac_srs->srs_ind %
				    mac_srs->srs_udp_ring_count;
				mac_srs->srs_ind++;
			}
			*type = OTH;
			break;

			/* For all other protocols, do src based fanout */
		default:
			goto src_based_fanout;
		}
	} else {
		*indx = 0;
		*type = OTH;
	}
	return (0);

src_based_fanout:
	hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src), (uint32_t)0);
	*indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count);
	*type = OTH;
	return (0);
}

/*
 * mac_rx_srs_fanout
 *
 * This routine delivers packets destined for an SRS into a soft ring
 * member of the set.
 *
 * Given a chain of packets we need to split it up into multiple sub
 * chains destined for one of the TCP, UDP or OTH soft rings. Instead
 * of entering the soft ring one packet at a time, we want to enter it
 * in the form of a chain; otherwise we get start/stop behaviour where
 * the worker thread goes to sleep and then the next packet comes in,
 * forcing it to wake up, etc.
 *
 * Note:
 * Since we know the maximum fanout possible, we create a 2D array
 * of 'softring types * MAX_SR_FANOUT' for the head, tail, cnt and sz
 * variables so that we can enter the softrings with a chain. We need
 * the MAX_SR_FANOUT so we can allocate the arrays on the stack (a
 * kmem_alloc for each packet would be expensive). If we ever want to
 * have the ability to have unlimited fanout, we should probably
 * declare a head, tail, cnt, sz with each soft ring (a data struct
 * which contains a softring along with these members) and create an
 * array of this uber struct so we don't have to do kmem_alloc.
 */
int	fanout_oth1 = 0;
int	fanout_oth2 = 0;
int	fanout_oth3 = 0;
int	fanout_oth4 = 0;
int	fanout_oth5 = 0;

static void
mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
{
	struct ether_header		*ehp;
	struct ether_vlan_header	*evhp;
	uint32_t			sap;
	ipha_t				*ipha;
	uint8_t				*dstaddr;
	uint_t				indx;
	size_t				ports_offset;
	size_t				ipha_len;
	size_t				hdrsize;
	uint_t				hash;
	mblk_t				*mp;
	mblk_t				*headmp[MAX_SR_TYPES][MAX_SR_FANOUT];
	mblk_t				*tailmp[MAX_SR_TYPES][MAX_SR_FANOUT];
	int				cnt[MAX_SR_TYPES][MAX_SR_FANOUT];
	size_t				sz[MAX_SR_TYPES][MAX_SR_FANOUT];
	size_t				sz1;
	boolean_t			bw_ctl;
	boolean_t			hw_classified;
	boolean_t			dls_bypass;
	boolean_t			is_ether;
	boolean_t			is_unicast;
	int				fanout_cnt;
	enum pkt_type			type;
	mac_client_impl_t		*mcip = mac_srs->srs_mcip;

	is_ether = (mcip->mci_mip->mi_info.mi_nativemedia == DL_ETHER);
	bw_ctl = ((mac_srs->srs_type & SRST_BW_CONTROL) != 0);

	/*
	 * If we don't have an Rx ring, S/W classification would have done
	 * its job and it's a packet meant for us. If we were polling on
	 * the default ring (i.e. there was a ring assigned to this SRS),
	 * then we need to make sure that the mac address really belongs
	 * to us.
	 */
	hw_classified = mac_srs->srs_ring != NULL &&
	    mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER;

	/*
	 * Special clients (e.g. VLAN, non-ether, etc) need DLS
	 * processing in the Rx path. SRST_DLS_BYPASS will be clear for
	 * such SRSs.
	 */
	dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0);

	/*
	 * Since the softrings are never destroyed and we always
	 * create an equal number of softrings for TCP, UDP and the
	 * rest, it's OK to check one of them for the count and use it
	 * without any lock. In the future, if soft rings get destroyed
	 * because of a reduction in fanout, we will need to ensure
	 * that happens behind the SRS_PROC.
	 */
	fanout_cnt = mac_srs->srs_tcp_ring_count;

	bzero(headmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *));
	bzero(tailmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *));
	bzero(cnt, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (int));
	bzero(sz, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (size_t));

	/*
	 * We got a chain from the SRS that we need to send to the soft
	 * rings. Since squeues for TCP & IPv4 sap poll their soft rings
	 * (for performance reasons), we need to separate out v4_tcp,
	 * v4_udp and the rest goes in other.
	 */
	while (head != NULL) {
		mp = head;
		head = head->b_next;
		mp->b_next = NULL;

		type = OTH;
		sz1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp);

		if (is_ether) {
			/*
			 * At this point we can be sure the packet at least
			 * has an ether header.
			 */
			if (sz1 < sizeof (struct ether_header)) {
				mac_rx_drop_pkt(mac_srs, mp);
				continue;
			}
			ehp = (struct ether_header *)mp->b_rptr;

			/*
			 * Determine if this is a VLAN or non-VLAN packet.
			 */
			if ((sap = ntohs(ehp->ether_type)) == VLAN_TPID) {
				evhp = (struct ether_vlan_header *)mp->b_rptr;
				sap = ntohs(evhp->ether_type);
				hdrsize = sizeof (struct ether_vlan_header);
				/*
				 * Check if the VID of the packet, if any,
				 * belongs to this client.
				 */
				if (!mac_client_check_flow_vid(mcip,
				    VLAN_ID(ntohs(evhp->ether_tci)))) {
					mac_rx_drop_pkt(mac_srs, mp);
					continue;
				}
			} else {
				hdrsize = sizeof (struct ether_header);
			}
			is_unicast =
			    ((((uint8_t *)&ehp->ether_dhost)[0] & 0x01) == 0);
			dstaddr = (uint8_t *)&ehp->ether_dhost;
		} else {
			mac_header_info_t		mhi;

			if (mac_header_info((mac_handle_t)mcip->mci_mip,
			    mp, &mhi) != 0) {
				mac_rx_drop_pkt(mac_srs, mp);
				continue;
			}
			hdrsize = mhi.mhi_hdrsize;
			sap = mhi.mhi_bindsap;
			is_unicast = (mhi.mhi_dsttype == MAC_ADDRTYPE_UNICAST);
			dstaddr = (uint8_t *)mhi.mhi_daddr;
		}

		if (!dls_bypass) {
			if (mac_rx_srs_long_fanout(mac_srs, mp, sap,
			    hdrsize, &type, &indx) == -1) {
				mac_rx_drop_pkt(mac_srs, mp);
				continue;
			}

			FANOUT_ENQUEUE_MP(headmp[type][indx],
			    tailmp[type][indx], cnt[type][indx], bw_ctl,
			    sz[type][indx], sz1, mp);
			continue;
		}

		/*
		 * If we are using the default Rx ring where H/W or S/W
		 * classification has not happened, we need to verify if
		 * this unicast packet really belongs to us.
		 */
		if (sap == ETHERTYPE_IP) {
			/*
			 * If we are H/W classified, but we have promisc
			 * on, then we need to check for the unicast address.
			 */
			if (hw_classified && mcip->mci_promisc_list != NULL) {
				mac_address_t		*map;

				rw_enter(&mcip->mci_rw_lock, RW_READER);
				map = mcip->mci_unicast;
				if (bcmp(dstaddr, map->ma_addr,
				    map->ma_len) == 0)
					type = UNDEF;
				rw_exit(&mcip->mci_rw_lock);
			} else if (is_unicast) {
				type = UNDEF;
			}
		}

		/*
		 * This needs to become a contract with the driver for
		 * the fast path.
		 */

		ipha = (ipha_t *)(mp->b_rptr + hdrsize);
		if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha)) {
			type = OTH;
			fanout_oth1++;
		}

		if (type != OTH) {
			uint16_t	frag_offset_flags;

			switch (ipha->ipha_protocol) {
			case IPPROTO_TCP:
			case IPPROTO_UDP:
			case IPPROTO_SCTP:
			case IPPROTO_ESP:
				ipha_len = IPH_HDR_LENGTH(ipha);
				if ((uchar_t *)ipha + ipha_len + PORTS_SIZE >
				    mp->b_wptr) {
					type = OTH;
					break;
				}
				frag_offset_flags =
				    ntohs(ipha->ipha_fragment_offset_and_flags);
				if ((frag_offset_flags &
				    (IPH_MF | IPH_OFFSET)) != 0) {
					type = OTH;
					fanout_oth3++;
					break;
				}
				ports_offset = hdrsize + ipha_len;
				break;
			default:
				type = OTH;
				fanout_oth4++;
				break;
			}
		}

		if (type == OTH) {
			if (mac_rx_srs_long_fanout(mac_srs, mp, sap,
			    hdrsize, &type, &indx) == -1) {
				mac_rx_drop_pkt(mac_srs, mp);
				continue;
			}

			FANOUT_ENQUEUE_MP(headmp[type][indx],
			    tailmp[type][indx], cnt[type][indx], bw_ctl,
			    sz[type][indx], sz1, mp);
			continue;
		}

		ASSERT(type == UNDEF);

		/*
		 * XXX-Sunay: We should hold srs_lock since ring_count
		 * below can change. But if we are always called from
		 * mac_rx_srs_drain and SRS_PROC is set, then we can
		 * enforce that ring_count can't be changed i.e.
		 * to change fanout type or ring count, the calling
		 * thread needs to be behind SRS_PROC.
		 */
		switch (ipha->ipha_protocol) {
		case IPPROTO_TCP:
			/*
			 * Note that for ESP, we fanout on SPI and it is at
			 * the same offset as the 2x16-bit ports. So it is
			 * clumped along with TCP, UDP and SCTP.
			 */
			hash = HASH_ADDR(ipha->ipha_src,
			    *(uint32_t *)(mp->b_rptr + ports_offset));
			indx = COMPUTE_INDEX(hash, mac_srs->srs_tcp_ring_count);
			type = V4_TCP;
			mp->b_rptr += hdrsize;
			break;
		case IPPROTO_UDP:
		case IPPROTO_SCTP:
		case IPPROTO_ESP:
			if (mac_fanout_type == MAC_FANOUT_DEFAULT) {
				hash = HASH_ADDR(ipha->ipha_src,
				    *(uint32_t *)(mp->b_rptr + ports_offset));
				indx = COMPUTE_INDEX(hash,
				    mac_srs->srs_udp_ring_count);
			} else {
				indx = mac_srs->srs_ind %
				    mac_srs->srs_udp_ring_count;
				mac_srs->srs_ind++;
			}
			type = V4_UDP;
			mp->b_rptr += hdrsize;
			break;
		default:
			indx = 0;
			type = OTH;
		}

		FANOUT_ENQUEUE_MP(headmp[type][indx], tailmp[type][indx],
		    cnt[type][indx], bw_ctl, sz[type][indx], sz1, mp);
	}

	for (type = V4_TCP; type < UNDEF; type++) {
		int	i;

		for (i = 0; i < fanout_cnt; i++) {
			if (headmp[type][i] != NULL) {
				mac_soft_ring_t	*softring;

				ASSERT(tailmp[type][i]->b_next == NULL);
				switch (type) {
				case V4_TCP:
					softring =
					    mac_srs->srs_tcp_soft_rings[i];
					break;
				case V4_UDP:
					softring =
					    mac_srs->srs_udp_soft_rings[i];
					break;
				case OTH:
					softring =
					    mac_srs->srs_oth_soft_rings[i];
					break;
				}
				mac_rx_soft_ring_process(mcip,
				    softring, headmp[type][i], tailmp[type][i],
				    cnt[type][i], sz[type][i]);
			}
		}
	}
}

#define	SRS_BYTES_TO_PICKUP	150000
ssize_t	max_bytes_to_pickup = SRS_BYTES_TO_PICKUP;

/*
 * mac_rx_srs_poll_ring
 *
 * The SRS poll thread uses this routine to poll the underlying hardware
 * Rx ring to get a chain of packets. It can inline process that chain
 * if mac_latency_optimize is set (default) or signal the SRS worker
 * thread to do the remaining processing.
 *
 * Since packets come into the system via the interrupt or poll path, we
 * also update the stats and deal with promiscuous clients here.
 */
void
mac_rx_srs_poll_ring(mac_soft_ring_set_t *mac_srs)
{
	kmutex_t		*lock = &mac_srs->srs_lock;
	kcondvar_t		*async = &mac_srs->srs_cv;
	mac_srs_rx_t		*srs_rx = &mac_srs->srs_rx;
	mblk_t			*head, *tail, *mp;
	callb_cpr_t		cprinfo;
	ssize_t			bytes_to_pickup;
	size_t			sz;
	int			count;
	mac_client_impl_t	*smcip;

	CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "mac_srs_poll");
	mutex_enter(lock);

start:
	for (;;) {
		if (mac_srs->srs_state & SRS_PAUSE)
			goto done;

		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		cv_wait(async, lock);
		CALLB_CPR_SAFE_END(&cprinfo, lock);

		if (mac_srs->srs_state & SRS_PAUSE)
			goto done;

check_again:
		if (mac_srs->srs_type & SRST_BW_CONTROL) {
			/*
			 * We pick as many bytes as we are allowed to queue.
			 * It's possible that we will exceed the total
			 * packets queued in case this SRS is part of the
			 * Rx ring group since > 1 poll thread can be pulling
			 * up to the max allowed packets at the same time
			 * but that should be OK.
			 */
			mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
			bytes_to_pickup =
			    mac_srs->srs_bw->mac_bw_drop_threshold -
			    mac_srs->srs_bw->mac_bw_sz;
			/*
			 * We shouldn't have been signalled if we
			 * have 0 or fewer bytes to pick but since
			 * some of the bytes accounting is driver
			 * dependent, we do the safety check.
			 */
			if (bytes_to_pickup < 0)
				bytes_to_pickup = 0;
			mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
		} else {
			/*
			 * TODO: Need to change the polling API
			 * to add a packet count and a flag which
			 * tells the driver whether we want packets
			 * based on a count, or bytes, or all the
			 * packets queued in the driver/HW. This
			 * way, we never have to check the limits
			 * on the poll path. We truly let only as many
			 * packets enter the system as we are willing
			 * to process or queue.
			 *
			 * Something along the lines of
			 * pkts_to_pickup = mac_soft_ring_max_q_cnt -
			 *	mac_srs->srs_poll_pkt_cnt
			 */

			/*
			 * Since we are not doing B/W control, pick
			 * as many packets as allowed.
			 */
			bytes_to_pickup = max_bytes_to_pickup;
		}

		/* Poll the underlying Hardware */
		mutex_exit(lock);
		head = MAC_HWRING_POLL(mac_srs->srs_ring, (int)bytes_to_pickup);
		mutex_enter(lock);

		ASSERT((mac_srs->srs_state & SRS_POLL_THR_OWNER) ==
		    SRS_POLL_THR_OWNER);

		mp = tail = head;
		count = 0;
		sz = 0;
		while (mp != NULL) {
			tail = mp;
			sz += msgdsize(mp);
			mp = mp->b_next;
			count++;
		}

		if (head != NULL) {
			tail->b_next = NULL;
			smcip = mac_srs->srs_mcip;

			if ((mac_srs->srs_type & SRST_FLOW) ||
			    (smcip == NULL)) {
				FLOW_STAT_UPDATE(mac_srs->srs_flent,
				    rbytes, sz);
				FLOW_STAT_UPDATE(mac_srs->srs_flent,
				    ipackets, count);
			}

			/*
			 * If there are any promiscuous mode callbacks
			 * defined for this MAC client, pass them a copy
			 * if appropriate and also update the counters.
			 */
			if (smcip != NULL) {
				smcip->mci_stat_ibytes += sz;
				smcip->mci_stat_ipackets += count;

				if (smcip->mci_mip->mi_promisc_list != NULL) {
					mutex_exit(lock);
					mac_promisc_dispatch(smcip->mci_mip,
					    head, NULL);
					mutex_enter(lock);
				}
			}
			if (mac_srs->srs_type & SRST_BW_CONTROL) {
				mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
				mac_srs->srs_bw->mac_bw_polled += sz;
				mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
			}
			srs_rx->sr_poll_count += count;
			MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail,
			    count, sz);
			if (count <= 10)
				srs_rx->sr_chain_cnt_undr10++;
			else if (count > 10 && count <= 50)
				srs_rx->sr_chain_cnt_10to50++;
			else
				srs_rx->sr_chain_cnt_over50++;
		}

		/*
		 * We are guaranteed that SRS_PROC will be set if we
		 * are here. Also, the poll thread gets to run only if
		 * the drain was being done by a worker thread although
		 * it's possible that the worker thread is still running
		 * and the poll thread was sent down to keep the pipeline
		 * going instead of doing a complete drain and then
		 * trying to poll the NIC.
		 *
		 * So we need to check the SRS_WORKER flag to make sure
		 * that the worker thread is not processing the queue
		 * in parallel to us. The flags and conditions are
		 * protected by the srs_lock to prevent any race. We
		 * ensure that we don't drop the srs_lock from now
		 * till the end and similarly we don't drop the srs_lock
		 * in mac_rx_srs_drain() till similar condition checks
		 * are complete. The mac_rx_srs_drain() needs to ensure
		 * that the SRS_WORKER flag remains set as long as it's
		 * processing the queue.
		 */
		if (!(mac_srs->srs_state & SRS_WORKER) &&
		    (mac_srs->srs_first != NULL)) {
			/*
			 * We have packets to process and the worker thread
			 * is not running. Check to see if the poll thread is
			 * allowed to process.
			 */
			if (mac_srs->srs_state & SRS_LATENCY_OPT) {
				mac_srs->srs_drain_func(mac_srs, SRS_POLL_PROC);
				if (!(mac_srs->srs_state & SRS_PAUSE) &&
				    srs_rx->sr_poll_pkt_cnt <=
				    srs_rx->sr_lowat) {
					srs_rx->sr_poll_again++;
					goto check_again;
				}
				/*
				 * We are already above the low water mark
				 * so stay in the polling mode but no
				 * need to poll. Once we dip below
				 * the polling threshold, the processing
				 * thread (soft ring) will signal us
				 * to poll again (MAC_UPDATE_SRS_COUNT)
				 */
				srs_rx->sr_poll_drain_no_poll++;
				mac_srs->srs_state &= ~(SRS_PROC|SRS_GET_PKTS);
				/*
				 * In the B/W control case, it's possible
				 * that the backlog built up due to
				 * the B/W limit being reached and packets
				 * are queued only in the SRS. In this case,
				 * we should schedule the worker thread
				 * since no one else will wake us up.
				 */
				if ((mac_srs->srs_type & SRST_BW_CONTROL) &&
				    (mac_srs->srs_tid == NULL)) {
					mac_srs->srs_tid =
					    timeout(mac_srs_fire, mac_srs, 1);
					srs_rx->sr_poll_worker_wakeup++;
				}
			} else {
				/*
				 * Wakeup the worker thread for more processing.
				 * We optimize for throughput in this case.
				 */
				mac_srs->srs_state &= ~(SRS_PROC|SRS_GET_PKTS);
				MAC_SRS_WORKER_WAKEUP(mac_srs);
				srs_rx->sr_poll_sig_worker++;
			}
		} else if ((mac_srs->srs_first == NULL) &&
		    !(mac_srs->srs_state & SRS_WORKER)) {
			/*
			 * There is nothing queued in the SRS and
			 * no worker thread running. Plus we
			 * didn't get anything from the H/W
			 * as well (head == NULL).
			 */
			ASSERT(head == NULL);
			mac_srs->srs_state &=
			    ~(SRS_PROC|SRS_GET_PKTS);

			/*
			 * If we have packets in the soft ring, don't allow
			 * more packets to come into this SRS by keeping the
			 * interrupts off but not polling the H/W. The
			 * poll thread will get signaled as soon as
			 * srs_poll_pkt_cnt dips below the poll threshold.
			 */
			if (srs_rx->sr_poll_pkt_cnt == 0) {
				srs_rx->sr_poll_intr_enable++;
				MAC_SRS_POLLING_OFF(mac_srs);
			} else {
				/*
				 * We know nothing is queued in the SRS
				 * since we are here after checking
				 * srs_first is NULL. The backlog
				 * is entirely due to packets queued
				 * in the soft ring which will wake us up
				 * and get the interface out of polling
				 * mode once the backlog dips below
				 * sr_poll_thres.
				 */
				srs_rx->sr_poll_no_poll++;
			}
		} else {
			/*
			 * The worker thread is already running.
			 * Nothing much to do. If the polling
			 * was enabled, the worker thread will deal
			 * with that.
			 */
			mac_srs->srs_state &= ~SRS_GET_PKTS;
			srs_rx->sr_poll_goto_sleep++;
		}
	}
done:
	mac_srs->srs_state |= SRS_POLL_THR_QUIESCED;
	cv_signal(&mac_srs->srs_async);
	/*
	 * If this is a temporary quiesce then wait for the restart signal
	 * from the srs worker. Then clear the flags and signal the srs worker
	 * to ensure a positive handshake and go back to start.
	 */
	while (!(mac_srs->srs_state & (SRS_CONDEMNED | SRS_POLL_THR_RESTART)))
		cv_wait(async, lock);
	if (mac_srs->srs_state & SRS_POLL_THR_RESTART) {
		ASSERT(!(mac_srs->srs_state & SRS_CONDEMNED));
		mac_srs->srs_state &=
		    ~(SRS_POLL_THR_QUIESCED | SRS_POLL_THR_RESTART);
		cv_signal(&mac_srs->srs_async);
		goto start;
	} else {
		mac_srs->srs_state |= SRS_POLL_THR_EXITED;
		cv_signal(&mac_srs->srs_async);
		CALLB_CPR_EXIT(&cprinfo);
		thread_exit();
	}
}

/*
 * mac_srs_pick_chain
 *
 * In the bandwidth control case, checks how many packets can be
 * processed and returns them in a sub chain.
 */
static mblk_t *
mac_srs_pick_chain(mac_soft_ring_set_t *mac_srs, mblk_t **chain_tail,
    size_t *chain_sz, int *chain_cnt)
{
	mblk_t			*head = NULL;
	mblk_t			*tail = NULL;
	size_t			sz;
	size_t			tsz = 0;
	int			cnt = 0;
	mblk_t			*mp;

	ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
	mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
	if (((mac_srs->srs_bw->mac_bw_used + mac_srs->srs_size) <=
	    mac_srs->srs_bw->mac_bw_limit) ||
	    (mac_srs->srs_bw->mac_bw_limit == 0)) {
		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
		head = mac_srs->srs_first;
		mac_srs->srs_first = NULL;
		*chain_tail = mac_srs->srs_last;
		mac_srs->srs_last = NULL;
		*chain_sz = mac_srs->srs_size;
		*chain_cnt = mac_srs->srs_count;
		mac_srs->srs_count = 0;
		mac_srs->srs_size = 0;
		return (head);
	}

	/*
	 * Can't clear the entire backlog.
	 * Need to find how many packets to pick.
	 */
	ASSERT(MUTEX_HELD(&mac_srs->srs_bw->mac_bw_lock));
	while ((mp = mac_srs->srs_first) != NULL) {
		sz = msgdsize(mp);
		if ((tsz + sz + mac_srs->srs_bw->mac_bw_used) >
		    mac_srs->srs_bw->mac_bw_limit) {
			if (!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED))
				mac_srs->srs_bw->mac_bw_state |=
				    SRS_BW_ENFORCED;
			break;
		}

		/*
		 * The size & cnt are decremented from the softrings
		 * when they send up the packet for polling to work
		 * properly.
		 */
		tsz += sz;
		cnt++;
		mac_srs->srs_count--;
		mac_srs->srs_size -= sz;
		if (tail != NULL)
			tail->b_next = mp;
		else
			head = mp;
		tail = mp;
		mac_srs->srs_first = mac_srs->srs_first->b_next;
	}
	mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
	if (mac_srs->srs_first == NULL)
		mac_srs->srs_last = NULL;

	if (tail != NULL)
		tail->b_next = NULL;
	*chain_tail = tail;
	*chain_cnt = cnt;
	*chain_sz = tsz;

	return (head);
}
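
/*
 * Worked example for the partial-pickup path above (hypothetical
 * numbers): with mac_bw_limit = 125000, mac_bw_used = 100000 and
 * three 15000-byte packets queued, the loop picks the first packet
 * (100000 + 15000 <= 125000), finds the second would exceed the
 * limit (130000 > 125000), sets SRS_BW_ENFORCED and returns a
 * one-packet sub chain.
 */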
1553
1554/*
1555 * mac_rx_srs_drain
1556 *
1557 * The SRS drain routine. Gets to run to clear the queue. Any thread
1558 * (worker, interrupt, poll) can call this based on processing model.
1559 * The first thing we do is disable interrupts if possible and then
1560 * drain the queue. we also try to poll the underlying hardware if
1561 * there is a dedicated hardware Rx ring assigned to this SRS.
1562 *
1563 * There is a equivalent drain routine in bandwidth control mode
1564 * mac_rx_srs_drain_bw. There is some code duplication between the two
1565 * routines but they are highly performance sensitive and are easier
1566 * to read/debug if they stay separate. Any code changes here might
1567 * also apply to mac_rx_srs_drain_bw as well.
1568 */
1569void
1570mac_rx_srs_drain(mac_soft_ring_set_t *mac_srs, uint_t proc_type)
1571{
1572	mblk_t 			*head;
1573	mblk_t			*tail;
1574	timeout_id_t 		tid;
1575	int			cnt = 0;
1576	mac_client_impl_t	*mcip = mac_srs->srs_mcip;
1577	mac_srs_rx_t		*srs_rx = &mac_srs->srs_rx;
1578
1579	ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
1580	ASSERT(!(mac_srs->srs_type & SRST_BW_CONTROL));
1581
1582	/* If we are blanked i.e. can't do upcalls, then we are done */
1583	if (mac_srs->srs_state & (SRS_BLANK | SRS_PAUSE)) {
1584		ASSERT((mac_srs->srs_type & SRST_NO_SOFT_RINGS) ||
1585		    (mac_srs->srs_state & SRS_PAUSE));
1586		goto out;
1587	}
1588
1589	if (mac_srs->srs_first == NULL)
1590		goto out;
1591
1592	if (!(mac_srs->srs_state & SRS_LATENCY_OPT) &&
1593	    (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat)) {
1594		/*
1595		 * In the normal case, the SRS worker thread does no
1596		 * work and we wait for a backlog to build up before
1597		 * we switch into polling mode. In case we are
1598		 * optimizing for throughput, we use the worker thread
1599		 * as well. The goal is to let worker thread process
1600		 * the queue and poll thread to feed packets into
1601		 * the queue. As such, we should signal the poll
1602		 * thread to try and get more packets.
1603		 *
1604		 * We could have pulled this check in the POLL_RING
1605		 * macro itself but keeping it explicit here makes
1606		 * the architecture more human understandable.
1607		 */
1608		MAC_SRS_POLL_RING(mac_srs);
1609	}
1610
1611again:
1612	head = mac_srs->srs_first;
1613	mac_srs->srs_first = NULL;
1614	tail = mac_srs->srs_last;
1615	mac_srs->srs_last = NULL;
1616	cnt = mac_srs->srs_count;
1617	mac_srs->srs_count = 0;
1618
1619	ASSERT(head != NULL);
1620	ASSERT(tail != NULL);
1621
1622	if ((tid = mac_srs->srs_tid) != 0)
1623		mac_srs->srs_tid = 0;
1624
1625	mac_srs->srs_state |= (SRS_PROC|proc_type);
1626
1627
1628	/*
1629	 * mcip is NULL for broadcast and multicast flows. The promisc
1630	 * callbacks for broadcast and multicast packets are delivered from
1631	 * mac_rx() and we don't need to worry about that case in this path
1632	 */
1633	if (mcip != NULL && mcip->mci_promisc_list != NULL) {
1634		mutex_exit(&mac_srs->srs_lock);
1635		mac_promisc_client_dispatch(mcip, head);
1636		mutex_enter(&mac_srs->srs_lock);
1637	}
1638
1639	/*
1640	 * Check if SRS itself is doing the processing
1641	 * This direct path does not apply when subflows are present. In this
1642	 * case, packets need to be dispatched to a soft ring according to the
1643	 * flow's bandwidth and other resources contraints.
1644	 */
1645	if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) {
1646		mac_direct_rx_t		proc;
1647		void			*arg1;
1648		mac_resource_handle_t	arg2;
1649
1650		/*
1651		 * This is the case when a Rx is directly
1652		 * assigned and we have a fully classified
1653		 * protocol chain. We can deal with it in
1654		 * one shot.
1655		 */
1656		proc = srs_rx->sr_func;
1657		arg1 = srs_rx->sr_arg1;
1658		arg2 = srs_rx->sr_arg2;
1659
1660		mac_srs->srs_state |= SRS_CLIENT_PROC;
1661		mutex_exit(&mac_srs->srs_lock);
1662		if (tid != 0) {
1663			(void) untimeout(tid);
1664			tid = 0;
1665		}
1666
1667		proc(arg1, arg2, head, NULL);
1668		/*
1669		 * Decrement the size and count here itelf
1670		 * since the packet has been processed.
1671		 */
1672		mutex_enter(&mac_srs->srs_lock);
1673		MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
1674		if (mac_srs->srs_state & SRS_CLIENT_WAIT)
1675			cv_signal(&mac_srs->srs_client_cv);
1676		mac_srs->srs_state &= ~SRS_CLIENT_PROC;
1677	} else {
1678		/* Some kind of softrings based fanout is required */
1679		mutex_exit(&mac_srs->srs_lock);
1680		if (tid != 0) {
1681			(void) untimeout(tid);
1682			tid = 0;
1683		}
1684
1685		/*
1686		 * Since the fanout routines can deal with chains,
1687		 * shoot the entire chain up.
1688		 */
1689		if (mac_srs->srs_type & SRST_FANOUT_SRC_IP)
1690			mac_rx_srs_fanout(mac_srs, head);
1691		else
1692			mac_rx_srs_proto_fanout(mac_srs, head);
1693		mutex_enter(&mac_srs->srs_lock);
1694	}
1695
1696	if (!(mac_srs->srs_state & (SRS_BLANK|SRS_PAUSE)) &&
1697	    (mac_srs->srs_first != NULL)) {
1698		/*
1699		 * More packets arrived while we were clearing the
1700		 * SRS. This can be possible because of one of
1701		 * three conditions below:
1702		 * 1) The driver is using multiple worker threads
1703		 *    to send the packets to us.
1704		 * 2) The driver has a race in switching
1705		 *    between interrupt and polling mode or
1706		 * 3) Packets are arriving in this SRS via the
1707		 *    S/W classification as well.
1708		 *
1709		 * We should switch to polling mode and see if we
1710		 * need to send the poll thread down. Also, signal
1711		 * the worker thread to process whats just arrived.
1712		 */
1713		MAC_SRS_POLLING_ON(mac_srs);
1714		if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat) {
1715			srs_rx->sr_drain_poll_sig++;
1716			MAC_SRS_POLL_RING(mac_srs);
1717		}
1718
1719		/*
1720		 * If we didn't signal the poll thread, we need
1721		 * to deal with the pending packets ourselves.
1722		 */
1723		if (proc_type == SRS_WORKER) {
1724			srs_rx->sr_drain_again++;
1725			goto again;
1726		} else {
1727			srs_rx->sr_drain_worker_sig++;
1728			cv_signal(&mac_srs->srs_async);
1729		}
1730	}
1731
1732out:
1733	if (mac_srs->srs_state & SRS_GET_PKTS) {
1734		/*
1735		 * Poll thread is already running. Leave the
1736		 * SRS_PROC set and hand over control to the
1737		 * poll thread.
1738		 */
1739		mac_srs->srs_state &= ~proc_type;
1740		srs_rx->sr_drain_poll_running++;
1741		return;
1742	}
1743
1744	/*
1745	 * Even if there are no packets queued in SRS, we
1746	 * need to make sure that the shared counter is
1747	 * clear and any associated softrings have cleared
1748	 * all the backlog. Otherwise, leave the interface
1749	 * in polling mode and the poll thread will get
1750	 * signalled once the count goes down to zero.
1751	 *
1752	 * If someone is already draining the queue (SRS_PROC is
1753	 * set) when the srs_poll_pkt_cnt goes down to zero,
1754	 * then it means that drain is already running and we
1755	 * will turn off polling at that time if there is
1756	 * no backlog.
1757	 *
1758	 * As long as there are packets queued either
1759	 * in soft ring set or its soft rings, we will leave
1760	 * the interface in polling mode (even if the drain
1761	 * was done by the interrupt thread). We signal
1762	 * the poll thread as well if we have dipped below
1763	 * the low water mark.
1764	 *
1765	 * NOTE: We can't use the MAC_SRS_POLLING_ON macro
1766	 * since that turns polling on only for the worker thread.
1767	 * It's not worth turning polling on for the interrupt
1768	 * thread (since NIC will not issue another interrupt)
1769	 * unless a backlog builds up.
1770	 */
1771	if ((srs_rx->sr_poll_pkt_cnt > 0) &&
1772	    (mac_srs->srs_state & SRS_POLLING_CAPAB)) {
1773		mac_srs->srs_state &= ~(SRS_PROC|proc_type);
1774		srs_rx->sr_drain_keep_polling++;
1775		MAC_SRS_POLLING_ON(mac_srs);
1776		if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat)
1777			MAC_SRS_POLL_RING(mac_srs);
1778		return;
1779	}
1780
1781	/* Nothing else to do. Get out of poll mode */
1782	MAC_SRS_POLLING_OFF(mac_srs);
1783	mac_srs->srs_state &= ~(SRS_PROC|proc_type);
1784	srs_rx->sr_drain_finish_intr++;
1785}
1786
1787/*
1788 * mac_rx_srs_drain_bw
1789 *
1790 * The SRS BW drain routine. Gets to run to clear the queue. Any thread
1791 * (worker, interrupt, poll) can call this based on processing model.
1792 * The first thing we do is disable interrupts if possible and then
1793 * drain the queue. We also try to poll the underlying hardware if
1794 * there is a dedicated hardware Rx ring assigned to this SRS.
1795 *
1796 * There is an equivalent drain routine in non bandwidth control mode,
1797 * mac_rx_srs_drain. There is some code duplication between the two
1798 * routines but they are highly performance sensitive and are easier
1799 * to read/debug if they stay separate. Any code changes here might
1800 * apply to mac_rx_srs_drain as well.
1801 */
1802void
1803mac_rx_srs_drain_bw(mac_soft_ring_set_t *mac_srs, uint_t proc_type)
1804{
1805	mblk_t 			*head;
1806	mblk_t			*tail;
1807	timeout_id_t 		tid;
1808	size_t			sz = 0;
1809	int			cnt = 0;
1810	mac_client_impl_t	*mcip = mac_srs->srs_mcip;
1811	mac_srs_rx_t		*srs_rx = &mac_srs->srs_rx;
1812
1813	ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
1814	ASSERT(mac_srs->srs_type & SRST_BW_CONTROL);
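	/*
	 * Bandwidth usage is accounted per clock tick (lbolt). Each pass
	 * through the loop below first resets the usage counter when a new
	 * tick has started, and otherwise backs off (leaving SRS_BW_ENFORCED
	 * set) until the next tick once this tick's quota is consumed.
	 */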
1815again:
1816	/* Check if we are doing B/W control */
1817	mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
1818	if (mac_srs->srs_bw->mac_bw_curr_time != lbolt) {
1819		mac_srs->srs_bw->mac_bw_curr_time = lbolt;
1820		mac_srs->srs_bw->mac_bw_used = 0;
1821		if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)
1822			mac_srs->srs_bw->mac_bw_state &= ~SRS_BW_ENFORCED;
1823	} else if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) {
1824		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1825		goto done;
1826	} else if (mac_srs->srs_bw->mac_bw_used >
1827	    mac_srs->srs_bw->mac_bw_limit) {
1828		mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
1829		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1830		goto done;
1831	}
1832	mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1833
1834	/* If we are blanked i.e. can't do upcalls, then we are done */
1835	/* If we are blanked, i.e. can't do upcalls, then we are done */
1836		ASSERT((mac_srs->srs_type & SRST_NO_SOFT_RINGS) ||
1837		    (mac_srs->srs_state & SRS_PAUSE));
1838		goto done;
1839	}
1840
1841	sz = 0;
1842	cnt = 0;
1843	if ((head = mac_srs_pick_chain(mac_srs, &tail, &sz, &cnt)) == NULL) {
1844		/*
1845		 * We couldn't pick up a single packet.
1846		 */
1847		mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
1848		if ((mac_srs->srs_bw->mac_bw_used == 0) &&
1849		    (mac_srs->srs_size != 0) &&
1850		    !(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) {
1851			/*
1852			 * Seems like configured B/W doesn't
1853			 * even allow processing of 1 packet
1854			 * per tick.
1855			 *
1856			 * XXX: raise the limit to processing
1857			 * at least 1 packet per tick.
1858			 */
1859			mac_srs->srs_bw->mac_bw_limit +=
1860			    mac_srs->srs_bw->mac_bw_limit;
1861			mac_srs->srs_bw->mac_bw_drop_threshold +=
1862			    mac_srs->srs_bw->mac_bw_drop_threshold;
1863			cmn_err(CE_NOTE, "mac_rx_srs_drain_bw: srs(%p) "
1864			    "raised B/W limit to %d since not even a "
1865			    "single packet can be processed per "
1866			    "tick %d\n", (void *)mac_srs,
1867			    (int)mac_srs->srs_bw->mac_bw_limit,
1868			    (int)msgdsize(mac_srs->srs_first));
1869		}
1870		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1871		goto done;
1872	}
1873
1874	ASSERT(head != NULL);
1875	ASSERT(tail != NULL);
1876
1877	/* zero bandwidth: drop all and return to interrupt mode */
1878	mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
1879	if (mac_srs->srs_bw->mac_bw_limit == 0) {
1880		srs_rx->sr_drop_count += cnt;
1881		ASSERT(mac_srs->srs_bw->mac_bw_sz >= sz);
1882		mac_srs->srs_bw->mac_bw_sz -= sz;
1883		mac_srs->srs_bw->mac_bw_drop_bytes += sz;
1884		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1885		mac_pkt_drop(NULL, NULL, head, B_FALSE);
1886		goto leave_poll;
1887	} else {
1888		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1889	}
1890
1891	if ((tid = mac_srs->srs_tid) != 0)
1892		mac_srs->srs_tid = 0;
1893
1894	mac_srs->srs_state |= (SRS_PROC|proc_type);
1895	MAC_SRS_WORKER_POLLING_ON(mac_srs);
1896
1897	/*
1898	 * mcip is NULL for broadcast and multicast flows. The promisc
1899	 * callbacks for broadcast and multicast packets are delivered from
1900	 * mac_rx() and we don't need to worry about that case in this path.
1901	 */
1902	if (mcip != NULL && mcip->mci_promisc_list != NULL) {
1903		mutex_exit(&mac_srs->srs_lock);
1904		mac_promisc_client_dispatch(mcip, head);
1905		mutex_enter(&mac_srs->srs_lock);
1906	}
1907
1908	/*
1909	 * Check if the SRS itself is doing the processing.
1910	 * This direct path does not apply when subflows are present. In this
1911	 * case, packets need to be dispatched to a soft ring according to the
1912	 * flow's bandwidth and other resource constraints.
1913	 */
1914	if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) {
1915		mac_direct_rx_t		proc;
1916		void			*arg1;
1917		mac_resource_handle_t	arg2;
1918
1919		/*
1920		 * This is the case when a Rx is directly
1921		 * assigned and we have a fully classified
1922		 * protocol chain. We can deal with it in
1923		 * one shot.
1924		 */
1925		proc = srs_rx->sr_func;
1926		arg1 = srs_rx->sr_arg1;
1927		arg2 = srs_rx->sr_arg2;
1928
1929		mac_srs->srs_state |= SRS_CLIENT_PROC;
1930		mutex_exit(&mac_srs->srs_lock);
1931		if (tid != 0) {
1932			(void) untimeout(tid);
1933			tid = 0;
1934		}
1935
1936		proc(arg1, arg2, head, NULL);
1937		/*
1938		 * Decrement the size and count here itself
1939		 * since the packet has been processed.
1940		 */
1941		mutex_enter(&mac_srs->srs_lock);
1942		MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
1943		MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz);
1944
1945		if (mac_srs->srs_state & SRS_CLIENT_WAIT)
1946			cv_signal(&mac_srs->srs_client_cv);
1947		mac_srs->srs_state &= ~SRS_CLIENT_PROC;
1948	} else {
1949		/* Some kind of softrings based fanout is required */
1950		mutex_exit(&mac_srs->srs_lock);
1951		if (tid != 0) {
1952			(void) untimeout(tid);
1953			tid = 0;
1954		}
1955
1956		/*
1957		 * Since the fanout routines can deal with chains,
1958		 * shoot the entire chain up.
1959		 */
1960		if (mac_srs->srs_type & SRST_FANOUT_SRC_IP)
1961			mac_rx_srs_fanout(mac_srs, head);
1962		else
1963			mac_rx_srs_proto_fanout(mac_srs, head);
1964		mutex_enter(&mac_srs->srs_lock);
1965	}
1966
1967	/*
1968	 * Send the poll thread to pick up any packets arrived
1969	 * so far. This also serves as the last check in case
1970	 * nothing else is queued in the SRS. The poll thread
1971	 * is signalled only in the case the drain was done
1972	 * by the worker thread and SRS_WORKER is set. The
1973	 * worker thread can run in parallel as long as the
1974	 * SRS_WORKER flag is set. If we have nothing else to
1975	 * process, we can exit while leaving SRS_PROC set
1976	 * which gives the poll thread control to process and
1977	 * cleanup once it returns from the NIC.
1978	 *
1979	 * If we have nothing else to process, we need to
1980	 * ensure that we keep holding the srs_lock till
1981	 * all the checks below are done and control is
1982	 * handed to the poll thread if it was running.
1983	 */
1984	mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
1985	if (!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) {
1986		if (mac_srs->srs_first != NULL) {
1987			if (proc_type == SRS_WORKER) {
1988				mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1989				if (srs_rx->sr_poll_pkt_cnt <=
1990				    srs_rx->sr_lowat)
1991					MAC_SRS_POLL_RING(mac_srs);
1992				goto again;
1993			} else {
1994				cv_signal(&mac_srs->srs_async);
1995			}
1996		}
1997	}
1998	mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1999
2000done:
2001
2002	if (mac_srs->srs_state & SRS_GET_PKTS) {
2003		/*
2004		 * Poll thread is already running. Leave the
2005		 * SRS_PROC set and hand over control to the
2006		 * poll thread.
2007		 */
2008		mac_srs->srs_state &= ~proc_type;
2009		return;
2010	}
2011
2012	/*
2013	 * If we can't process packets because we have exceeded
2014	 * B/W limit for this tick, just set the timeout
2015	 * and leave.
2016	 *
2017	 * Even if there are no packets queued in SRS, we
2018	 * need to make sure that the shared counter is
2019	 * clear and any associated softrings have cleared
2020	 * all the backlog. Otherwise, leave the interface
2021	 * in polling mode and the poll thread will get
2022	 * signalled once the count goes down to zero.
2023	 *
2024	 * If someone is already draining the queue (SRS_PROC is
2025	 * set) when the srs_poll_pkt_cnt goes down to zero,
2026	 * then it means that drain is already running and we
2027	 * will turn off polling at that time if there is
2028	 * no backlog. As long as there are packets queued either
2029	 * in the soft ring set or its soft rings, we will leave
2030	 * the interface in polling mode.
2031	 */
2032	mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
2033	if ((mac_srs->srs_state & SRS_POLLING_CAPAB) &&
2034	    ((mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) ||
2035	    (srs_rx->sr_poll_pkt_cnt > 0))) {
2036		MAC_SRS_POLLING_ON(mac_srs);
2037		mac_srs->srs_state &= ~(SRS_PROC|proc_type);
2038		if ((mac_srs->srs_first != NULL) &&
2039		    (mac_srs->srs_tid == NULL))
2040			mac_srs->srs_tid = timeout(mac_srs_fire,
2041			    mac_srs, 1);
2042		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
2043		return;
2044	}
2045	mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
2046
2047leave_poll:
2048
2049	/* Nothing else to do. Get out of poll mode */
2050	MAC_SRS_POLLING_OFF(mac_srs);
2051	mac_srs->srs_state &= ~(SRS_PROC|proc_type);
2052}
2053
2054/*
2055 * mac_srs_worker
2056 *
2057 * The SRS worker routine. Drains the queue when no one else is
2058 * processing it.
2059 */
2060void
2061mac_srs_worker(mac_soft_ring_set_t *mac_srs)
2062{
2063	kmutex_t 		*lock = &mac_srs->srs_lock;
2064	kcondvar_t 		*async = &mac_srs->srs_async;
2065	callb_cpr_t		cprinfo;
2066	boolean_t		bw_ctl_flag;
2067
2068	CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "srs_worker");
2069	mutex_enter(lock);
2070
2071start:
2072	for (;;) {
2073		bw_ctl_flag = B_FALSE;
2074		if (mac_srs->srs_type & SRST_BW_CONTROL) {
2075			MAC_SRS_BW_LOCK(mac_srs);
2076			MAC_SRS_CHECK_BW_CONTROL(mac_srs);
2077			if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)
2078				bw_ctl_flag = B_TRUE;
2079			MAC_SRS_BW_UNLOCK(mac_srs);
2080		}
2081		/*
2082		 * The SRS_BW_ENFORCED flag may change since we have dropped
2083		 * the mac_bw_lock. However the drain function can handle both
2084		 * a drainable SRS or a bandwidth controlled SRS, and the
2085		 * effect of scheduling a timeout is to wakeup the worker
2086		 * thread which in turn will call the drain function. Since
2087		 * we release the srs_lock atomically only in the cv_wait there
2088		 * isn't a fear of waiting forever.
2089		 */
2090		while (((mac_srs->srs_state & SRS_PROC) ||
2091		    (mac_srs->srs_first == NULL) || bw_ctl_flag ||
2092		    (mac_srs->srs_state & SRS_TX_BLOCKED)) &&
2093		    !(mac_srs->srs_state & SRS_PAUSE)) {
2094			/*
2095			 * If we have packets queued and we are here
2096			 * because B/W control is in place, we better
2097			 * schedule the worker wakeup after 1 tick
2098			 * to see if bandwidth control can be relaxed.
2099			 */
2100			if (bw_ctl_flag && mac_srs->srs_tid == NULL) {
2101				/*
2102				 * We need to ensure that a timer is already
2103				 * scheduled or we force schedule one for
2104				 * later so that we can continue processing
2105				 * after this quantum is over.
2106				 */
2107				mac_srs->srs_tid = timeout(mac_srs_fire,
2108				    mac_srs, 1);
2109			}
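			/*
			 * Sleep until signalled. On wakeup, bail out if the
			 * SRS is being paused, keep waiting if another thread
			 * still holds SRS_PROC, and re-evaluate the bandwidth
			 * state before draining.
			 */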
2110wait:
2111			CALLB_CPR_SAFE_BEGIN(&cprinfo);
2112			cv_wait(async, lock);
2113			CALLB_CPR_SAFE_END(&cprinfo, lock);
2114
2115			if (mac_srs->srs_state & SRS_PAUSE)
2116				goto done;
2117			if (mac_srs->srs_state & SRS_PROC)
2118				goto wait;
2119
2120			if (mac_srs->srs_first != NULL &&
2121			    mac_srs->srs_type & SRST_BW_CONTROL) {
2122				MAC_SRS_BW_LOCK(mac_srs);
2123				if (mac_srs->srs_bw->mac_bw_state &
2124				    SRS_BW_ENFORCED) {
2125					MAC_SRS_CHECK_BW_CONTROL(mac_srs);
2126				}
2127				bw_ctl_flag = mac_srs->srs_bw->mac_bw_state &
2128				    SRS_BW_ENFORCED;
2129				MAC_SRS_BW_UNLOCK(mac_srs);
2130			}
2131		}
2132
2133		if (mac_srs->srs_state & SRS_PAUSE)
2134			goto done;
2135		mac_srs->srs_drain_func(mac_srs, SRS_WORKER);
2136	}
2137done:
2138	/*
2139	 * The Rx SRS quiesce logic first cuts off packet supply to the SRS
2140	 * from both hard and soft classifications and waits for such threads
2141	 * to finish before signaling the worker. So at this point the only
2142	 * thread left that could be competing with the worker is the poll
2143	 * thread. In the case of Tx, there shouldn't be any thread holding
2144	 * SRS_PROC at this point.
2145	 */
2146	if (!(mac_srs->srs_state & SRS_PROC)) {
2147		mac_srs->srs_state |= SRS_PROC;
2148	} else {
2149		ASSERT((mac_srs->srs_type & SRST_TX) == 0);
2150		/*
2151		 * Poll thread still owns the SRS and is still running
2152		 */
2153		ASSERT((mac_srs->srs_poll_thr == NULL) ||
2154		    ((mac_srs->srs_state & SRS_POLL_THR_OWNER) ==
2155		    SRS_POLL_THR_OWNER));
2156	}
2157	mac_srs_worker_quiesce(mac_srs);
2158	/*
2159	 * Wait for the SRS_RESTART or SRS_CONDEMNED signal from the initiator
2160	 * of the quiesce operation
2161	 */
2162	while (!(mac_srs->srs_state & (SRS_CONDEMNED | SRS_RESTART)))
2163		cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock);
2164
2165	if (mac_srs->srs_state & SRS_RESTART) {
2166		ASSERT(!(mac_srs->srs_state & SRS_CONDEMNED));
2167		mac_srs_worker_restart(mac_srs);
2168		mac_srs->srs_state &= ~SRS_PROC;
2169		goto start;
2170	}
2171
2172	if (!(mac_srs->srs_state & SRS_CONDEMNED_DONE))
2173		mac_srs_worker_quiesce(mac_srs);
2174
2175	mac_srs->srs_state &= ~SRS_PROC;
2176	/* The macro drops the srs_lock */
2177	CALLB_CPR_EXIT(&cprinfo);
2178	thread_exit();
2179}
2180
2181/*
2182 * mac_rx_srs_subflow_process
2183 *
2184 * Receive side routine called from interrupt path when there are
2185 * sub flows present on this SRS.
2186 */
2187/* ARGSUSED */
2188void
2189mac_rx_srs_subflow_process(void *arg, mac_resource_handle_t srs,
2190    mblk_t *mp_chain, boolean_t loopback)
2191{
2192	flow_entry_t		*flent = NULL;
2193	flow_entry_t		*prev_flent = NULL;
2194	mblk_t			*mp = NULL;
2195	mblk_t			*tail = NULL;
2196	mac_soft_ring_set_t	*mac_srs = (mac_soft_ring_set_t *)srs;
2197	mac_client_impl_t	*mcip;
2198
2199	mcip = mac_srs->srs_mcip;
2200	ASSERT(mcip != NULL);
2201
2202	/*
2203	 * We need to determine the SRS for every packet
2204	 * by walking the flow table; if we don't get any,
2205	 * then we proceed using the SRS we came with.
2206	 */
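	/*
	 * Consecutive packets that classify to the same flow entry are
	 * coalesced into a sub-chain, and each sub-chain is then handed to
	 * its flow's callback (or processed by this SRS itself when no
	 * subflow SRS exists) in one shot.
	 */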
2207	mp = tail = mp_chain;
2208	while (mp != NULL) {
2209
2210		/*
2211		 * We will increment the stats for the matching subflow
2212		 * when we get the bytes/pkt count for the classified packets
2213		 * later in mac_rx_srs_process.
2214		 */
2215		(void) mac_flow_lookup(mcip->mci_subflow_tab, mp,
2216		    FLOW_INBOUND, &flent);
2217
2218		if (mp == mp_chain || flent == prev_flent) {
2219			if (prev_flent != NULL)
2220				FLOW_REFRELE(prev_flent);
2221			prev_flent = flent;
2222			flent = NULL;
2223			tail = mp;
2224			mp = mp->b_next;
2225			continue;
2226		}
2227		tail->b_next = NULL;
2228		/*
2229		 * A NULL flent indicates this is for the mac_srs itself.
2230		 * XXX-venu : probably assert for fe_rx_srs_cnt == 0.
2231		 */
2232		if (prev_flent == NULL || prev_flent->fe_rx_srs_cnt == 0) {
2233			mac_rx_srs_process(arg,
2234			    (mac_resource_handle_t)mac_srs, mp_chain,
2235			    loopback);
2236		} else {
2237			(prev_flent->fe_cb_fn)(prev_flent->fe_cb_arg1,
2238			    prev_flent->fe_cb_arg2, mp_chain, loopback);
2239			FLOW_REFRELE(prev_flent);
2240		}
2241		prev_flent = flent;
2242		flent = NULL;
2243		mp_chain = mp;
2244		tail = mp;
2245		mp = mp->b_next;
2246	}
2247	/* Last chain */
2248	ASSERT(mp_chain != NULL);
2249	if (prev_flent == NULL || prev_flent->fe_rx_srs_cnt == 0) {
2250		mac_rx_srs_process(arg,
2251		    (mac_resource_handle_t)mac_srs, mp_chain, loopback);
2252	} else {
2253		(prev_flent->fe_cb_fn)(prev_flent->fe_cb_arg1,
2254		    prev_flent->fe_cb_arg2, mp_chain, loopback);
2255		FLOW_REFRELE(prev_flent);
2256	}
2257}
2258
2259/*
2260 * mac_rx_srs_process
2261 *
2262 * Receive side routine called from the interrupt path.
2263 *
2264 * loopback is set to force a context switch on the loopback
2265 * path between MAC clients.
2266 */
2267/* ARGSUSED */
2268void
2269mac_rx_srs_process(void *arg, mac_resource_handle_t srs, mblk_t *mp_chain,
2270    boolean_t loopback)
2271{
2272	mac_soft_ring_set_t	*mac_srs = (mac_soft_ring_set_t *)srs;
2273	mblk_t			*mp, *tail, *head;
2274	int			count = 0;
2275	int			count1;
2276	size_t			sz = 0;
2277	size_t			chain_sz, sz1;
2278	mac_bw_ctl_t		*mac_bw;
2279	mac_client_impl_t	*smcip;
2280	mac_srs_rx_t		*srs_rx = &mac_srs->srs_rx;
2281
2282	/*
2283	 * Set the tail, count and sz. We set the sz irrespective
2284	 * of whether we are doing B/W control or not for the
2285	 * purpose of updating the stats.
2286	 */
2287	mp = tail = mp_chain;
2288	while (mp != NULL) {
2289		tail = mp;
2290		count++;
2291		sz += msgdsize(mp);
2292		mp = mp->b_next;
2293	}
2294
2295	mutex_enter(&mac_srs->srs_lock);
2296	smcip = mac_srs->srs_mcip;
2297
2298	if (mac_srs->srs_type & SRST_FLOW || smcip == NULL) {
2299		FLOW_STAT_UPDATE(mac_srs->srs_flent, rbytes, sz);
2300		FLOW_STAT_UPDATE(mac_srs->srs_flent, ipackets, count);
2301	}
2302	if (smcip != NULL) {
2303		smcip->mci_stat_ibytes += sz;
2304		smcip->mci_stat_ipackets += count;
2305	}
2306
2307	/*
2308	 * If the SRS is already being processed; has been blanked;
2309	 * can be processed by worker thread only; or the B/W limit
2310	 * has been reached, then queue the chain and check if the
2311	 * worker thread needs to be awakened.
2312	 */
2313	if (mac_srs->srs_type & SRST_BW_CONTROL) {
2314		mac_bw = mac_srs->srs_bw;
2315		ASSERT(mac_bw != NULL);
2316		mutex_enter(&mac_bw->mac_bw_lock);
2317		/* Count the packets and bytes via interrupt */
2318		srs_rx->sr_intr_count += count;
2319		mac_bw->mac_bw_intr += sz;
2320		if (mac_bw->mac_bw_limit == 0) {
2321			/* zero bandwidth: drop all */
2322			srs_rx->sr_drop_count += count;
2323			mac_bw->mac_bw_drop_bytes += sz;
2324			mutex_exit(&mac_bw->mac_bw_lock);
2325			mutex_exit(&mac_srs->srs_lock);
2326			mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE);
2327			return;
2328		} else {
2329			if ((mac_bw->mac_bw_sz + sz) <=
2330			    mac_bw->mac_bw_drop_threshold) {
2331				mutex_exit(&mac_bw->mac_bw_lock);
2332				MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain,
2333				    tail, count, sz);
2334			} else {
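				/*
				 * The whole chain doesn't fit under the drop
				 * threshold. Walk the chain and pick up only
				 * as many leading packets as still fit; the
				 * remainder is dropped further below.
				 */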
2335				mp = mp_chain;
2336				chain_sz = 0;
2337				count1 = 0;
2338				tail = NULL;
2339				head = NULL;
2340				while (mp != NULL) {
2341					sz1 = msgdsize(mp);
2342					if (mac_bw->mac_bw_sz + chain_sz + sz1 >
2343					    mac_bw->mac_bw_drop_threshold)
2344						break;
2345					chain_sz += sz1;
2346					count1++;
2347					tail = mp;
2348					mp = mp->b_next;
2349				}
2350				mutex_exit(&mac_bw->mac_bw_lock);
2351				if (tail != NULL) {
2352					head = tail->b_next;
2353					tail->b_next = NULL;
2354					MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs,
2355					    mp_chain, tail, count1, chain_sz);
2356					sz -= chain_sz;
2357					count -= count1;
2358				} else {
2359					/* Can't pick up any */
2360					head = mp_chain;
2361				}
2362				if (head != NULL) {
2363					/* Drop any packet over the threshold */
2364					srs_rx->sr_drop_count += count;
2365					mutex_enter(&mac_bw->mac_bw_lock);
2366					mac_bw->mac_bw_drop_bytes += sz;
2367					mutex_exit(&mac_bw->mac_bw_lock);
2368					freemsgchain(head);
2369				}
2370			}
2371			MAC_SRS_WORKER_WAKEUP(mac_srs);
2372			mutex_exit(&mac_srs->srs_lock);
2373			return;
2374		}
2375	}
2376
2377	/*
2378	 * If the total number of packets queued in the SRS and
2379	 * its associated soft rings exceeds the max allowed,
2380	 * then drop the chain. If we are polling capable, this
2381	 * shouldn't be happening.
2382	 */
2383	if (!(mac_srs->srs_type & SRST_BW_CONTROL) &&
2384	    (srs_rx->sr_poll_pkt_cnt > srs_rx->sr_hiwat)) {
2385		mac_bw = mac_srs->srs_bw;
2386		srs_rx->sr_drop_count += count;
2387		mutex_enter(&mac_bw->mac_bw_lock);
2388		mac_bw->mac_bw_drop_bytes += sz;
2389		mutex_exit(&mac_bw->mac_bw_lock);
2390		freemsgchain(mp_chain);
2391		mutex_exit(&mac_srs->srs_lock);
2392		return;
2393	}
2394
2395	MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain, tail, count, sz);
2396	/* Count the packets entering via interrupt path */
2397	srs_rx->sr_intr_count += count;
2398
2399	if (!(mac_srs->srs_state & SRS_PROC)) {
2400		/*
2401		 * If we are coming via loopback or if we are not
2402		 * optimizing for latency, we should signal the
2403		 * worker thread.
2404		 */
2405		if (loopback || !(mac_srs->srs_state & SRS_LATENCY_OPT)) {
2406			/*
2407			 * For loopback, we need to let the worker take
2408			 * over as we don't want to continue in the same
2409			 * thread even if we can. This could lead to stack
2410			 * overflows and may also end up using
2411			 * resources (cpu) incorrectly.
2412			 */
2413			cv_signal(&mac_srs->srs_async);
2414		} else {
2415			/*
2416			 * Seems like no one is processing the SRS and
2417			 * there is no backlog. We also inline process
2418			 * our packet if it's a single packet in the non
2419			 * latency optimized case (in the latency optimized
2420			 * case, we inline process chains of any size).
2421			 */
2422			mac_srs->srs_drain_func(mac_srs, SRS_PROC_FAST);
2423		}
2424	}
2425	mutex_exit(&mac_srs->srs_lock);
2426}
2427
2428/* TX SIDE ROUTINES (RUNTIME) */
2429
2430/*
2431 * mac_tx_srs_no_desc
2432 *
2433 * This routine is called by Tx single ring default mode
2434 * when Tx ring runs out of descs.
2435 */
2436mac_tx_cookie_t
2437mac_tx_srs_no_desc(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
2438    uint16_t flag, mblk_t **ret_mp)
2439{
2440	mac_tx_cookie_t cookie = NULL;
2441	mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;
2442	boolean_t wakeup_worker = B_TRUE;
2443	uint32_t tx_mode = srs_tx->st_mode;
2444	int cnt, sz;
2445	mblk_t *tail;
2446
2447	ASSERT(tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_BW);
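	/*
	 * Depending on the caller's flags we either drop the chain outright
	 * (MAC_DROP_ON_NO_DESC), hand it back to the caller when something
	 * is already queued (MAC_TX_NO_ENQUEUE), or queue it subject to the
	 * hi-watermark test.
	 */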
2448	if (flag & MAC_DROP_ON_NO_DESC) {
2449		MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
2450	} else {
2451		if (mac_srs->srs_first != NULL)
2452			wakeup_worker = B_FALSE;
2453		MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
2454		if (flag & MAC_TX_NO_ENQUEUE) {
2455			/*
2456			 * If TX_QUEUED is not set, queue the
2457			 * packet and let mac_tx_srs_drain()
2458			 * set the TX_BLOCKED bit for the
2459			 * reasons explained above. Otherwise,
2460			 * return the mblks.
2461			 */
2462			if (wakeup_worker) {
2463				MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
2464				    mp_chain, tail, cnt, sz);
2465			} else {
2466				MAC_TX_SET_NO_ENQUEUE(mac_srs,
2467				    mp_chain, ret_mp, cookie);
2468			}
2469		} else {
2470			MAC_TX_SRS_TEST_HIWAT(mac_srs, mp_chain,
2471			    tail, cnt, sz, cookie);
2472		}
2473		if (wakeup_worker)
2474			cv_signal(&mac_srs->srs_async);
2475	}
2476	return (cookie);
2477}
2478
2479/*
2480 * mac_tx_srs_enqueue
2481 *
2482 * This routine is called when Tx SRS is operating in either serializer
2483 * or bandwidth mode. In serializer mode, a packet will get enqueued
2484 * when a thread cannot enter SRS exclusively. In bandwidth mode,
2485 * packets get queued if the allowed byte-count limit for a tick is
2486 * exceeded. The action that gets taken when MAC_DROP_ON_NO_DESC or
2487 * MAC_TX_NO_ENQUEUE is set is different from when operating in either
2488 * the default mode or fanout mode. Here packets get dropped or
2489 * returned back to the caller only after hi-watermark worth of data
2490 * is queued.
2491 */
2492static mac_tx_cookie_t
2493mac_tx_srs_enqueue(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
2494    uint16_t flag, uintptr_t fanout_hint, mblk_t **ret_mp)
2495{
2496	mac_tx_cookie_t cookie = NULL;
2497	int cnt, sz;
2498	mblk_t *tail;
2499	boolean_t wakeup_worker = B_TRUE;
2500
2501	/*
2502	 * Ignore fanout hint if we don't have multiple tx rings.
2503	 */
2504	if (!TX_MULTI_RING_MODE(mac_srs))
2505		fanout_hint = 0;
2506
2507	if (mac_srs->srs_first != NULL)
2508		wakeup_worker = B_FALSE;
2509	MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
2510	if (flag & MAC_DROP_ON_NO_DESC) {
2511		if (mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) {
2512			MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
2513		} else {
2514			MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
2515			    mp_chain, tail, cnt, sz);
2516		}
2517	} else if (flag & MAC_TX_NO_ENQUEUE) {
2518		if ((mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) ||
2519		    (mac_srs->srs_state & SRS_TX_WAKEUP_CLIENT)) {
2520			MAC_TX_SET_NO_ENQUEUE(mac_srs, mp_chain,
2521			    ret_mp, cookie);
2522		} else {
2523			mp_chain->b_prev = (mblk_t *)fanout_hint;
2524			MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
2525			    mp_chain, tail, cnt, sz);
2526		}
2527	} else {
2528		/*
2529		 * If you are BW_ENFORCED, just enqueue the
2530		 * packet. srs_worker will drain it at the
2531		 * prescribed rate. Before enqueueing, save
2532		 * the fanout hint.
2533		 */
2534		mp_chain->b_prev = (mblk_t *)fanout_hint;
2535		MAC_TX_SRS_TEST_HIWAT(mac_srs, mp_chain,
2536		    tail, cnt, sz, cookie);
2537	}
2538	if (wakeup_worker)
2539		cv_signal(&mac_srs->srs_async);
2540	return (cookie);
2541}
2542
2543/*
2544 * There are five tx modes:
2545 *
2546 * 1) Default mode (SRS_TX_DEFAULT)
2547 * 2) Serialization mode (SRS_TX_SERIALIZE)
2548 * 3) Fanout mode (SRS_TX_FANOUT)
2549 * 4) Bandwidth mode (SRS_TX_BW)
2550 * 5) Fanout and Bandwidth mode (SRS_TX_BW_FANOUT)
2551 *
2552 * The tx mode in which an SRS operates is decided in mac_tx_srs_setup()
2553 * based on the number of Tx rings requested for an SRS and whether
2554 * bandwidth control is requested or not.
2555 *
2556 * In the default mode (i.e., no fanout/no bandwidth), the SRS acts as a
2557 * pass-thru. Packets will go directly to mac_tx_send(). When the underlying
2558 * Tx ring runs out of Tx descs, it starts queueing up packets in SRS.
2559 * When flow-control is relieved, the srs_worker drains the queued
2560 * packets and informs blocked clients to restart sending packets.
2561 *
2562 * In the SRS_TX_SERIALIZE mode, all calls to mac_tx() are serialized.
2563 *
2564 * In the SRS_TX_FANOUT mode, packets will be fanned out to multiple
2565 * Tx rings. Each Tx ring will have a soft ring associated with it.
2566 * These soft rings will be hung off the Tx SRS. Queueing if it happens
2567 * due to lack of Tx desc will be in individual soft ring (and not srs)
2568 * associated with Tx ring.
2569 *
2570 * In the TX_BW mode, tx srs will allow packets to go down to Tx ring
2571 * only if bw is available. Otherwise the packets will be queued in
2572 * SRS. If fanout to multiple Tx rings is configured, the packets will
2573 * be fanned out among the soft rings associated with the Tx rings.
2574 *
2575 * Four flags in srs_state indicate flow control conditions: SRS_TX_BLOCKED,
2576 * SRS_TX_HIWAT, SRS_TX_WAKEUP_CLIENT and SRS_ENQUEUED.
2577 * SRS_TX_BLOCKED indicates out of Tx descs. SRS expects a wakeup from the
2578 * driver below.
2579 * SRS_TX_HIWAT indicates packet count enqueued in Tx SRS exceeded Tx hiwat
2580 * and flow-control pressure is applied back to clients. The clients expect
2581 * wakeup when flow-control is relieved.
2582 * SRS_TX_WAKEUP_CLIENT gets set when (flag == MAC_TX_NO_ENQUEUE) and mblks
2583 * got returned back to the client either due to lack of Tx descs or due to bw
2584 * control reasons. The clients expect a wakeup when the condition is relieved.
2585 *
2586 * The fourth argument to mac_tx() is the flag. Normally it will be 0 but
2587 * some clients set the following values too: MAC_DROP_ON_NO_DESC,
2588 * MAC_TX_NO_ENQUEUE
2589 * Mac clients that do not want packets to be enqueued in the mac layer set
2590 * the MAC_DROP_ON_NO_DESC flag. The packets won't be queued in the Tx SRS or
2591 * Tx soft rings but instead get dropped when the NIC runs out of desc. The
2592 * behaviour of this flag is different when the Tx is running in serializer
2593 * or bandwidth mode. Under these (serializer, bandwidth) modes, packets
2594 * get dropped when the Tx high watermark is reached.
2595 * There are some mac clients like vsw, aggr that want the mblks to be
2596 * returned back to clients instead of being queued in Tx SRS (or Tx soft
2597 * rings) under flow-control (i.e., out of desc or exceeding bw limits)
2598 * conditions. These clients call mac_tx() with MAC_TX_NO_ENQUEUE flag set.
2599 * In the default and Tx fanout mode, the un-transmitted mblks will be
2600 * returned back to the clients when the driver runs out of Tx descs.
2601 * SRS_TX_WAKEUP_CLIENT (or S_RING_WAKEUP_CLIENT) will be set in SRS (or
2602 * soft ring) so that the clients can be woken up when Tx desc become
2603 * available. When running in serializer or bandwidth mode,
2604 * SRS_TX_WAKEUP_CLIENT will be set when tx hi-watermark is reached.
2605 */
2606
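/*
 * Return the transmit routine for the given Tx mode; the mode value is
 * used as an index into mac_tx_mode_list.
 */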
2607mac_tx_func_t
2608mac_tx_get_func(uint32_t mode)
2609{
2610	return (mac_tx_mode_list[mode].mac_tx_func);
2611}
2612
2613/* ARGSUSED */
2614static mac_tx_cookie_t
2615mac_tx_single_ring_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
2616    uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
2617{
2618	mac_srs_tx_t		*srs_tx = &mac_srs->srs_tx;
2619	boolean_t		is_subflow;
2620	mac_tx_stats_t		stats;
2621	mac_tx_cookie_t		cookie = NULL;
2622
2623	ASSERT(srs_tx->st_mode == SRS_TX_DEFAULT);
2624
2625	/* Regular case with a single Tx ring */
2626	/*
2627	 * SRS_TX_BLOCKED is set when underlying NIC runs
2628	 * out of Tx descs and messages start getting
2629	 * queued. It won't get reset until
2630	 * tx_srs_drain() completely drains out the
2631	 * messages.
2632	 */
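	/*
	 * Check SRS_ENQUEUED without the lock first; only if it appears set
	 * do we take srs_lock and re-check, so the common uncongested path
	 * avoids the lock acquisition.
	 */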
2633	if ((mac_srs->srs_state & SRS_ENQUEUED) != 0) {
2634		/* Tx descs/resources not available */
2635		mutex_enter(&mac_srs->srs_lock);
2636		if ((mac_srs->srs_state & SRS_ENQUEUED) != 0) {
2637			cookie = mac_tx_srs_no_desc(mac_srs, mp_chain,
2638			    flag, ret_mp);
2639			mutex_exit(&mac_srs->srs_lock);
2640			return (cookie);
2641		}
2642		/*
2643		 * While we were computing mblk count, the
2644		 * flow control condition got relieved.
2645		 * Continue with the transmission.
2646		 */
2647		mutex_exit(&mac_srs->srs_lock);
2648	}
2649
2650	is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0);
2651
2652	mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
2653	    mp_chain, (is_subflow ? &stats : NULL));
2654
2655	/*
2656	 * Multiple threads could be here sending packets.
2657	 * Under such conditions, it is not possible to
2658	 * atomically set the SRS_TX_BLOCKED bit to indicate
2659	 * out of tx desc condition. To atomically set
2660	 * this, we queue the returned packet and do
2661	 * the setting of SRS_TX_BLOCKED in
2662	 * mac_tx_srs_drain().
2663	 */
2664	if (mp_chain != NULL) {
2665		mutex_enter(&mac_srs->srs_lock);
2666		cookie = mac_tx_srs_no_desc(mac_srs, mp_chain, flag, ret_mp);
2667		mutex_exit(&mac_srs->srs_lock);
2668		return (cookie);
2669	}
2670
2671	if (is_subflow)
2672		FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats);
2673
2674	return (NULL);
2675}
2676
2677/*
2678 * mac_tx_serialize_mode
2679 *
2680 * This is an experimental mode implemented as per the request of PAE.
2681 * In this mode, all callers attempting to send a packet to the NIC
2682 * will get serialized. Only one thread at any time will access the
2683 * NIC to send the packet out.
2684 */
2685/* ARGSUSED */
2686static mac_tx_cookie_t
2687mac_tx_serializer_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
2688    uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
2689{
2690	boolean_t		is_subflow;
2691	mac_tx_stats_t		stats;
2692	mac_tx_cookie_t		cookie = NULL;
2693	mac_srs_tx_t		*srs_tx = &mac_srs->srs_tx;
2694
2695	/* Single ring, serialize below */
2696	ASSERT(srs_tx->st_mode == SRS_TX_SERIALIZE);
2697	mutex_enter(&mac_srs->srs_lock);
2698	if ((mac_srs->srs_first != NULL) ||
2699	    (mac_srs->srs_state & SRS_PROC)) {
2700		/*
2701		 * In serialization mode, queue all packets until
2702		 * TX_HIWAT is set.
2703		 * If drop bit is set, drop if TX_HIWAT is set.
2704		 * If no_enqueue is set, still enqueue until hiwat
2705		 * is set and return mblks after TX_HIWAT is set.
2706		 */
2707		cookie = mac_tx_srs_enqueue(mac_srs, mp_chain,
2708		    flag, NULL, ret_mp);
2709		mutex_exit(&mac_srs->srs_lock);
2710		return (cookie);
2711	}
2712	/*
2713	 * No packets queued, nothing on proc and no flow
2714	 * control condition. Fast-path, ok. Do inline
2715	 * processing.
2716	 */
2717	mac_srs->srs_state |= SRS_PROC;
2718	mutex_exit(&mac_srs->srs_lock);
2719
2720	is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0);
2721
2722	mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
2723	    mp_chain, (is_subflow ? &stats : NULL));
2724
2725	mutex_enter(&mac_srs->srs_lock);
2726	mac_srs->srs_state &= ~SRS_PROC;
2727	if (mp_chain != NULL) {
2728		cookie = mac_tx_srs_enqueue(mac_srs,
2729		    mp_chain, flag, NULL, ret_mp);
2730	}
2731	if (mac_srs->srs_first != NULL) {
2732		/*
2733		 * We processed our packets inline and new
2734		 * packet(s) got queued while we were
2735		 * processing. Wake up the SRS worker.
2736		 */
2737		cv_signal(&mac_srs->srs_async);
2738	}
2739	mutex_exit(&mac_srs->srs_lock);
2740
2741	if (is_subflow && cookie == NULL)
2742		FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats);
2743
2744	return (cookie);
2745}
2746
2747/*
2748 * mac_tx_fanout_mode
2749 *
2750 * In this mode, the SRS will have access to multiple Tx rings to send
2751 * the packet out. The fanout hint that is passed as an argument is
2752 * used to find an appropriate ring to fanout the traffic. Each Tx
2753 * ring, in turn, will have a soft ring associated with it. If a Tx
2754 * ring runs out of Tx desc's the returned packet will be queued in
2755 * the soft ring associated with that Tx ring. The srs itself will not
2756 * queue any packets.
2757 */
2758
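/*
 * Hash to one of the Tx soft rings and hand the given chain over to it.
 * Used only by mac_tx_fanout_mode() below.
 */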
2759#define	MAC_TX_SOFT_RING_PROCESS(chain) {		       		\
2760	index = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count);	\
2761	softring = mac_srs->srs_oth_soft_rings[index];			\
2762	cookie = mac_tx_soft_ring_process(softring, chain, flag, ret_mp); \
2763	DTRACE_PROBE2(tx__fanout, uint64_t, hash, uint_t, index);	\
2764}
2765
2766static mac_tx_cookie_t
2767mac_tx_fanout_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
2768    uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
2769{
2770	mac_soft_ring_t		*softring;
2771	uint64_t		hash;
2772	uint_t			index;
2773	mac_tx_cookie_t		cookie = NULL;
2774
2775	ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_FANOUT);
2776	if (fanout_hint != 0) {
2777		/*
2778		 * The hint is specified by the caller, simply pass the
2779		 * whole chain to the soft ring.
2780		 */
2781		hash = HASH_HINT(fanout_hint);
2782		MAC_TX_SOFT_RING_PROCESS(mp_chain);
2783	} else {
2784		mblk_t *last_mp, *cur_mp, *sub_chain;
2785		uint64_t last_hash = 0;
2786		uint_t media = mac_srs->srs_mcip->mci_mip->mi_info.mi_media;
2787
2788		/*
2789		 * Compute the hash from the contents (headers) of the
2790		 * packets of the mblk chain. Split the chains into
2791		 * subchains of the same conversation.
2792		 *
2793		 * Since there may be more than one ring used for
2794		 * sub-chains of the same call, and since the caller
2795		 * does not maintain per conversation state (it
2796		 * passed a zero hint), unsent subchains will be
2797		 * dropped.
2798		 */
2799
2800		flag |= MAC_DROP_ON_NO_DESC;
2801		ret_mp = NULL;
2802
2803		ASSERT(ret_mp == NULL);
2804
2805		sub_chain = NULL;
2806		last_mp = NULL;
2807
2808		for (cur_mp = mp_chain; cur_mp != NULL;
2809		    cur_mp = cur_mp->b_next) {
2810			hash = mac_pkt_hash(media, cur_mp, MAC_PKT_HASH_L4,
2811			    B_TRUE);
2812			if (last_hash != 0 && hash != last_hash) {
2813				/*
2814				 * Starting a different subchain, send current
2815				 * chain out.
2816				 */
2817				ASSERT(last_mp != NULL);
2818				last_mp->b_next = NULL;
2819				MAC_TX_SOFT_RING_PROCESS(sub_chain);
2820				sub_chain = NULL;
2821			}
2822
2823			/* add packet to subchain */
2824			if (sub_chain == NULL)
2825				sub_chain = cur_mp;
2826			last_mp = cur_mp;
2827			last_hash = hash;
2828		}
2829
2830		if (sub_chain != NULL) {
2831			/* send last subchain */
2832			ASSERT(last_mp != NULL);
2833			last_mp->b_next = NULL;
2834			MAC_TX_SOFT_RING_PROCESS(sub_chain);
2835		}
2836
2837		cookie = NULL;
2838	}
2839
2840	return (cookie);
2841}
2842
2843/*
2844 * mac_tx_bw_mode
2845 *
2846 * In the bandwidth mode, Tx srs will allow packets to go down to Tx ring
2847 * only if bw is available. Otherwise the packets will be queued in
2848 * SRS. If the SRS has multiple Tx rings, then packets will get fanned
2849 * out to the Tx rings.
2850 */
2851static mac_tx_cookie_t
2852mac_tx_bw_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
2853    uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
2854{
2855	int			cnt, sz;
2856	mblk_t			*tail;
2857	mac_tx_cookie_t		cookie = NULL;
2858	mac_srs_tx_t		*srs_tx = &mac_srs->srs_tx;
2859
2860	ASSERT(TX_BANDWIDTH_MODE(mac_srs));
2861	ASSERT(mac_srs->srs_type & SRST_BW_CONTROL);
2862	mutex_enter(&mac_srs->srs_lock);
2863	if (mac_srs->srs_bw->mac_bw_limit == 0) {
2864		/*
2865		 * zero bandwidth, no traffic is sent: drop the packets,
2866		 * or return the whole chain if the caller requests all
2867		 * unsent packets back.
2868		 */
2869		if (flag & MAC_TX_NO_ENQUEUE) {
2870			cookie = (mac_tx_cookie_t)mac_srs;
2871			*ret_mp = mp_chain;
2872		} else {
2873			MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
2874		}
2875		mutex_exit(&mac_srs->srs_lock);
2876		return (cookie);
2877	} else if ((mac_srs->srs_first != NULL) ||
2878	    (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) {
2879		cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, flag,
2880		    fanout_hint, ret_mp);
2881		mutex_exit(&mac_srs->srs_lock);
2882		return (cookie);
2883	}
2884	MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
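	/*
	 * Per-tick accounting: when a new tick starts (lbolt changed) the
	 * usage counter is reset. If this tick's quota is already exceeded,
	 * set SRS_BW_ENFORCED, queue the chain and let the worker drain it
	 * on a later tick.
	 */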
2885	if (mac_srs->srs_bw->mac_bw_curr_time != lbolt) {
2886		mac_srs->srs_bw->mac_bw_curr_time = lbolt;
2887		mac_srs->srs_bw->mac_bw_used = 0;
2888	} else if (mac_srs->srs_bw->mac_bw_used >
2889	    mac_srs->srs_bw->mac_bw_limit) {
2890		mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
2891		MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
2892		    mp_chain, tail, cnt, sz);
2893		/*
2894		 * Wakeup the worker thread. Note that the
2895		 * worker thread has to be woken up so that
2896		 * it can fire up the timer that wakes it
2897		 * on the next tick. Also, once
2898		 * BW_ENFORCED is set, it can only be
2899		 * reset by the srs_worker thread. Until then
2900		 * all packets will get queued up in the SRS
2901		 * and hence this code path won't be
2902		 * entered until BW_ENFORCED is reset.
2903		 */
2904		cv_signal(&mac_srs->srs_async);
2905		mutex_exit(&mac_srs->srs_lock);
2906		return (cookie);
2907	}
2908
2909	mac_srs->srs_bw->mac_bw_used += sz;
2910	mutex_exit(&mac_srs->srs_lock);
2911
2912	if (srs_tx->st_mode == SRS_TX_BW_FANOUT) {
2913		mac_soft_ring_t *softring;
2914		uint_t indx, hash;
2915
2916		hash = HASH_HINT(fanout_hint);
2917		indx = COMPUTE_INDEX(hash,
2918		    mac_srs->srs_oth_ring_count);
2919		softring = mac_srs->srs_oth_soft_rings[indx];
2920		return (mac_tx_soft_ring_process(softring, mp_chain, flag,
2921		    ret_mp));
2922	} else {
2923		boolean_t		is_subflow;
2924		mac_tx_stats_t		stats;
2925
2926		is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0);
2927
2928		mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
2929		    mp_chain, (is_subflow ? &stats : NULL));
2930
2931		if (mp_chain != NULL) {
2932			mutex_enter(&mac_srs->srs_lock);
2933			MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
2934			if (mac_srs->srs_bw->mac_bw_used > sz)
2935				mac_srs->srs_bw->mac_bw_used -= sz;
2936			else
2937				mac_srs->srs_bw->mac_bw_used = 0;
2938			cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, flag,
2939			    fanout_hint, ret_mp);
2940			mutex_exit(&mac_srs->srs_lock);
2941			return (cookie);
2942		}
2943		if (is_subflow)
2944			FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats);
2945
2946		return (NULL);
2947	}
2948}
2949
2950/* ARGSUSED */
2951void
2952mac_tx_srs_drain(mac_soft_ring_set_t *mac_srs, uint_t proc_type)
2953{
2954	mblk_t			*head, *tail;
2955	size_t			sz;
2956	uint32_t		tx_mode;
2957	uint_t			saved_pkt_count;
2958	boolean_t		is_subflow;
2959	mac_tx_stats_t		stats;
2960	mac_srs_tx_t		*srs_tx = &mac_srs->srs_tx;
2961
2962	saved_pkt_count = 0;
2963	ASSERT(mutex_owned(&mac_srs->srs_lock));
2964	ASSERT(!(mac_srs->srs_state & SRS_PROC));
2965
2966	mac_srs->srs_state |= SRS_PROC;
2967
2968	is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0);
2969	tx_mode = srs_tx->st_mode;
2970	if (tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_SERIALIZE) {
2971		if (mac_srs->srs_first != NULL) {
2972			head = mac_srs->srs_first;
2973			tail = mac_srs->srs_last;
2974			saved_pkt_count = mac_srs->srs_count;
2975			mac_srs->srs_first = NULL;
2976			mac_srs->srs_last = NULL;
2977			mac_srs->srs_count = 0;
2978			mutex_exit(&mac_srs->srs_lock);
2979
2980			head = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
2981			    head, &stats);
2982
2983			mutex_enter(&mac_srs->srs_lock);
2984			if (head != NULL) {
2985				/* Device out of tx desc, set block */
2986				if (head->b_next == NULL)
2987					VERIFY(head == tail);
2988				tail->b_next = mac_srs->srs_first;
2989				mac_srs->srs_first = head;
2990				mac_srs->srs_count +=
2991				    (saved_pkt_count - stats.ts_opackets);
2992				if (mac_srs->srs_last == NULL)
2993					mac_srs->srs_last = tail;
2994				MAC_TX_SRS_BLOCK(mac_srs, head);
2995			} else {
2996				srs_tx->st_woken_up = B_FALSE;
2997				if (is_subflow) {
2998					FLOW_TX_STATS_UPDATE(
2999					    mac_srs->srs_flent, &stats);
3000				}
3001			}
3002		}
3003	} else if (tx_mode == SRS_TX_BW) {
3004		/*
3005		 * We are here because the timer fired and we have some data
3006		 * to transmit. Also, mac_tx_srs_worker should have reset the
3007		 * SRS_BW_ENFORCED flag.
3008		 */
3009		ASSERT(!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED));
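		/*
		 * Dequeue packets one at a time, charging each against this
		 * tick's bandwidth quota. If the quota is exhausted and the
		 * tick hasn't rolled over, set SRS_BW_ENFORCED and stop;
		 * whatever was picked up is then sent below.
		 */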
3010		head = tail = mac_srs->srs_first;
3011		while (mac_srs->srs_first != NULL) {
3012			tail = mac_srs->srs_first;
3013			tail->b_prev = NULL;
3014			mac_srs->srs_first = tail->b_next;
3015			if (mac_srs->srs_first == NULL)
3016				mac_srs->srs_last = NULL;
3017			mac_srs->srs_count--;
3018			sz = msgdsize(tail);
3019			mac_srs->srs_size -= sz;
3020			saved_pkt_count++;
3021			MAC_TX_UPDATE_BW_INFO(mac_srs, sz);
3022
3023			if (mac_srs->srs_bw->mac_bw_used <
3024			    mac_srs->srs_bw->mac_bw_limit)
3025				continue;
3026
3027			if (mac_srs->srs_bw->mac_bw_curr_time != lbolt) {
3028				mac_srs->srs_bw->mac_bw_curr_time = lbolt;
3029				mac_srs->srs_bw->mac_bw_used = sz;
3030				continue;
3031			}
3032			mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
3033			break;
3034		}
3035
3036		ASSERT((head == NULL && tail == NULL) ||
3037		    (head != NULL && tail != NULL));
3038		if (tail != NULL) {
3039			tail->b_next = NULL;
3040			mutex_exit(&mac_srs->srs_lock);
3041
3042			head = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
3043			    head, &stats);
3044
3045			mutex_enter(&mac_srs->srs_lock);
3046			if (head != NULL) {
3047				uint_t size_sent;
3048
3049				/* Device out of tx desc, set block */
3050				if (head->b_next == NULL)
3051					VERIFY(head == tail);
3052				tail->b_next = mac_srs->srs_first;
3053				mac_srs->srs_first = head;
3054				mac_srs->srs_count +=
3055				    (saved_pkt_count - stats.ts_opackets);
3056				if (mac_srs->srs_last == NULL)
3057					mac_srs->srs_last = tail;
3058				size_sent = sz - stats.ts_obytes;
3059				mac_srs->srs_size += size_sent;
3060				mac_srs->srs_bw->mac_bw_sz += size_sent;
3061				if (mac_srs->srs_bw->mac_bw_used > size_sent) {
3062					mac_srs->srs_bw->mac_bw_used -=
3063					    size_sent;
3064				} else {
3065					mac_srs->srs_bw->mac_bw_used = 0;
3066				}
3067				MAC_TX_SRS_BLOCK(mac_srs, head);
3068			} else {
3069				srs_tx->st_woken_up = B_FALSE;
3070				if (is_subflow) {
3071					FLOW_TX_STATS_UPDATE(
3072					    mac_srs->srs_flent, &stats);
3073				}
3074			}
3075		}
3076	} else if (tx_mode == SRS_TX_BW_FANOUT) {
3077		mblk_t *prev;
3078		mac_soft_ring_t *softring;
3079		uint64_t hint;
3080
3081		/*
3082		 * We are here because the timer fired and we
3083		 * have some quota to transmit.
3084		 */
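		/*
		 * The fanout hint was stashed in b_prev at enqueue time (see
		 * mac_tx_srs_enqueue()). Consecutive packets carrying the
		 * same hint are batched and sent to the same soft ring.
		 */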
3085		prev = NULL;
3086		head = tail = mac_srs->srs_first;
3087		while (mac_srs->srs_first != NULL) {
3088			tail = mac_srs->srs_first;
3089			mac_srs->srs_first = tail->b_next;
3090			if (mac_srs->srs_first == NULL)
3091				mac_srs->srs_last = NULL;
3092			mac_srs->srs_count--;
3093			sz = msgdsize(tail);
3094			mac_srs->srs_size -= sz;
3095			mac_srs->srs_bw->mac_bw_used += sz;
3096			if (prev == NULL)
3097				hint = (ulong_t)tail->b_prev;
3098			if (hint != (ulong_t)tail->b_prev) {
3099				prev->b_next = NULL;
3100				mutex_exit(&mac_srs->srs_lock);
3101				TX_SRS_TO_SOFT_RING(mac_srs, head, hint);
3102				head = tail;
3103				hint = (ulong_t)tail->b_prev;
3104				mutex_enter(&mac_srs->srs_lock);
3105			}
3106
3107			prev = tail;
3108			tail->b_prev = NULL;
3109			if (mac_srs->srs_bw->mac_bw_used <
3110			    mac_srs->srs_bw->mac_bw_limit)
3111				continue;
3112
3113			if (mac_srs->srs_bw->mac_bw_curr_time != lbolt) {
3114				mac_srs->srs_bw->mac_bw_curr_time = lbolt;
3115				mac_srs->srs_bw->mac_bw_used = 0;
3116				continue;
3117			}
3118			mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
3119			break;
3120		}
3121		ASSERT((head == NULL && tail == NULL) ||
3122		    (head != NULL && tail != NULL));
3123		if (tail != NULL) {
3124			tail->b_next = NULL;
3125			mutex_exit(&mac_srs->srs_lock);
3126			TX_SRS_TO_SOFT_RING(mac_srs, head, hint);
3127			mutex_enter(&mac_srs->srs_lock);
3128		}
3129	}
3130	/*
3131	 * SRS_TX_FANOUT case not considered here because packets
3132	 * won't be queued in the SRS for this case. Packets will
3133	 * be sent directly to soft rings underneath and if there
3134	 * is any queueing at all, it would be in Tx side soft
3135	 * rings.
3136	 */
3137
3138	/*
3139	 * When srs_count becomes 0, reset SRS_TX_HIWAT and
3140	 * SRS_TX_WAKEUP_CLIENT and wakeup registered clients.
3141	 */
3142	if (mac_srs->srs_count == 0 && (mac_srs->srs_state &
3143	    (SRS_TX_HIWAT | SRS_TX_WAKEUP_CLIENT | SRS_ENQUEUED))) {
3144		mac_tx_notify_cb_t *mtnfp;
3145		mac_cb_t *mcb;
3146		mac_client_impl_t *mcip = mac_srs->srs_mcip;
3147		boolean_t wakeup_required = B_FALSE;
3148
3149		if (mac_srs->srs_state &
3150		    (SRS_TX_HIWAT|SRS_TX_WAKEUP_CLIENT)) {
3151			wakeup_required = B_TRUE;
3152		}
3153		mac_srs->srs_state &= ~(SRS_TX_HIWAT |
3154		    SRS_TX_WAKEUP_CLIENT | SRS_ENQUEUED);
3155		mutex_exit(&mac_srs->srs_lock);
3156		if (wakeup_required) {
3157			/* Wake up clients that registered callbacks */
3158			MAC_CALLBACK_WALKER_INC(&mcip->mci_tx_notify_cb_info);
3159			for (mcb = mcip->mci_tx_notify_cb_list; mcb != NULL;
3160			    mcb = mcb->mcb_nextp) {
3161				mtnfp = (mac_tx_notify_cb_t *)mcb->mcb_objp;
3162				mtnfp->mtnf_fn(mtnfp->mtnf_arg,
3163				    (mac_tx_cookie_t)mac_srs);
3164			}
3165			MAC_CALLBACK_WALKER_DCR(&mcip->mci_tx_notify_cb_info,
3166			    &mcip->mci_tx_notify_cb_list);
3167			/*
3168			 * If the client is not the primary MAC client, then we
3169			 * need to send the notification to the client's upper
3170			 * MAC, i.e. mci_upper_mip.
3171			 */
3172			mac_tx_notify(mcip->mci_upper_mip != NULL ?
3173			    mcip->mci_upper_mip : mcip->mci_mip);
3174		}
3175		mutex_enter(&mac_srs->srs_lock);
3176	}
3177	mac_srs->srs_state &= ~SRS_PROC;
3178}
3179
3180/*
3181 * Given a packet, get the flow_entry that identifies the flow
3182 * to which that packet belongs. The flow_entry will contain
3183 * the transmit function to be used to send the packet. If the
3184 * function returns NULL, the packet should be sent using the
3185 * underlying NIC.
3186 */
3187static flow_entry_t *
3188mac_tx_classify(mac_impl_t *mip, mblk_t *mp)
3189{
3190	flow_entry_t		*flent = NULL;
3191	mac_client_impl_t	*mcip;
3192	int	err;
3193
3194	/*
3195	 * Do classification on the packet.
3196	 */
3197	err = mac_flow_lookup(mip->mi_flow_tab, mp, FLOW_OUTBOUND, &flent);
3198	if (err != 0)
3199		return (NULL);
3200
3201	/*
3202	 * This flent might just be an additional one on the MAC client,
3203	 * i.e. for classification purposes (different fdesc), however
3204	 * the resources, SRS et al., are in the mci_flent, so if
3205	 * this isn't the mci_flent, we need to get it.
3206	 */
3207	if ((mcip = flent->fe_mcip) != NULL && mcip->mci_flent != flent) {
3208		FLOW_REFRELE(flent);
3209		flent = mcip->mci_flent;
3210		FLOW_TRY_REFHOLD(flent, err);
3211		if (err != 0)
3212			return (NULL);
3213	}
3214
3215	return (flent);
3216}
3217
3218/*
3219 * This macro is only meant to be used by mac_tx_send().
3220 */
3221#define	CHECK_VID_AND_ADD_TAG(mp) {			\
3222	if (vid_check) {				\
3223		int err = 0;				\
3224							\
3225		MAC_VID_CHECK(src_mcip, (mp), err);	\
3226		if (err != 0) {				\
3227			freemsg((mp));			\
3228			(mp) = next;			\
3229			oerrors++;			\
3230			continue;			\
3231		}					\
3232	}						\
3233	if (add_tag) {					\
3234		(mp) = mac_add_vlan_tag((mp), 0, vid);	\
3235		if ((mp) == NULL) {			\
3236			(mp) = next;			\
3237			oerrors++;			\
3238			continue;			\
3239		}					\
3240	}						\
3241}
3242
3243mblk_t *
3244mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain,
3245    mac_tx_stats_t *stats)
3246{
3247	mac_client_impl_t *src_mcip = (mac_client_impl_t *)mch;
3248	mac_impl_t *mip = src_mcip->mci_mip;
3249	uint_t obytes = 0, opackets = 0, oerrors = 0;
3250	mblk_t *mp = NULL, *next;
3251	boolean_t vid_check, add_tag;
3252	uint16_t vid = 0;
3253
3254	if (mip->mi_nclients > 1) {
3255		vid_check = MAC_VID_CHECK_NEEDED(src_mcip);
3256		add_tag = MAC_TAG_NEEDED(src_mcip);
3257		if (add_tag)
3258			vid = mac_client_vid(mch);
3259	} else {
3260		ASSERT(mip->mi_nclients == 1);
3261		vid_check = add_tag = B_FALSE;
3262	}
3263
3264	/*
3265	 * Fastpath: if there's only one client and there are no
3266	 * multicast listeners, we simply send the packet down to the
3267	 * underlying NIC.
3268	 */
3269	if (mip->mi_nactiveclients == 1 && mip->mi_promisc_list == NULL)  {
3270		DTRACE_PROBE2(fastpath,
3271		    mac_client_impl_t *, src_mcip, mblk_t *, mp_chain);
3272
3273		mp = mp_chain;
3274		while (mp != NULL) {
3275			next = mp->b_next;
3276			mp->b_next = NULL;
3277			opackets++;
3278			obytes += (mp->b_cont == NULL ? MBLKL(mp) :
3279			    msgdsize(mp));
3280
3281			CHECK_VID_AND_ADD_TAG(mp);
3282			MAC_TX(mip, ring, mp,
3283			    ((src_mcip->mci_state_flags & MCIS_SHARE_BOUND) !=
3284			    0));
3285
3286			/*
3287			 * If the driver is out of descriptors and does a
3288			 * partial send it will return a chain of unsent
3289			 * mblks. Adjust the accounting stats.
3290			 */
3291			if (mp != NULL) {
3292				opackets--;
3293				obytes -= msgdsize(mp);
3294				mp->b_next = next;
3295				break;
3296			}
3297			mp = next;
3298		}
3299		goto done;
3300	}
3301
3302	/*
3303	 * No fastpath, we either have more than one MAC client
3304	 * defined on top of the same MAC, or one or more MAC
3305	 * client promiscuous callbacks.
3306	 */
3307	DTRACE_PROBE3(slowpath, mac_client_impl_t *,
3308	    src_mcip, int, mip->mi_nclients, mblk_t *, mp_chain);
3309
3310	mp = mp_chain;
3311	while (mp != NULL) {
3312		flow_entry_t *dst_flow_ent;
3313		void *flow_cookie;
3314		size_t	pkt_size;
3315		mblk_t *mp1;
3316
3317		next = mp->b_next;
3318		mp->b_next = NULL;
3319		opackets++;
3320		pkt_size = (mp->b_cont == NULL ? MBLKL(mp) : msgdsize(mp));
3321		obytes += pkt_size;
3322		CHECK_VID_AND_ADD_TAG(mp);
3323
3324		/*
3325		 * Check if there are promiscuous mode callbacks defined.
3326		 */
3327		if (mip->mi_promisc_list != NULL)
3328			mac_promisc_dispatch(mip, mp, src_mcip);
3329
3330		/*
3331		 * Find the destination.
3332		 */
3333		dst_flow_ent = mac_tx_classify(mip, mp);
3334
3335		if (dst_flow_ent != NULL) {
3336			size_t	hdrsize;
3337			int	err = 0;
3338
3339			if (mip->mi_info.mi_nativemedia == DL_ETHER) {
3340				struct ether_vlan_header *evhp =
3341				    (struct ether_vlan_header *)mp->b_rptr;
3342
3343				if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN)
3344					hdrsize = sizeof (*evhp);
3345				else
3346					hdrsize = sizeof (struct ether_header);
3347			} else {
3348				mac_header_info_t	mhi;
3349
3350				err = mac_header_info((mac_handle_t)mip,
3351				    mp, &mhi);
3352				if (err == 0)
3353					hdrsize = mhi.mhi_hdrsize;
3354			}
3355
3356			/*
3357			 * Got a matching flow. It's either another
3358			 * MAC client, or a broadcast/multicast flow.
3359			 * Make sure the packet size is within the
3360			 * allowed size. If not, drop the packet and
3361			 * move to the next packet.
3362			 */
3363			if (err != 0 ||
3364			    (pkt_size - hdrsize) > mip->mi_sdu_max) {
3365				oerrors++;
3366				DTRACE_PROBE2(loopback__drop, size_t, pkt_size,
3367				    mblk_t *, mp);
3368				freemsg(mp);
3369				mp = next;
3370				FLOW_REFRELE(dst_flow_ent);
3371				continue;
3372			}
3373			flow_cookie = mac_flow_get_client_cookie(dst_flow_ent);
3374			if (flow_cookie != NULL) {
3375				/*
3376				 * The vnic_bcast_send function expects
3377				 * to receive the sender MAC client
3378				 * as value for arg2.
3379				 */
3380				mac_bcast_send(flow_cookie, src_mcip, mp,
3381				    B_TRUE);
3382			} else {
3383				/*
3384				 * Loopback the packet to a
3385				 * local MAC client. We force a context
3386				 * switch if both source and destination
3387				 * MAC clients are used by IP, i.e. bypass
3388				 * is set.
3389				 */
3390				boolean_t do_switch;
3391				mac_client_impl_t *dst_mcip =
3392				    dst_flow_ent->fe_mcip;
3393
3394				do_switch = ((src_mcip->mci_state_flags &
3395				    dst_mcip->mci_state_flags &
3396				    MCIS_CLIENT_POLL_CAPABLE) != 0);
3397
3398				if ((mp1 = mac_fix_cksum(mp)) != NULL) {
3399					(dst_flow_ent->fe_cb_fn)(
3400					    dst_flow_ent->fe_cb_arg1,
3401					    dst_flow_ent->fe_cb_arg2,
3402					    mp1, do_switch);
3403				}
3404			}
3405			FLOW_REFRELE(dst_flow_ent);
3406		} else {
3407			/*
3408			 * Unknown destination, send via the underlying
3409			 * NIC.
3410			 */
3411			MAC_TX(mip, ring, mp,
3412			    ((src_mcip->mci_state_flags & MCIS_SHARE_BOUND) !=
3413			    0));
3414			if (mp != NULL) {
3415				/*
3416				 * Adjust for the last packet that
3417				 * could not be transmitted
3418				 */
3419				opackets--;
3420				obytes -= pkt_size;
3421				mp->b_next = next;
3422				break;
3423			}
3424		}
3425		mp = next;
3426	}
3427
3428done:
3429	src_mcip->mci_stat_obytes += obytes;
3430	src_mcip->mci_stat_opackets += opackets;
3431	src_mcip->mci_stat_oerrors += oerrors;
3432
3433	if (stats != NULL) {
3434		stats->ts_opackets = opackets;
3435		stats->ts_obytes = obytes;
3436		stats->ts_oerrors = oerrors;
3437	}
3438	return (mp);
3439}
3440
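/*
 * Illustrative sketch (not compiled): how the slowpath above derives
 * the MAC header size before checking the payload against the link's
 * maximum SDU. All ex_-prefixed names and constants are hypothetical
 * userland stand-ins, not the kernel structures used above.
 */
#if 0
#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <arpa/inet.h>

#define	EX_ETHERTYPE_VLAN	0x8100
#define	EX_ETHER_HDR_LEN	14	/* dst(6) + src(6) + type(2) */
#define	EX_ETHER_VLAN_HDR_LEN	18	/* + tpid(2) + tci(2) */

static size_t
ex_ether_hdrsize(const uint8_t *frame)
{
	uint16_t tpid;

	/* The TPID/ethertype field follows the two 6-byte addresses. */
	(void) memcpy(&tpid, frame + 12, sizeof (tpid));
	return (ntohs(tpid) == EX_ETHERTYPE_VLAN ?
	    EX_ETHER_VLAN_HDR_LEN : EX_ETHER_HDR_LEN);
}

/* Returns 1 when the payload fits within the allowed maximum SDU. */
static int
ex_sdu_ok(const uint8_t *frame, size_t pkt_size, size_t sdu_max)
{
	return (pkt_size - ex_ether_hdrsize(frame) <= sdu_max);
}
#endif
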
3441/*
3442 * mac_tx_srs_ring_present
3443 *
3444 * Returns whether the specified ring is part of the specified SRS.
3445 */
3446boolean_t
3447mac_tx_srs_ring_present(mac_soft_ring_set_t *srs, mac_ring_t *tx_ring)
3448{
3449	int i;
3450	mac_soft_ring_t *soft_ring;
3451
3452	if (srs->srs_tx.st_arg2 == tx_ring)
3453		return (B_TRUE);
3454
3455	for (i = 0; i < srs->srs_oth_ring_count; i++) {
3456		soft_ring =  srs->srs_oth_soft_rings[i];
3457		if (soft_ring->s_ring_tx_arg2 == tx_ring)
3458			return (B_TRUE);
3459	}
3460
3461	return (B_FALSE);
3462}
3463
3464/*
3465 * mac_tx_srs_wakeup
3466 *
3467 * Called when Tx descriptors become available. Wake up the appropriate
3468 * worker thread after resetting the SRS_TX_BLOCKED/S_RING_BLOCK bit
3469 * in the state field.
3470 */
3471void
3472mac_tx_srs_wakeup(mac_soft_ring_set_t *mac_srs, mac_ring_handle_t ring)
3473{
3474	int i;
3475	mac_soft_ring_t *sringp;
3476	mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;
3477
3478	mutex_enter(&mac_srs->srs_lock);
3479	if (TX_SINGLE_RING_MODE(mac_srs)) {
3480		if (srs_tx->st_arg2 == ring &&
3481		    mac_srs->srs_state & SRS_TX_BLOCKED) {
3482			mac_srs->srs_state &= ~SRS_TX_BLOCKED;
3483			srs_tx->st_unblocked_cnt++;
3484			cv_signal(&mac_srs->srs_async);
3485		}
3486		/*
3487		 * A wakeup can arrive before mac_tx_srs_drain() has
3488		 * grabbed the srs lock and set SRS_TX_BLOCKED, so
3489		 * always set the woken_up flag when we get here.
3490		 */
3491		srs_tx->st_woken_up = B_TRUE;
3492		mutex_exit(&mac_srs->srs_lock);
3493		return;
3494	}
3495
3496	/* If we get here, it is the FANOUT or BW_FANOUT case */
3497	ASSERT(TX_MULTI_RING_MODE(mac_srs));
3498	for (i = 0; i < mac_srs->srs_oth_ring_count; i++) {
3499		sringp = mac_srs->srs_oth_soft_rings[i];
3500		mutex_enter(&sringp->s_ring_lock);
3501		if (sringp->s_ring_tx_arg2 == ring) {
3502			if (sringp->s_ring_state & S_RING_BLOCK) {
3503				sringp->s_ring_state &= ~S_RING_BLOCK;
3504				sringp->s_ring_unblocked_cnt++;
3505				cv_signal(&sringp->s_ring_async);
3506			}
3507			sringp->s_ring_tx_woken_up = B_TRUE;
3508		}
3509		mutex_exit(&sringp->s_ring_lock);
3510	}
3511	mutex_exit(&mac_srs->srs_lock);
3512}
3513
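/*
 * Illustrative sketch (not compiled): the lost-wakeup pattern that the
 * st_woken_up/s_ring_tx_woken_up flags above guard against, recast
 * with pthreads. All ex_-prefixed names are hypothetical.
 */
#if 0
#include <pthread.h>

typedef struct ex_ring {
	pthread_mutex_t	ex_lock;
	pthread_cond_t	ex_cv;
	int		ex_blocked;	/* analogue of SRS_TX_BLOCKED */
	int		ex_woken_up;	/* analogue of st_woken_up */
} ex_ring_t;

/* Driver side: Tx descriptors freed up; cf. mac_tx_srs_wakeup(). */
static void
ex_wakeup(ex_ring_t *r)
{
	(void) pthread_mutex_lock(&r->ex_lock);
	if (r->ex_blocked) {
		r->ex_blocked = 0;
		(void) pthread_cond_signal(&r->ex_cv);
	}
	/*
	 * Record the wakeup even if the drain thread has not yet set
	 * ex_blocked, so that a wakeup arriving early is never lost.
	 */
	r->ex_woken_up = 1;
	(void) pthread_mutex_unlock(&r->ex_lock);
}

/* Drain side: only block if no wakeup raced ahead of us. */
static void
ex_drain_block(ex_ring_t *r)
{
	(void) pthread_mutex_lock(&r->ex_lock);
	if (!r->ex_woken_up) {
		r->ex_blocked = 1;
		while (r->ex_blocked)
			(void) pthread_cond_wait(&r->ex_cv, &r->ex_lock);
	}
	r->ex_woken_up = 0;
	(void) pthread_mutex_unlock(&r->ex_lock);
}
#endif
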
3514/*
3515 * Once the driver is done draining, send a MAC_NOTE_TX notification to unleash
3516 * the blocked clients again.
3517 */
3518void
3519mac_tx_notify(mac_impl_t *mip)
3520{
3521	i_mac_notify(mip, MAC_NOTE_TX);
3522}
3523
3524/*
3525 * RX SOFTRING RELATED FUNCTIONS
3526 *
3527 * These functions really belong in mac_soft_ring.c and are here
3528 * only for a short period.
3529 */
3530
3531#define	SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz) {	       	\
3532	/*								\
3533	 * Enqueue our mblk chain.					\
3534	 */								\
3535	ASSERT(MUTEX_HELD(&(ringp)->s_ring_lock));			\
3536									\
3537	if ((ringp)->s_ring_last != NULL)				\
3538		(ringp)->s_ring_last->b_next = (mp);			\
3539	else								\
3540		(ringp)->s_ring_first = (mp);				\
3541	(ringp)->s_ring_last = (tail);					\
3542	(ringp)->s_ring_count += (cnt);					\
3543	ASSERT((ringp)->s_ring_count > 0);				\
3544	if ((ringp)->s_ring_type & ST_RING_BW_CTL) {			\
3545		(ringp)->s_ring_size += (sz);				\
3546	}								\
3547}
3548
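/*
 * Illustrative sketch (not compiled): the constant-time chain splice
 * that SOFT_RING_ENQUEUE_CHAIN performs, restated as a function over a
 * simplified singly linked list. ex_node_t stands in for mblk_t and
 * its b_next linkage; all ex_ names are hypothetical.
 */
#if 0
#include <stddef.h>

typedef struct ex_node {
	struct ex_node	*ex_next;
} ex_node_t;

typedef struct ex_ring_q {
	ex_node_t	*ex_first;
	ex_node_t	*ex_last;
	int		ex_count;
	size_t		ex_size;
	int		ex_bw_ctl;	/* analogue of ST_RING_BW_CTL */
} ex_ring_q_t;

static void
ex_enqueue_chain(ex_ring_q_t *q, ex_node_t *head, ex_node_t *tail,
    int cnt, size_t sz)
{
	/* Splice the whole chain on in O(1) using the cached tail. */
	if (q->ex_last != NULL)
		q->ex_last->ex_next = head;
	else
		q->ex_first = head;
	q->ex_last = tail;
	q->ex_count += cnt;
	if (q->ex_bw_ctl)
		q->ex_size += sz;	/* bytes tracked under B/W control */
}
#endif
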
3549/*
3550 * Default entry point to deliver a packet chain to a MAC client.
3551 * If the MAC client has flows, do the classification with these
3552 * flows as well.
3553 */
3554/* ARGSUSED */
3555void
3556mac_rx_deliver(void *arg1, mac_resource_handle_t mrh, mblk_t *mp_chain,
3557    mac_header_info_t *arg3)
3558{
3559	mac_client_impl_t *mcip = arg1;
3560
3561	if (mcip->mci_nvids == 1 &&
3562	    !(mcip->mci_state_flags & MCIS_STRIP_DISABLE)) {
3563		/*
3564		 * If the client has exactly one VID associated with it
3565		 * and stripping of the VLAN header is not disabled,
3566		 * remove the VLAN tag from the packet before
3567		 * passing it on to the client's receive callback.
3568		 * Note that this needs to be done after we dispatch
3569		 * the packet to the promiscuous listeners of the
3570		 * client, since they expect to see the whole
3571		 * frame including the VLAN headers.
3572		 */
3573		mp_chain = mac_strip_vlan_tag_chain(mp_chain);
3574	}
3575
3576	mcip->mci_rx_fn(mcip->mci_rx_arg, mrh, mp_chain, B_FALSE);
3577}
3578
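/*
 * Illustrative sketch (not compiled): one way to strip the 4-byte
 * 802.1Q tag from an Ethernet frame held in a flat buffer, as
 * mac_strip_vlan_tag_chain() does per mblk above. Simplified: real
 * code must also cope with chained buffers and shared dblks. All
 * ex_ names are hypothetical.
 */
#if 0
#include <stdint.h>
#include <string.h>

#define	EX_ADDRS_LEN	12	/* dst(6) + src(6) */
#define	EX_VLAN_TAGSZ	4	/* tpid(2) + tci(2) */

/* Returns the new start of the frame within buf. */
static uint8_t *
ex_strip_vlan_tag(uint8_t *buf)
{
	/* Slide the two MAC addresses up over the tag. */
	(void) memmove(buf + EX_VLAN_TAGSZ, buf, EX_ADDRS_LEN);
	return (buf + EX_VLAN_TAGSZ);
}
#endif
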
3579/*
3580 * mac_rx_soft_ring_process
3581 *
3582 * Process a chain for a given soft ring. If the number of packets
3583 * queued in the SRS and its associated soft rings (including this
3584 * one) is very small (tracked by srs_poll_pkt_cnt), allow the
3585 * entering thread (interrupt or poll thread) to do the processing
3586 * inline. This helps keep the latency down under low load.
3587 *
3588 * The proc and arg for each mblk are already stored in the mblk
3589 * in the appropriate places.
3590 */
3591/* ARGSUSED */
3592void
3593mac_rx_soft_ring_process(mac_client_impl_t *mcip, mac_soft_ring_t *ringp,
3594    mblk_t *mp_chain, mblk_t *tail, int cnt, size_t sz)
3595{
3596	mac_direct_rx_t		proc;
3597	void			*arg1;
3598	mac_resource_handle_t	arg2;
3599	mac_soft_ring_set_t	*mac_srs = ringp->s_ring_set;
3600
3601	ASSERT(ringp != NULL);
3602	ASSERT(mp_chain != NULL);
3603	ASSERT(tail != NULL);
3604	ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
3605
3606	mutex_enter(&ringp->s_ring_lock);
3607	ringp->s_ring_total_inpkt += cnt;
3608	if ((mac_srs->srs_rx.sr_poll_pkt_cnt <= 1) &&
3609	    !(ringp->s_ring_type & ST_RING_WORKER_ONLY)) {
3610		/* If on processor or blanking on, then enqueue and return */
3611		if (ringp->s_ring_state & S_RING_BLANK ||
3612		    ringp->s_ring_state & S_RING_PROC) {
3613			SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
3614			mutex_exit(&ringp->s_ring_lock);
3615			return;
3616		}
3617		proc = ringp->s_ring_rx_func;
3618		arg1 = ringp->s_ring_rx_arg1;
3619		arg2 = ringp->s_ring_rx_arg2;
3620		/*
3621		 * See if anything is already queued. If we are the
3622		 * first packet, do inline processing else queue the
3623		 * packet and do the drain.
3624		 */
3625		if (ringp->s_ring_first == NULL) {
3626			/*
3627			 * Fast-path, ok to process and nothing queued.
3628			 */
3629			ringp->s_ring_run = curthread;
3630			ringp->s_ring_state |= (S_RING_PROC);
3631
3632			mutex_exit(&ringp->s_ring_lock);
3633
3634			/*
3635			 * We have a chain of exactly one packet,
3636			 * so take this fast path.
3637			 */
3638			ASSERT(mp_chain->b_next == NULL);
3639
3640			(*proc)(arg1, arg2, mp_chain, NULL);
3641
3642			ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
3643			/*
3644			 * If we have a soft ring set which is doing
3645			 * bandwidth control, we need to decrement
3646			 * srs_size and count so that the SRS has an
3647			 * accurate idea of how much data is really
3648			 * queued between the SRS and its soft rings.
3649			 * We decrement the counters only when the packet
3650			 * is processed by both the SRS and the soft ring.
3651			 */
3652			mutex_enter(&mac_srs->srs_lock);
3653			MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
3654			MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz);
3655			mutex_exit(&mac_srs->srs_lock);
3656
3657			mutex_enter(&ringp->s_ring_lock);
3658			ringp->s_ring_run = NULL;
3659			ringp->s_ring_state &= ~S_RING_PROC;
3660			if (ringp->s_ring_state & S_RING_CLIENT_WAIT)
3661				cv_signal(&ringp->s_ring_client_cv);
3662
3663			if ((ringp->s_ring_first == NULL) ||
3664			    (ringp->s_ring_state & S_RING_BLANK)) {
3665				/*
3666				 * We processed inline our packet and
3667				 * nothing new has arrived or our
3668				 * receiver doesn't want to receive
3669				 * any packets. We are done.
3670				 */
3671				mutex_exit(&ringp->s_ring_lock);
3672				return;
3673			}
3674		} else {
3675			SOFT_RING_ENQUEUE_CHAIN(ringp,
3676			    mp_chain, tail, cnt, sz);
3677		}
3678
3679		/*
3680		 * We are here because either we couldn't do inline
3681		 * processing (because something was already
3682		 * queued), or we had a chain of more than one
3683		 * packet, or something else arrived after we were
3684		 * done with inline processing.
3685		 */
3686		ASSERT(MUTEX_HELD(&ringp->s_ring_lock));
3687		ASSERT(ringp->s_ring_first != NULL);
3688
3689		ringp->s_ring_drain_func(ringp);
3690		mutex_exit(&ringp->s_ring_lock);
3691		return;
3692	} else {
3693		/* ST_RING_WORKER_ONLY case */
3694		SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
3695		mac_soft_ring_worker_wakeup(ringp);
3696		mutex_exit(&ringp->s_ring_lock);
3697	}
3698}
3699
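/*
 * Illustrative sketch (not compiled): the inline-versus-queue decision
 * made by mac_rx_soft_ring_process(), reduced to a skeleton. All ex_
 * names are hypothetical; ex_enqueue/ex_drain/ex_deliver stand in for
 * the enqueue macro, the drain function and the client callback, and
 * all locking is elided.
 */
#if 0
#include <stdio.h>

typedef struct ex_sring {
	int	ex_busy;	/* analogue of S_RING_PROC | S_RING_BLANK */
	int	ex_queued;	/* analogue of s_ring_first != NULL */
	int	ex_load;	/* analogue of srs_rx.sr_poll_pkt_cnt */
} ex_sring_t;

static void
ex_deliver(void *pkt)
{
	(void) printf("delivered %p\n", pkt);
}

static void
ex_enqueue(ex_sring_t *r, void *pkt)
{
	(void) pkt;
	r->ex_queued++;		/* list handling elided; count only */
}

static void
ex_drain(ex_sring_t *r)
{
	r->ex_queued = 0;	/* drain elided */
}

static void
ex_process(ex_sring_t *r, void *pkt)
{
	if (r->ex_load <= 1 && !r->ex_busy && !r->ex_queued) {
		/*
		 * Low load, not already processing and nothing queued
		 * ahead of us: deliver inline from the calling thread
		 * to keep latency down.
		 */
		r->ex_busy = 1;
		ex_deliver(pkt);
		r->ex_busy = 0;
		if (r->ex_queued)	/* something arrived meanwhile */
			ex_drain(r);
	} else {
		/* Preserve ordering: queue behind earlier packets. */
		ex_enqueue(r, pkt);
		if (!r->ex_busy)
			ex_drain(r);
	}
}
#endif
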
3700/*
3701 * TX SOFTRING RELATED FUNCTIONS
3702 *
3703 * These functions really belong in mac_soft_ring.c and are here
3704 * only for a short period.
3705 */
3706
3707#define	TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz) {	       	\
3708	ASSERT(MUTEX_HELD(&ringp->s_ring_lock));			\
3709	ringp->s_ring_state |= S_RING_ENQUEUED;				\
3710	SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz);		\
3711}
3712
3713/*
3714 * mac_tx_sring_enqueue
3715 *
3716 * When we are out of transmit descriptors and we already have a
3717 * queue that exceeds hiwat (or the client called us with the
3718 * MAC_TX_NO_ENQUEUE or MAC_DROP_ON_NO_DESC flag), return the
3719 * soft ring pointer as the opaque cookie so that the client can
3720 * enable flow control.
3721 */
3722static mac_tx_cookie_t
3723mac_tx_sring_enqueue(mac_soft_ring_t *ringp, mblk_t *mp_chain, uint16_t flag,
3724    mblk_t **ret_mp)
3725{
3726	int cnt;
3727	size_t sz;
3728	mblk_t *tail;
3729	mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
3730	mac_tx_cookie_t cookie = NULL;
3731	boolean_t wakeup_worker = B_TRUE;
3732
3733	ASSERT(MUTEX_HELD(&ringp->s_ring_lock));
3734	MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
3735	if (flag & MAC_DROP_ON_NO_DESC) {
3736		mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE);
3737		/* increment freed stats */
3738		ringp->s_ring_drops += cnt;
3739		cookie = (mac_tx_cookie_t)ringp;
3740	} else {
3741		if (ringp->s_ring_first != NULL)
3742			wakeup_worker = B_FALSE;
3743
3744		if (flag & MAC_TX_NO_ENQUEUE) {
3745			/*
3746			 * If QUEUED is not set, queue the packet
3747			 * and let mac_tx_soft_ring_drain() set
3748			 * the TX_BLOCKED bit for the reasons
3749			 * explained above. Otherwise, return the
3750			 * mblks.
3751			 */
3752			if (wakeup_worker) {
3753				TX_SOFT_RING_ENQUEUE_CHAIN(ringp,
3754				    mp_chain, tail, cnt, sz);
3755			} else {
3756				ringp->s_ring_state |= S_RING_WAKEUP_CLIENT;
3757				cookie = (mac_tx_cookie_t)ringp;
3758				*ret_mp = mp_chain;
3759			}
3760		} else {
3761			boolean_t enqueue = B_TRUE;
3762
3763			if (ringp->s_ring_count > ringp->s_ring_tx_hiwat) {
3764				/*
3765				 * Flow-controlled. Store ringp in the
3766				 * cookie so that it can be returned to
3767				 * the client as a mac_tx_cookie_t.
3768				 */
3769				ringp->s_ring_state |= S_RING_TX_HIWAT;
3770				cookie = (mac_tx_cookie_t)ringp;
3771				ringp->s_ring_hiwat_cnt++;
3772				if (ringp->s_ring_count >
3773				    ringp->s_ring_tx_max_q_cnt) {
3774					/* increment freed stats */
3775					ringp->s_ring_drops += cnt;
3776					/*
3777					 * b_prev may be set to the fanout hint,
3778					 * so we can't call freemsg() directly.
3779					 */
3780					mac_pkt_drop(NULL, NULL,
3781					    mp_chain, B_FALSE);
3782					DTRACE_PROBE1(tx_queued_hiwat,
3783					    mac_soft_ring_t *, ringp);
3784					enqueue = B_FALSE;
3785				}
3786			}
3787			if (enqueue) {
3788				TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain,
3789				    tail, cnt, sz);
3790			}
3791		}
3792		if (wakeup_worker)
3793			cv_signal(&ringp->s_ring_async);
3794	}
3795	return (cookie);
3796}
3797
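/*
 * Illustrative sketch (not compiled): the two-threshold backpressure
 * policy of mac_tx_sring_enqueue(). Below s_ring_tx_hiwat, packets
 * queue freely; above it the caller receives a non-NULL cookie so it
 * can assert flow control; above s_ring_tx_max_q_cnt the chain is
 * dropped. All ex_ names are hypothetical stand-ins.
 */
#if 0
#include <stddef.h>
#include <stdlib.h>

typedef struct ex_txq {
	int	ex_count;	/* packets currently queued */
	int	ex_hiwat;	/* analogue of s_ring_tx_hiwat */
	int	ex_max_q;	/* analogue of s_ring_tx_max_q_cnt */
	int	ex_drops;	/* analogue of s_ring_drops */
} ex_txq_t;

/* Returns NULL when no flow control is needed, else an opaque cookie. */
static void *
ex_txq_enqueue(ex_txq_t *q, void *chain, int cnt)
{
	void *cookie = NULL;

	if (q->ex_count > q->ex_hiwat) {
		cookie = q;			/* caller must back off */
		if (q->ex_count > q->ex_max_q) {
			q->ex_drops += cnt;	/* queue exhausted: drop */
			free(chain);		/* stand-in for mac_pkt_drop() */
			return (cookie);
		}
	}
	q->ex_count += cnt;			/* list splice elided */
	(void) chain;
	return (cookie);
}
#endif
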
3798
3799/*
3800 * mac_tx_soft_ring_process
3801 *
3802 * This routine is called when fanning out outgoing traffic among
3803 * multiple Tx rings.
3804 * Note that a soft ring is associated with a h/w Tx ring.
3805 */
3806mac_tx_cookie_t
3807mac_tx_soft_ring_process(mac_soft_ring_t *ringp, mblk_t *mp_chain,
3808    uint16_t flag, mblk_t **ret_mp)
3809{
3810	mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
3811	int	cnt;
3812	size_t	sz;
3813	mblk_t	*tail;
3814	mac_tx_cookie_t cookie = NULL;
3815
3816	ASSERT(ringp != NULL);
3817	ASSERT(mp_chain != NULL);
3818	ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
3819	/*
3820	 * Only two modes can come here: either
3821	 * SRS_TX_BW_FANOUT or SRS_TX_FANOUT.
3822	 */
3823	ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_FANOUT ||
3824	    mac_srs->srs_tx.st_mode == SRS_TX_BW_FANOUT);
3825
3826	if (ringp->s_ring_type & ST_RING_WORKER_ONLY) {
3827		/* Serialization mode */
3828
3829		mutex_enter(&ringp->s_ring_lock);
3830		if (ringp->s_ring_count > ringp->s_ring_tx_hiwat) {
3831			cookie = mac_tx_sring_enqueue(ringp, mp_chain,
3832			    flag, ret_mp);
3833			mutex_exit(&ringp->s_ring_lock);
3834			return (cookie);
3835		}
3836		MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
3837		TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
3838		if (ringp->s_ring_state & (S_RING_BLOCK | S_RING_PROC)) {
3839			/*
3840			 * If the ring is blocked due to lack of Tx
3841			 * descriptors, just return. The worker
3842			 * thread will get scheduled when Tx
3843			 * descriptors become available.
3844			 */
3845			mutex_exit(&ringp->s_ring_lock);
3846			return (cookie);
3847		}
3848		mac_soft_ring_worker_wakeup(ringp);
3849		mutex_exit(&ringp->s_ring_lock);
3850		return (cookie);
3851	} else {
3852		/* Default fanout mode */
3853		/*
3854		 * S_RING_BLOCK is set when the underlying NIC runs
3855		 * out of Tx descriptors and messages start getting
3856		 * queued. It won't get reset until
3857		 * mac_tx_soft_ring_drain() completely drains out
3858		 * the messages.
3859		 */
3860		boolean_t		is_subflow;
3861		mac_tx_stats_t		stats;
3862
3863		if (ringp->s_ring_state & S_RING_ENQUEUED) {
3864			/* Tx descs/resources not available */
3865			mutex_enter(&ringp->s_ring_lock);
3866			if (ringp->s_ring_state & S_RING_ENQUEUED) {
3867				cookie = mac_tx_sring_enqueue(ringp, mp_chain,
3868				    flag, ret_mp);
3869				mutex_exit(&ringp->s_ring_lock);
3870				return (cookie);
3871			}
3872			/*
3873			 * While we were grabbing s_ring_lock, the
3874			 * flow control condition was relieved.
3875			 * Continue with the transmission.
3876			 */
3877			mutex_exit(&ringp->s_ring_lock);
3878		}
3879		is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0);
3880
3881		mp_chain = mac_tx_send(ringp->s_ring_tx_arg1,
3882		    ringp->s_ring_tx_arg2, mp_chain,
3883		    (is_subflow ? &stats : NULL));
3884
3885		/*
3886		 * Multiple threads could be here sending packets.
3887		 * Under such conditions, it is not possible to
3888		 * atomically set the S_RING_BLOCK bit to indicate
3889		 * an out-of-Tx-descriptor condition. To set it
3890		 * atomically, we queue the returned packet and do
3891		 * the setting of S_RING_BLOCK in
3892		 * mac_tx_soft_ring_drain().
3893		 */
3894		if (mp_chain != NULL) {
3895			mutex_enter(&ringp->s_ring_lock);
3896			cookie =
3897			    mac_tx_sring_enqueue(ringp, mp_chain, flag, ret_mp);
3898			mutex_exit(&ringp->s_ring_lock);
3899			return (cookie);
3900		}
3901		if (is_subflow) {
3902			FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats);
3903		}
3904		return (NULL);
3905	}
3906}
3907
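/*
 * Illustrative sketch (not compiled): the unlocked-then-locked recheck
 * of S_RING_ENQUEUED in mac_tx_soft_ring_process() above. The cheap
 * unlocked read keeps the lock off the common send path; the retest
 * under the lock makes the authoritative decision. All ex_ names are
 * hypothetical.
 */
#if 0
#include <pthread.h>

typedef struct ex_tx_ring {
	pthread_mutex_t	ex_lock;
	int		ex_enqueued;	/* analogue of S_RING_ENQUEUED */
	int		ex_backlog;	/* queueing elided; count only */
} ex_tx_ring_t;

/* Stand-in for handing the chain to the hardware send path. */
static int
ex_hw_send(void *chain)
{
	(void) chain;
	return (0);		/* pretend the NIC accepted the chain */
}

static void
ex_send(ex_tx_ring_t *r, void *chain)
{
	if (r->ex_enqueued) {		/* racy hint, cheap to read */
		(void) pthread_mutex_lock(&r->ex_lock);
		if (r->ex_enqueued) {	/* authoritative retest */
			r->ex_backlog++;
			(void) pthread_mutex_unlock(&r->ex_lock);
			return;
		}
		/* Flow control was lifted while we took the lock. */
		(void) pthread_mutex_unlock(&r->ex_lock);
	}
	(void) ex_hw_send(chain);
}
#endif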