mac_datapath_setup.c revision 11878:ac93462db6d7
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/callb.h>
#include <sys/cpupart.h>
#include <sys/pool.h>
#include <sys/pool_pset.h>
#include <sys/sdt.h>
#include <sys/strsubr.h>
#include <sys/strsun.h>
#include <sys/vlan.h>
#include <inet/ipsec_impl.h>
#include <inet/ip_impl.h>
#include <inet/sadb.h>
#include <inet/ipsecesp.h>
#include <inet/ipsecah.h>

#include <sys/mac_impl.h>
#include <sys/mac_client_impl.h>
#include <sys/mac_client_priv.h>
#include <sys/mac_soft_ring.h>
#include <sys/mac_flow_impl.h>
#include <sys/mac_stat.h>

static void mac_srs_soft_rings_signal(mac_soft_ring_set_t *, uint_t);
static void mac_srs_update_fanout_list(mac_soft_ring_set_t *);
static void mac_srs_poll_unbind(mac_soft_ring_set_t *);
static void mac_srs_worker_unbind(mac_soft_ring_set_t *);
static void mac_srs_soft_rings_quiesce(mac_soft_ring_set_t *, uint_t);

static int mac_srs_cpu_setup(cpu_setup_t, int, void *);
static void mac_srs_worker_bind(mac_soft_ring_set_t *, processorid_t);
static void mac_srs_poll_bind(mac_soft_ring_set_t *, processorid_t);
static void mac_srs_threads_unbind(mac_soft_ring_set_t *);
static void mac_srs_add_glist(mac_soft_ring_set_t *);
static void mac_srs_remove_glist(mac_soft_ring_set_t *);
static void mac_srs_fanout_list_free(mac_soft_ring_set_t *);
static void mac_soft_ring_remove(mac_soft_ring_set_t *, mac_soft_ring_t *);

static int mac_compute_soft_ring_count(flow_entry_t *, int, int);
static void mac_walk_srs_and_bind(int);
static void mac_walk_srs_and_unbind(int);

extern boolean_t mac_latency_optimize;

static kmem_cache_t *mac_srs_cache;
kmem_cache_t *mac_soft_ring_cache;

/*
 * The duration in msec we wait before signalling the soft ring
 * worker thread in case packets get queued.
 */
uint32_t mac_soft_ring_worker_wait = 0;

/*
 * A global tunable for turning polling on/off. By default, dynamic
 * polling is always on and is always very beneficial. It should be
 * turned off only with absolute care, and only for the rare workload
 * that is sensitive to very low latency.
 */
int mac_poll_enable = B_TRUE;

/*
 * Need to set mac_soft_ring_max_q_cnt based on bandwidth and perhaps latency.
 * Large values could end up consuming a lot of system memory and cause a
 * system hang.
 */
int mac_soft_ring_max_q_cnt = 1024;
int mac_soft_ring_min_q_cnt = 256;
int mac_soft_ring_poll_thres = 16;

boolean_t mac_tx_serialize = B_FALSE;

/*
 * mac_tx_srs_hiwat is the queue depth threshold at which callers of
 * mac_tx() will be notified of the flow control condition.
 *
 * TCP does not honour the flow control condition sent up by mac_tx().
 * Thus provision is made for TCP to allow more packets to be queued
 * in the SRS up to a maximum of mac_tx_srs_max_q_cnt.
 *
 * Note that mac_tx_srs_hiwat is always less than
 * mac_tx_srs_max_q_cnt.
 */
uint32_t mac_tx_srs_max_q_cnt = 100000;
uint32_t mac_tx_srs_hiwat = 1000;
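
/*
 * For example, with the defaults above a caller of mac_tx() is told to
 * back off once roughly 1,000 packets are queued in the Tx SRS, while
 * TCP (which ignores that notification) may keep queueing up to
 * 100,000 packets before the SRS starts dropping.
 */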

/*
 * mac_rx_soft_ring_count, mac_rx_soft_ring_10gig_count:
 *
 * Global tunables that determine the number of soft rings to be used for
 * fanning out incoming traffic on a link. These counts will be used only
 * when no explicit set of CPUs was assigned to the data-links.
 *
 * The mac_rx_soft_ring_count tunable comes into effect only if
 * mac_soft_ring_enable is set. mac_soft_ring_enable is turned on by
 * default only for sun4v platforms.
 *
 * mac_rx_soft_ring_10gig_count comes into effect if you are running on a
 * 10Gbps link and is not dependent upon mac_soft_ring_enable.
 *
 * The number of soft rings for fanout for a link or a flow is determined
 * by the mac_compute_soft_ring_count() routine. This routine takes into
 * account mac_soft_ring_enable, mac_rx_soft_ring_count and
 * mac_rx_soft_ring_10gig_count to determine the soft ring count for a link.
 *
 * If a bandwidth is specified, the determination of the number of soft
 * rings is based on the specified bandwidth, the CPU speed and the number
 * of CPUs in the system.
 */
uint_t mac_rx_soft_ring_count = 8;
uint_t mac_rx_soft_ring_10gig_count = 8;

/*
 * Every Tx and Rx mac_soft_ring_set_t (mac_srs) created gets added
 * to mac_srs_g_list, and mac_srs_g_lock protects mac_srs_g_list. The
 * list is used to walk the list of all MAC threads when a CPU is
 * coming online or going offline.
 */
static mac_soft_ring_set_t *mac_srs_g_list = NULL;
static krwlock_t mac_srs_g_lock;

/*
 * Whether the SRS threads should be bound, or not.
 */
boolean_t mac_srs_thread_bind = B_TRUE;

/*
 * Whether Rx/Tx interrupts should be re-targeted. Disabled by default.
 * The dladm command can override this.
 */
boolean_t mac_tx_intr_retarget = B_FALSE;
boolean_t mac_rx_intr_retarget = B_FALSE;

/*
 * If cpu bindings are specified by the user, then the Tx SRS and its soft
 * rings should also be bound to the CPUs specified by the user. The
 * CPUs for Tx bindings are at the end of the cpu list provided by
 * the user. If not enough CPUs are available (for Tx and Rx
 * SRSes), then the CPUs are shared by both Tx and Rx SRSes.
 */
#define	BIND_TX_SRS_AND_SOFT_RINGS(mac_tx_srs, mrp) {			\
	processorid_t cpuid;						\
	int i;								\
	mac_soft_ring_t *softring;					\
	mac_cpus_t *srs_cpu;						\
									\
	srs_cpu = &mac_tx_srs->srs_cpu;					\
	cpuid = srs_cpu->mc_tx_fanout_cpus[0];				\
	mac_srs_worker_bind(mac_tx_srs, cpuid);				\
	if (MAC_TX_SOFT_RINGS(mac_tx_srs)) {				\
		for (i = 0; i < mac_tx_srs->srs_tx_ring_count; i++) {	\
			cpuid = srs_cpu->mc_tx_fanout_cpus[i];		\
			softring = mac_tx_srs->srs_tx_soft_rings[i];	\
			if (cpuid != -1) {				\
				(void) mac_soft_ring_bind(softring,	\
				    cpuid);				\
			}						\
		}							\
	}								\
}

/*
 * Re-targeting is allowed only for an exclusive group or for the
 * primary client.
 */
#define	RETARGETABLE_CLIENT(group, mcip)				\
	((((group) != NULL) &&						\
	    ((group)->mrg_state == MAC_GROUP_STATE_RESERVED)) ||	\
	    mac_is_primary_client(mcip))

#define	MAC_RING_RETARGETABLE(ring)					\
	(((ring) != NULL) &&						\
	    ((ring)->mr_info.mri_intr.mi_ddi_handle != NULL) &&		\
	    !((ring)->mr_info.mri_intr.mi_ddi_shared))


/* INIT and FINI ROUTINES */

void
mac_soft_ring_init(void)
{
	mac_soft_ring_cache = kmem_cache_create("mac_soft_ring_cache",
	    sizeof (mac_soft_ring_t), 64, NULL, NULL, NULL, NULL, NULL, 0);

	mac_srs_cache = kmem_cache_create("mac_srs_cache",
	    sizeof (mac_soft_ring_set_t),
	    64, NULL, NULL, NULL, NULL, NULL, 0);

	rw_init(&mac_srs_g_lock, NULL, RW_DEFAULT, NULL);
	mutex_enter(&cpu_lock);
	register_cpu_setup_func(mac_srs_cpu_setup, NULL);
	mutex_exit(&cpu_lock);
}

void
mac_soft_ring_finish(void)
{
	mutex_enter(&cpu_lock);
	unregister_cpu_setup_func(mac_srs_cpu_setup, NULL);
	mutex_exit(&cpu_lock);
	rw_destroy(&mac_srs_g_lock);
	kmem_cache_destroy(mac_soft_ring_cache);
	kmem_cache_destroy(mac_srs_cache);
}

static void
mac_srs_soft_rings_free(mac_soft_ring_set_t *mac_srs)
{
	mac_soft_ring_t	*softring, *next, *head;

	/*
	 * Synchronize with mac_walk_srs_and_bind/unbind, which are callbacks
	 * from DR. The callbacks from DR are called with cpu_lock held, and
	 * hence can't wait to grab the mac perimeter. The soft ring list is
	 * hence protected for read access by srs_lock. Changing the soft
	 * ring list needs the mac perimeter and the srs_lock.
	 */
	mutex_enter(&mac_srs->srs_lock);

	head = mac_srs->srs_soft_ring_head;
	mac_srs->srs_soft_ring_head = NULL;
	mac_srs->srs_soft_ring_tail = NULL;
	mac_srs->srs_soft_ring_count = 0;

	mutex_exit(&mac_srs->srs_lock);

	for (softring = head; softring != NULL; softring = next) {
		next = softring->s_ring_next;
		mac_soft_ring_free(softring);
	}
}

static void
mac_srs_add_glist(mac_soft_ring_set_t *mac_srs)
{
	ASSERT(mac_srs->srs_next == NULL && mac_srs->srs_prev == NULL);
	ASSERT(MAC_PERIM_HELD((mac_handle_t)mac_srs->srs_mcip->mci_mip));

	rw_enter(&mac_srs_g_lock, RW_WRITER);
	mutex_enter(&mac_srs->srs_lock);

	ASSERT((mac_srs->srs_state & SRS_IN_GLIST) == 0);

	if (mac_srs_g_list == NULL) {
		mac_srs_g_list = mac_srs;
	} else {
		mac_srs->srs_next = mac_srs_g_list;
		mac_srs_g_list->srs_prev = mac_srs;
		mac_srs->srs_prev = NULL;
		mac_srs_g_list = mac_srs;
	}
	mac_srs->srs_state |= SRS_IN_GLIST;

	mutex_exit(&mac_srs->srs_lock);
	rw_exit(&mac_srs_g_lock);
}

static void
mac_srs_remove_glist(mac_soft_ring_set_t *mac_srs)
{
	ASSERT(MAC_PERIM_HELD((mac_handle_t)mac_srs->srs_mcip->mci_mip));

	rw_enter(&mac_srs_g_lock, RW_WRITER);
	mutex_enter(&mac_srs->srs_lock);

	ASSERT((mac_srs->srs_state & SRS_IN_GLIST) != 0);

	if (mac_srs == mac_srs_g_list) {
		mac_srs_g_list = mac_srs->srs_next;
		if (mac_srs_g_list != NULL)
			mac_srs_g_list->srs_prev = NULL;
	} else {
		mac_srs->srs_prev->srs_next = mac_srs->srs_next;
		if (mac_srs->srs_next != NULL)
			mac_srs->srs_next->srs_prev = mac_srs->srs_prev;
	}
	mac_srs->srs_state &= ~SRS_IN_GLIST;

	mutex_exit(&mac_srs->srs_lock);
	rw_exit(&mac_srs_g_lock);
}

/* POLLING SETUP AND TEAR DOWN ROUTINES */

/*
 * mac_srs_client_poll_quiesce and mac_srs_client_poll_restart
 *
 * These routines are used to call back into the upper layer
 * (primarily TCP squeue) to stop polling the soft rings or
 * restart polling.
 */
void
mac_srs_client_poll_quiesce(mac_client_impl_t *mcip,
    mac_soft_ring_set_t *mac_srs)
{
	mac_soft_ring_t	*softring;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));

	if (!(mac_srs->srs_type & SRST_CLIENT_POLL_ENABLED)) {
		ASSERT(!(mac_srs->srs_type & SRST_DLS_BYPASS));
		return;
	}

	for (softring = mac_srs->srs_soft_ring_head;
	    softring != NULL; softring = softring->s_ring_next) {
		if ((softring->s_ring_type & ST_RING_TCP) &&
		    (softring->s_ring_rx_arg2 != NULL)) {
			mcip->mci_resource_quiesce(mcip->mci_resource_arg,
			    softring->s_ring_rx_arg2);
		}
	}
}

void
mac_srs_client_poll_restart(mac_client_impl_t *mcip,
    mac_soft_ring_set_t *mac_srs)
{
	mac_soft_ring_t	*softring;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));

	if (!(mac_srs->srs_type & SRST_CLIENT_POLL_ENABLED)) {
		ASSERT(!(mac_srs->srs_type & SRST_DLS_BYPASS));
		return;
	}

	for (softring = mac_srs->srs_soft_ring_head;
	    softring != NULL; softring = softring->s_ring_next) {
		if ((softring->s_ring_type & ST_RING_TCP) &&
		    (softring->s_ring_rx_arg2 != NULL)) {
			mcip->mci_resource_restart(mcip->mci_resource_arg,
			    softring->s_ring_rx_arg2);
		}
	}
}

/*
 * Register the given SRS and associated soft rings with the consumer and
 * enable the polling interface used by the consumer (i.e., IP) over this
 * SRS and its associated soft rings.
 */
void
mac_srs_client_poll_enable(mac_client_impl_t *mcip,
    mac_soft_ring_set_t *mac_srs)
{
	mac_rx_fifo_t		mrf;
	mac_soft_ring_t		*softring;

	ASSERT(mac_srs->srs_mcip == mcip);
	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));

	if (!(mcip->mci_state_flags & MCIS_CLIENT_POLL_CAPABLE))
		return;

	bzero(&mrf, sizeof (mac_rx_fifo_t));
	mrf.mrf_type = MAC_RX_FIFO;

	/*
	 * An SRS is capable of acting as a soft ring for cases
	 * where no fanout is needed. This is the case for userland
	 * flows.
	 */
	if (mac_srs->srs_type & SRST_NO_SOFT_RINGS)
		return;

	mrf.mrf_receive = (mac_receive_t)mac_soft_ring_poll;
	mrf.mrf_intr_enable = (mac_intr_enable_t)mac_soft_ring_intr_enable;
	mrf.mrf_intr_disable = (mac_intr_disable_t)mac_soft_ring_intr_disable;
	mac_srs->srs_type |= SRST_CLIENT_POLL_ENABLED;

	softring = mac_srs->srs_soft_ring_head;
	while (softring != NULL) {
		if (softring->s_ring_type & (ST_RING_TCP | ST_RING_UDP)) {
			/*
			 * TCP and UDP support DLS bypass. Squeue polling
			 * support implies DLS bypass since the squeue poll
			 * path does not have DLS processing.
			 */
			mac_soft_ring_dls_bypass(softring,
			    mcip->mci_direct_rx_fn, mcip->mci_direct_rx_arg);
		}
		/*
		 * Non-TCP protocols don't support squeues. Hence we don't
		 * make any ring addition callbacks for non-TCP rings.
		 */
		if (!(softring->s_ring_type & ST_RING_TCP)) {
			softring->s_ring_rx_arg2 = NULL;
			softring = softring->s_ring_next;
			continue;
		}
		mrf.mrf_rx_arg = softring;
		mrf.mrf_intr_handle = (mac_intr_handle_t)softring;
		mrf.mrf_cpu_id = softring->s_ring_cpuid;
		mrf.mrf_flow_priority = mac_srs->srs_pri;

		softring->s_ring_rx_arg2 = mcip->mci_resource_add(
		    mcip->mci_resource_arg, (mac_resource_t *)&mrf);

		softring = softring->s_ring_next;
	}
}

/*
 * Unregister the given SRS and associated soft rings with the consumer and
 * disable the polling interface used by the consumer (i.e., IP) over this
 * SRS and its associated soft rings.
 */
void
mac_srs_client_poll_disable(mac_client_impl_t *mcip,
    mac_soft_ring_set_t *mac_srs)
{
	mac_soft_ring_t		*softring;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));

	/*
	 * An SRS is capable of acting as a soft ring for cases
	 * where no protocol fanout is needed. This is the case
	 * for userland flows. Nothing to do here.
	 */
	if (mac_srs->srs_type & SRST_NO_SOFT_RINGS)
		return;

	mutex_enter(&mac_srs->srs_lock);
	if (!(mac_srs->srs_type & SRST_CLIENT_POLL_ENABLED)) {
		ASSERT(!(mac_srs->srs_type & SRST_DLS_BYPASS));
		mutex_exit(&mac_srs->srs_lock);
		return;
	}
	mac_srs->srs_type &= ~(SRST_CLIENT_POLL_ENABLED | SRST_DLS_BYPASS);
	mutex_exit(&mac_srs->srs_lock);

	/*
	 * DLS bypass is now disabled in the case of both TCP and UDP.
	 * Reset the soft ring callbacks to the standard 'mac_rx_deliver'
	 * callback. In addition, in the case of TCP, invoke IP's callback
	 * for ring removal.
	 */
	for (softring = mac_srs->srs_soft_ring_head;
	    softring != NULL; softring = softring->s_ring_next) {
		if (!(softring->s_ring_type & (ST_RING_UDP | ST_RING_TCP)))
			continue;

		if ((softring->s_ring_type & ST_RING_TCP) &&
		    softring->s_ring_rx_arg2 != NULL) {
			mcip->mci_resource_remove(mcip->mci_resource_arg,
			    softring->s_ring_rx_arg2);
		}

		mutex_enter(&softring->s_ring_lock);
		while (softring->s_ring_state & S_RING_PROC) {
			softring->s_ring_state |= S_RING_CLIENT_WAIT;
			cv_wait(&softring->s_ring_client_cv,
			    &softring->s_ring_lock);
		}
		softring->s_ring_state &= ~S_RING_CLIENT_WAIT;
		softring->s_ring_rx_arg2 = NULL;
		softring->s_ring_rx_func = mac_rx_deliver;
		softring->s_ring_rx_arg1 = mcip;
		mutex_exit(&softring->s_ring_lock);
	}
}

/*
 * Enable or disable the poll capability of the SRS on the underlying Rx ring.
 *
 * There is a need to enable or disable the poll capability of an SRS over an
 * Rx ring depending on the number of mac clients sharing the ring and also
 * whether user flows are configured on it. However, the poll state is
 * actively manipulated by the SRS worker and poll threads, and uncoordinated
 * changes to the underlying capability by yet another thread can surprise
 * them, leading to assert failures. Instead, we quiesce the SRS, make the
 * changes and then restart the SRS.
 */
static void
mac_srs_poll_state_change(mac_soft_ring_set_t *mac_srs,
    boolean_t turn_off_poll_capab, mac_rx_func_t rx_func)
{
	boolean_t	need_restart = B_FALSE;
	mac_srs_rx_t	*srs_rx = &mac_srs->srs_rx;
	mac_ring_t	*ring;

	if (!SRS_QUIESCED(mac_srs)) {
		mac_rx_srs_quiesce(mac_srs, SRS_QUIESCE);
		need_restart = B_TRUE;
	}

	ring = mac_srs->srs_ring;
	if ((ring != NULL) &&
	    (ring->mr_classify_type == MAC_HW_CLASSIFIER)) {
		if (turn_off_poll_capab)
			mac_srs->srs_state &= ~SRS_POLLING_CAPAB;
		else if (mac_poll_enable)
			mac_srs->srs_state |= SRS_POLLING_CAPAB;
	}
	srs_rx->sr_lower_proc = rx_func;

	if (need_restart)
		mac_rx_srs_restart(mac_srs);
}

/* CPU RECONFIGURATION AND FANOUT COMPUTATION ROUTINES */

/*
 * Return the next CPU to be used to bind a MAC kernel thread.
 * If a cpupart is specified, the cpu chosen must be from that
 * cpu partition.
 */
static processorid_t
mac_next_bind_cpu(cpupart_t *cpupart)
{
	static cpu_t		*cp = NULL;
	cpu_t			*cp_start;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cp == NULL)
		cp = cpu_list;

	cp = cp->cpu_next_onln;
	cp_start = cp;

	do {
		if ((cpupart == NULL) || (cp->cpu_part == cpupart))
			return (cp->cpu_id);

	} while ((cp = cp->cpu_next_onln) != cp_start);

	/* No online CPU found in the requested partition. */
	return (-1);
}
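
/*
 * Illustrative behavior: because the walk advances before checking,
 * successive calls round-robin over the online CPUs. E.g., assuming
 * cpu_list starts at CPU 0 and CPUs 0-3 are online, repeated calls
 * with cpupart == NULL return 1, 2, 3, 0, 1, ... (the static 'cp'
 * preserves the walk position across calls).
 */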

/* ARGSUSED */
static int
mac_srs_cpu_setup(cpu_setup_t what, int id, void *arg)
{
	ASSERT(MUTEX_HELD(&cpu_lock));
	switch (what) {
	case CPU_CONFIG:
	case CPU_ON:
	case CPU_CPUPART_IN:
		mac_walk_srs_and_bind(id);
		break;

	case CPU_UNCONFIG:
	case CPU_OFF:
	case CPU_CPUPART_OUT:
		mac_walk_srs_and_unbind(id);
		break;

	default:
		break;
	}
	return (0);
}

/*
 * mac_compute_soft_ring_count():
 *
 * This routine computes the number of soft rings needed to handle incoming
 * load given a flow_entry.
 *
 * The routine does the following:
 * 1) soft rings will be created if mac_soft_ring_enable is set.
 * 2) If the underlying link is a 10Gbps link, then soft rings will be
 * created even if mac_soft_ring_enable is not set. The number of soft
 * rings so created will equal mac_rx_soft_ring_10gig_count.
 * 3) On a sun4v platform (i.e., mac_soft_ring_enable is set), 2 times the
 * mac_rx_soft_ring_10gig_count number of soft rings will be created for a
 * 10Gbps link.
 *
 * If a bandwidth limit is specified, the number that gets computed is
 * dependent upon CPU speed, the number of Rx rings configured, and
 * the bandwidth limit. The more Rx rings available, the fewer soft
 * rings needed.
 *
 * mac_use_bw_heuristic is another "hidden" variable that can be used to
 * override the default soft ring count computation. Depending upon
 * its usefulness, mac_use_bw_heuristic can later be made into a
 * data-link property or removed altogether.
 *
 * TODO: Cleanup and tighten some of the assumptions.
 */
boolean_t mac_use_bw_heuristic = B_TRUE;
static int
mac_compute_soft_ring_count(flow_entry_t *flent, int rx_srs_cnt, int maxcpus)
{
	uint64_t cpu_speed, bw = 0;
	int srings = 0;
	boolean_t bw_enabled = B_FALSE;

	ASSERT(!(flent->fe_type & FLOW_USER));
	if (flent->fe_resource_props.mrp_mask & MRP_MAXBW &&
	    mac_use_bw_heuristic) {
		/* bandwidth enabled */
		bw_enabled = B_TRUE;
		bw = flent->fe_resource_props.mrp_maxbw;
	}
	if (!bw_enabled) {
		/* No bandwidth enabled */
		if (mac_soft_ring_enable)
			srings = mac_rx_soft_ring_count;

		/* Is this a 10Gig link? */
		flent->fe_nic_speed = mac_client_stat_get(flent->fe_mcip,
		    MAC_STAT_IFSPEED);
		/* convert to Mbps */
		if (((flent->fe_nic_speed)/1000000) > 1000 &&
		    mac_rx_soft_ring_10gig_count > 0) {
			/* This is a 10Gig link */
			srings = mac_rx_soft_ring_10gig_count;
			/*
			 * Use 2 times mac_rx_soft_ring_10gig_count for
			 * sun4v systems.
			 */
			if (mac_soft_ring_enable)
				srings = srings * 2;
		}
	} else {
		/*
		 * Soft ring computation using CPU speed and specified
		 * bandwidth limit.
		 */
		/* Assumption: all CPUs have the same frequency */
		cpu_speed = (uint64_t)CPU->cpu_type_info.pi_clock;

		/* cpu_speed is in MHz; make bw in units of Mbps. */
		bw = bw/1000000;

		if (bw >= 1000) {
			/*
			 * bw is greater than or equal to 1Gbps.
			 * The number of soft rings required is a function
			 * of bandwidth and CPU speed. To keep this simple,
			 * let's use this rule: a 1GHz CPU can handle 1Gbps.
			 * If bw is less than 1 Gbps, then there is no need
			 * for soft rings. The assumption is that CPU speeds
			 * (on modern systems) are at least 1GHz.
			 */
			srings = bw/cpu_speed;
			if (srings <= 1 && mac_soft_ring_enable) {
				/*
				 * Give at least 2 soft rings
				 * for sun4v systems.
				 */
				srings = 2;
			}
		}
	}
	/*
	 * If the flent has multiple Rx SRSes, then each SRS need not
	 * have that many soft rings on top of it. The number of
	 * soft rings for each Rx SRS is found by dividing srings by
	 * rx_srs_cnt.
	 */
	if (rx_srs_cnt > 1) {
		int remainder;

		remainder = srings%rx_srs_cnt;
		srings = srings/rx_srs_cnt;
		if (remainder != 0)
			srings++;
		/*
		 * Fanning out to 1 soft ring is not very useful.
		 * Set it to 0 as well; mac_srs_fanout_init()
		 * will take care of creating a single soft ring
		 * for proto fanout.
		 */
		if (srings == 1)
			srings = 0;
	}
	/* Do some more massaging */
	srings = min(srings, maxcpus);
	srings = min(srings, MAX_SR_FANOUT);
	return (srings);
}
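
/*
 * Worked example (illustrative): with a 10 Gbps bandwidth limit
 * (bw == 10000 Mbps after conversion) on 2 GHz CPUs (cpu_speed ==
 * 2000), srings = 10000/2000 = 5. If the flent has 4 h/w Rx SRSes
 * (rx_srs_cnt == 4), each SRS gets ceil(5/4) = 2 soft rings, subject
 * to the maxcpus and MAX_SR_FANOUT clamps above.
 */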

/*
 * mac_tx_cpu_init:
 * set up CPUs for Tx interrupt re-targeting and Tx worker
 * thread binding
 */
static void
mac_tx_cpu_init(flow_entry_t *flent, mac_resource_props_t *mrp,
    cpupart_t *cpupart)
{
	mac_soft_ring_set_t *tx_srs = flent->fe_tx_srs;
	mac_srs_tx_t *srs_tx = &tx_srs->srs_tx;
	mac_cpus_t *srs_cpu = &tx_srs->srs_cpu;
	mac_soft_ring_t *sringp;
	mac_ring_t *ring;
	processorid_t worker_cpuid;
	boolean_t retargetable_client = B_FALSE;
	int i, j;

	if (RETARGETABLE_CLIENT((mac_group_t *)flent->fe_tx_ring_group,
	    flent->fe_mcip)) {
		retargetable_client = B_TRUE;
	}

	if (MAC_TX_SOFT_RINGS(tx_srs)) {
		if (mrp != NULL)
			j = mrp->mrp_ncpus - 1;
		for (i = 0; i < tx_srs->srs_tx_ring_count; i++) {
			if (mrp != NULL) {
				if (j < 0)
					j = mrp->mrp_ncpus - 1;
				worker_cpuid = mrp->mrp_cpu[j];
			} else {
				/*
				 * Bind interrupt to the next CPU available
				 * and leave the worker unbound.
				 */
				worker_cpuid = -1;
			}
			sringp = tx_srs->srs_tx_soft_rings[i];
			ring = (mac_ring_t *)sringp->s_ring_tx_arg2;
			srs_cpu->mc_tx_fanout_cpus[i] = worker_cpuid;
			if (MAC_RING_RETARGETABLE(ring) &&
			    retargetable_client) {
				mutex_enter(&cpu_lock);
				srs_cpu->mc_tx_intr_cpu[i] =
				    (mrp != NULL) ? mrp->mrp_cpu[j] :
				    (mac_tx_intr_retarget ?
				    mac_next_bind_cpu(cpupart) : -1);
				mutex_exit(&cpu_lock);
			} else {
				srs_cpu->mc_tx_intr_cpu[i] = -1;
			}
			if (mrp != NULL)
				j--;
		}
	} else {
		/* Tx mac_ring_handle_t is stored in st_arg2 */
		srs_cpu->mc_tx_fanout_cpus[0] =
		    (mrp != NULL) ? mrp->mrp_cpu[mrp->mrp_ncpus - 1] : -1;
		ring = (mac_ring_t *)srs_tx->st_arg2;
		if (MAC_RING_RETARGETABLE(ring) && retargetable_client) {
			mutex_enter(&cpu_lock);
			srs_cpu->mc_tx_intr_cpu[0] = (mrp != NULL) ?
			    mrp->mrp_cpu[mrp->mrp_ncpus - 1] :
			    (mac_tx_intr_retarget ?
			    mac_next_bind_cpu(cpupart) : -1);
			mutex_exit(&cpu_lock);
		} else {
			srs_cpu->mc_tx_intr_cpu[0] = -1;
		}
	}
}

/*
 * Assignment of user-specified CPUs to a link.
 *
 * Minimum CPUs required to get an optimal assignment:
 * For each Rx SRS, at least two CPUs are needed if the
 * mac_latency_optimize flag is set -- one for polling, one for the
 * fanout soft ring. If mac_latency_optimize is not set, then 3 CPUs
 * are needed -- one for polling, one for the SRS worker thread and
 * one for the fanout soft ring.
 *
 * The number of CPUs needed for the Tx side equals the number of Tx
 * rings the link is using.
 *
 * mac_flow_user_cpu_init() categorizes the CPU assignment into 3
 * different buckets depending upon the number of CPUs.
 *
 * In the first bucket, the most optimal case is handled. The user has
 * passed enough CPUs and every thread gets its own CPU.
 *
 * The second and third are the sub-optimal cases, where not enough
 * CPUs are available.
 *
 * The second bucket handles the case where at least one distinct CPU
 * is available for each of the Rx rings (Rx SRSes) and Tx rings (Tx
 * SRS or soft rings).
 *
 * In the third case (worst case scenario), the specified CPU count is
 * less than the number of Rx rings configured for the link. In this
 * case, we round-robin the CPUs among the Rx SRSes and Tx SRS/soft rings.
 */
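/*
 * Worked example (illustrative): a link with 2 h/w Rx rings has
 * fe_rx_srs_cnt == 3, so rx_srs_cnt == 2. With mac_latency_optimize
 * clear, reqd_rx_cpu_cnt = 2 * 3 = 6; with 2 Tx rings (assuming the
 * Tx SRS uses soft rings), reqd_tx_cpu_cnt = 2. Supplying 8 or more
 * CPUs therefore lands in the first bucket, 4-7 CPUs land in the
 * second (rx_srs_cnt + reqd_tx_cpu_cnt = 4), and fewer than 4 fall
 * through to the round-robin case.
 */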
static void
mac_flow_user_cpu_init(flow_entry_t *flent, mac_resource_props_t *mrp)
{
	mac_soft_ring_set_t *rx_srs, *tx_srs;
	int i, srs_cnt;
	mac_cpus_t *srs_cpu;
	int no_of_cpus, cpu_cnt;
	int rx_srs_cnt, reqd_rx_cpu_cnt;
	int fanout_cpu_cnt, reqd_tx_cpu_cnt;
	int reqd_poll_worker_cnt, fanout_cnt_per_srs;
	mac_resource_props_t *emrp = &flent->fe_effective_props;

	ASSERT(mrp->mrp_fanout_mode == MCM_CPUS);
	/*
	 * The check for mrp_ncpus to be within limits for
	 * the user specified case was done earlier and if
	 * not within limits, an error would have been
	 * returned to the user.
	 */
	ASSERT(mrp->mrp_ncpus > 0 && mrp->mrp_ncpus <= MAX_SR_FANOUT);

	no_of_cpus = mrp->mrp_ncpus;

	if (mrp->mrp_rx_intr_cpu != -1) {
		/*
		 * The interrupt has been re-targeted. The poll
		 * thread needs to be bound to the interrupt CPU.
		 *
		 * Find where in the list the intr CPU is and
		 * swap it with the first one.
		 * We will be using the first CPU in the
		 * list for poll.
		 */
		for (i = 0; i < no_of_cpus; i++) {
			if (mrp->mrp_cpu[i] == mrp->mrp_rx_intr_cpu)
				break;
		}
		mrp->mrp_cpu[i] = mrp->mrp_cpu[0];
		mrp->mrp_cpu[0] = mrp->mrp_rx_intr_cpu;
	}

	/*
	 * Requirements:
	 * The number of CPUs that each Rx ring needs is dependent
	 * upon the mac_latency_optimize flag.
	 * 1) If set, at least 2 CPUs are needed -- one for
	 * polling, one for the fanout soft ring.
	 * 2) If not set, then at least 3 CPUs are needed -- one
	 * for polling, one for the srs worker thread, and one for
	 * the fanout soft ring.
	 */
	rx_srs_cnt = (flent->fe_rx_srs_cnt > 1) ?
	    (flent->fe_rx_srs_cnt - 1) : flent->fe_rx_srs_cnt;
	reqd_rx_cpu_cnt = mac_latency_optimize ?
	    (rx_srs_cnt * 2) : (rx_srs_cnt * 3);

	/* How many CPUs are needed for the Tx side? */
	tx_srs = flent->fe_tx_srs;
	reqd_tx_cpu_cnt = MAC_TX_SOFT_RINGS(tx_srs) ?
	    tx_srs->srs_tx_ring_count : 1;

	/* CPUs needed for Rx SRSes poll and worker threads */
	reqd_poll_worker_cnt = mac_latency_optimize ?
	    rx_srs_cnt : rx_srs_cnt * 2;

	/* Has the user provided enough CPUs? */
	if (no_of_cpus >= (reqd_rx_cpu_cnt + reqd_tx_cpu_cnt)) {
		/*
		 * Best case scenario. There are enough CPUs. All
		 * Rx rings will get their own set of CPUs plus
		 * Tx soft rings will get their own.
		 */
		/*
		 * fanout_cpu_cnt is the number of CPUs available
		 * for Rx side fanout soft rings.
		 */
		fanout_cpu_cnt = no_of_cpus -
		    reqd_poll_worker_cnt - reqd_tx_cpu_cnt;

		/*
		 * Divide fanout_cpu_cnt by rx_srs_cnt to find
		 * out how many fanout soft rings each Rx SRS
		 * can have.
		 */
		fanout_cnt_per_srs = fanout_cpu_cnt/rx_srs_cnt;

		/* Do the assignment for the default Rx ring */
		cpu_cnt = 0;
		rx_srs = flent->fe_rx_srs[0];
		ASSERT(rx_srs->srs_ring == NULL);
		if (rx_srs->srs_fanout_state == SRS_FANOUT_INIT)
			rx_srs->srs_fanout_state = SRS_FANOUT_REINIT;
		srs_cpu = &rx_srs->srs_cpu;
		srs_cpu->mc_ncpus = no_of_cpus;
		bcopy(mrp->mrp_cpu,
		    srs_cpu->mc_cpus, sizeof (srs_cpu->mc_cpus));
		srs_cpu->mc_rx_fanout_cnt = fanout_cnt_per_srs;
		srs_cpu->mc_rx_pollid = mrp->mrp_cpu[cpu_cnt++];
		/* Retarget the interrupt to the same CPU as the poll */
		srs_cpu->mc_rx_intr_cpu = srs_cpu->mc_rx_pollid;
		srs_cpu->mc_rx_workerid = (mac_latency_optimize ?
		    srs_cpu->mc_rx_pollid : mrp->mrp_cpu[cpu_cnt++]);
		for (i = 0; i < fanout_cnt_per_srs; i++)
			srs_cpu->mc_rx_fanout_cpus[i] = mrp->mrp_cpu[cpu_cnt++];

		/* Do the assignment for h/w Rx SRSes */
		if (flent->fe_rx_srs_cnt > 1) {
			cpu_cnt = 0;
			for (srs_cnt = 1;
			    srs_cnt < flent->fe_rx_srs_cnt; srs_cnt++) {
				rx_srs = flent->fe_rx_srs[srs_cnt];
				ASSERT(rx_srs->srs_ring != NULL);
				if (rx_srs->srs_fanout_state ==
				    SRS_FANOUT_INIT) {
					rx_srs->srs_fanout_state =
					    SRS_FANOUT_REINIT;
				}
				srs_cpu = &rx_srs->srs_cpu;
				srs_cpu->mc_ncpus = no_of_cpus;
				bcopy(mrp->mrp_cpu, srs_cpu->mc_cpus,
				    sizeof (srs_cpu->mc_cpus));
				srs_cpu->mc_rx_fanout_cnt = fanout_cnt_per_srs;
				/* The first CPU in the list is the intr CPU */
				srs_cpu->mc_rx_pollid = mrp->mrp_cpu[cpu_cnt++];
				srs_cpu->mc_rx_intr_cpu = srs_cpu->mc_rx_pollid;
				srs_cpu->mc_rx_workerid =
				    (mac_latency_optimize ?
				    srs_cpu->mc_rx_pollid :
				    mrp->mrp_cpu[cpu_cnt++]);
				for (i = 0; i < fanout_cnt_per_srs; i++) {
					srs_cpu->mc_rx_fanout_cpus[i] =
					    mrp->mrp_cpu[cpu_cnt++];
				}
				ASSERT(cpu_cnt <= no_of_cpus);
			}
		}
		goto tx_cpu_init;
	}

	/*
	 * Sub-optimal case.
	 * We have the following information:
	 * no_of_cpus - no. of cpus that the user passed.
	 * rx_srs_cnt - no. of rx rings.
	 * reqd_rx_cpu_cnt = mac_latency_optimize?rx_srs_cnt*2:rx_srs_cnt*3
	 * reqd_tx_cpu_cnt - no. of cpus reqd. for the Tx side.
	 * reqd_poll_worker_cnt = mac_latency_optimize?rx_srs_cnt:rx_srs_cnt*2
	 */
	/*
	 * If we bind the Rx fanout soft rings to the same CPUs
	 * as poll/worker, would that be enough?
	 */
	if (no_of_cpus >= (rx_srs_cnt + reqd_tx_cpu_cnt)) {
		boolean_t worker_assign = B_FALSE;

		/*
		 * If mac_latency_optimize is not set, are there
		 * enough CPUs to assign a CPU for the worker as well?
		 */
		if (no_of_cpus >= (reqd_poll_worker_cnt + reqd_tx_cpu_cnt))
			worker_assign = B_TRUE;
		/*
		 * The zeroth Rx SRS is the default Rx ring. It is not
		 * associated with an h/w Rx ring.
		 */
		rx_srs = flent->fe_rx_srs[0];
		ASSERT(rx_srs->srs_ring == NULL);
		if (rx_srs->srs_fanout_state == SRS_FANOUT_INIT)
			rx_srs->srs_fanout_state = SRS_FANOUT_REINIT;
		cpu_cnt = 0;
		srs_cpu = &rx_srs->srs_cpu;
		srs_cpu->mc_ncpus = no_of_cpus;
		bcopy(mrp->mrp_cpu,
		    srs_cpu->mc_cpus, sizeof (srs_cpu->mc_cpus));
		srs_cpu->mc_rx_fanout_cnt = 1;
		srs_cpu->mc_rx_pollid = mrp->mrp_cpu[cpu_cnt++];
		/* Retarget the interrupt to the same CPU as the poll */
		srs_cpu->mc_rx_intr_cpu = srs_cpu->mc_rx_pollid;
		srs_cpu->mc_rx_workerid =
		    ((!mac_latency_optimize && worker_assign) ?
		    mrp->mrp_cpu[cpu_cnt++] : srs_cpu->mc_rx_pollid);

		srs_cpu->mc_rx_fanout_cpus[0] = mrp->mrp_cpu[cpu_cnt];

		/* Do CPU bindings for SRSes having h/w Rx rings */
		if (flent->fe_rx_srs_cnt > 1) {
			cpu_cnt = 0;
			for (srs_cnt = 1;
			    srs_cnt < flent->fe_rx_srs_cnt; srs_cnt++) {
				rx_srs = flent->fe_rx_srs[srs_cnt];
				ASSERT(rx_srs->srs_ring != NULL);
				if (rx_srs->srs_fanout_state ==
				    SRS_FANOUT_INIT) {
					rx_srs->srs_fanout_state =
					    SRS_FANOUT_REINIT;
				}
				srs_cpu = &rx_srs->srs_cpu;
				srs_cpu->mc_ncpus = no_of_cpus;
				bcopy(mrp->mrp_cpu, srs_cpu->mc_cpus,
				    sizeof (srs_cpu->mc_cpus));
				srs_cpu->mc_rx_pollid =
				    mrp->mrp_cpu[cpu_cnt];
				srs_cpu->mc_rx_intr_cpu = srs_cpu->mc_rx_pollid;
				srs_cpu->mc_rx_workerid =
				    ((!mac_latency_optimize && worker_assign) ?
				    mrp->mrp_cpu[++cpu_cnt] :
				    srs_cpu->mc_rx_pollid);
				srs_cpu->mc_rx_fanout_cnt = 1;
				srs_cpu->mc_rx_fanout_cpus[0] =
				    mrp->mrp_cpu[cpu_cnt];
				cpu_cnt++;
				ASSERT(cpu_cnt <= no_of_cpus);
			}
		}
		goto tx_cpu_init;
	}

	/*
	 * Really sub-optimal case. Not enough CPUs for poll and
	 * Tx soft rings. Do a round-robin assignment where
	 * each Rx SRS will get the same CPU for poll, worker
	 * and fanout soft ring.
	 */
	cpu_cnt = 0;
	for (srs_cnt = 0; srs_cnt < flent->fe_rx_srs_cnt; srs_cnt++) {
		rx_srs = flent->fe_rx_srs[srs_cnt];
		srs_cpu = &rx_srs->srs_cpu;
		if (rx_srs->srs_fanout_state == SRS_FANOUT_INIT)
			rx_srs->srs_fanout_state = SRS_FANOUT_REINIT;
		srs_cpu->mc_ncpus = no_of_cpus;
		bcopy(mrp->mrp_cpu,
		    srs_cpu->mc_cpus, sizeof (srs_cpu->mc_cpus));
		srs_cpu->mc_rx_fanout_cnt = 1;
		srs_cpu->mc_rx_pollid = mrp->mrp_cpu[cpu_cnt];
		/* Retarget the interrupt to the same CPU as the poll */
		srs_cpu->mc_rx_intr_cpu = srs_cpu->mc_rx_pollid;
		srs_cpu->mc_rx_workerid = mrp->mrp_cpu[cpu_cnt];
		srs_cpu->mc_rx_fanout_cpus[0] = mrp->mrp_cpu[cpu_cnt];
		if (++cpu_cnt >= no_of_cpus)
			cpu_cnt = 0;
	}

tx_cpu_init:
	mac_tx_cpu_init(flent, mrp, NULL);

	/*
	 * Copy the user specified CPUs to the effective CPUs
	 */
	for (i = 0; i < mrp->mrp_ncpus; i++) {
		emrp->mrp_cpu[i] = mrp->mrp_cpu[i];
	}
	emrp->mrp_ncpus = mrp->mrp_ncpus;
	emrp->mrp_mask = mrp->mrp_mask;
	bzero(emrp->mrp_pool, MAXPATHLEN);
}

/*
 * mac_flow_cpu_init():
 *
 * Each SRS has a mac_cpus_t structure, srs_cpu. This routine fills in
 * the CPU binding information in srs_cpu for all Rx SRSes associated
 * with a flent.
 */
static void
mac_flow_cpu_init(flow_entry_t *flent, cpupart_t *cpupart)
{
	mac_soft_ring_set_t *rx_srs;
	processorid_t cpuid;
	int i, j, k, srs_cnt, nscpus, maxcpus, soft_ring_cnt = 0;
	mac_cpus_t *srs_cpu;
	mac_resource_props_t *emrp = &flent->fe_effective_props;
	uint32_t cpus[MRP_NCPUS];

	/*
	 * The maximum number of CPUs available can either be
	 * the number of CPUs in the pool or the number of CPUs
	 * in the system.
	 */
	maxcpus = (cpupart != NULL) ? cpupart->cp_ncpus : ncpus;

	/*
	 * Compute the number of soft rings needed on top of each Rx
	 * SRS. "rx_srs_cnt - 1" indicates the number of Rx SRSes
	 * associated with h/w Rx rings. The soft ring count needed for
	 * each h/w Rx SRS is computed, and the same is applied to the
	 * software-classified Rx SRS. The first Rx SRS in fe_rx_srs[]
	 * is the software-classified Rx SRS.
	 */
	soft_ring_cnt = mac_compute_soft_ring_count(flent,
	    flent->fe_rx_srs_cnt - 1, maxcpus);
	if (soft_ring_cnt == 0) {
		/*
		 * Even when soft_ring_cnt is 0, we still need
		 * to create a soft ring for TCP, UDP and
		 * OTHER. So set it to 1.
		 */
		soft_ring_cnt = 1;
	}
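	/*
	 * Illustrative example: a flent with 3 h/w Rx rings has
	 * fe_rx_srs_cnt == 4 (three h/w SRSes plus the software-classified
	 * SRS at index 0), so the soft ring count is computed for the 3
	 * h/w SRSes and then applied uniformly to all four below.
	 */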
	for (srs_cnt = 0; srs_cnt < flent->fe_rx_srs_cnt; srs_cnt++) {
		rx_srs = flent->fe_rx_srs[srs_cnt];
		srs_cpu = &rx_srs->srs_cpu;
		if (rx_srs->srs_fanout_state == SRS_FANOUT_INIT)
			rx_srs->srs_fanout_state = SRS_FANOUT_REINIT;
		srs_cpu->mc_ncpus = soft_ring_cnt;
		srs_cpu->mc_rx_fanout_cnt = soft_ring_cnt;
		mutex_enter(&cpu_lock);
		for (j = 0; j < soft_ring_cnt; j++) {
			cpuid = mac_next_bind_cpu(cpupart);
			srs_cpu->mc_cpus[j] = cpuid;
			srs_cpu->mc_rx_fanout_cpus[j] = cpuid;
		}
		cpuid = mac_next_bind_cpu(cpupart);
		srs_cpu->mc_rx_pollid = cpuid;
		srs_cpu->mc_rx_intr_cpu = (mac_rx_intr_retarget ?
		    srs_cpu->mc_rx_pollid : -1);
		/* increment ncpus to account for polling cpu */
		srs_cpu->mc_ncpus++;
		srs_cpu->mc_cpus[j++] = cpuid;
		if (!mac_latency_optimize) {
			cpuid = mac_next_bind_cpu(cpupart);
			srs_cpu->mc_ncpus++;
			srs_cpu->mc_cpus[j++] = cpuid;
		}
		srs_cpu->mc_rx_workerid = cpuid;
		mutex_exit(&cpu_lock);
	}

	nscpus = 0;
	for (srs_cnt = 0; srs_cnt < flent->fe_rx_srs_cnt; srs_cnt++) {
		rx_srs = flent->fe_rx_srs[srs_cnt];
		srs_cpu = &rx_srs->srs_cpu;
		for (j = 0; j < srs_cpu->mc_ncpus; j++) {
			cpus[nscpus++] = srs_cpu->mc_cpus[j];
		}
	}

	/*
	 * Copy cpu list to fe_effective_props
	 * without duplicates.
	 */
	k = 0;
	for (i = 0; i < nscpus; i++) {
		for (j = 0; j < k; j++) {
			if (emrp->mrp_cpu[j] == cpus[i])
				break;
		}
		if (j == k)
			emrp->mrp_cpu[k++] = cpus[i];
	}
	emrp->mrp_ncpus = k;

	mac_tx_cpu_init(flent, NULL, cpupart);
}

/*
 * DATAPATH SETUP ROUTINES
 * (setup SRS and set/update FANOUT, B/W and PRIORITY)
 */

/*
 * mac_srs_fanout_list_alloc:
 *
 * The underlying device can expose up to MAX_RINGS_PER_GROUP worth of
 * rings to a client. In such a case, MAX_RINGS_PER_GROUP worth of
 * array space is needed to store Tx soft rings. Thus we allocate that
 * much array space for srs_tx_soft_rings.
 *
 * And when it is an aggr, again we allocate MAX_RINGS_PER_GROUP worth
 * of space to st_soft_rings. This array is used for quick access to
 * the soft ring associated with a pseudo Tx ring based on the pseudo
 * ring's index (mr_index).
 */
static void
mac_srs_fanout_list_alloc(mac_soft_ring_set_t *mac_srs)
{
	mac_client_impl_t *mcip = mac_srs->srs_mcip;

	if (mac_srs->srs_type & SRST_TX) {
		mac_srs->srs_tx_soft_rings = (mac_soft_ring_t **)
		    kmem_zalloc(sizeof (mac_soft_ring_t *) *
		    MAX_RINGS_PER_GROUP, KM_SLEEP);
		if (mcip->mci_state_flags & MCIS_IS_AGGR) {
			mac_srs_tx_t *tx = &mac_srs->srs_tx;

			tx->st_soft_rings = (mac_soft_ring_t **)
			    kmem_zalloc(sizeof (mac_soft_ring_t *) *
			    MAX_RINGS_PER_GROUP, KM_SLEEP);
		}
	} else {
		mac_srs->srs_tcp_soft_rings = (mac_soft_ring_t **)
		    kmem_zalloc(sizeof (mac_soft_ring_t *) * MAX_SR_FANOUT,
		    KM_SLEEP);
		mac_srs->srs_udp_soft_rings = (mac_soft_ring_t **)
		    kmem_zalloc(sizeof (mac_soft_ring_t *) * MAX_SR_FANOUT,
		    KM_SLEEP);
		mac_srs->srs_oth_soft_rings = (mac_soft_ring_t **)
		    kmem_zalloc(sizeof (mac_soft_ring_t *) * MAX_SR_FANOUT,
		    KM_SLEEP);
	}
}

static void
mac_srs_worker_bind(mac_soft_ring_set_t *mac_srs, processorid_t cpuid)
{
	cpu_t *cp;
	boolean_t clear = B_FALSE;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (!mac_srs_thread_bind)
		return;

	cp = cpu_get(cpuid);
	if (cp == NULL || !cpu_is_online(cp))
		return;

	mutex_enter(&mac_srs->srs_lock);
	mac_srs->srs_state |= SRS_WORKER_BOUND;
	if (mac_srs->srs_worker_cpuid != -1)
		clear = B_TRUE;
	mac_srs->srs_worker_cpuid = cpuid;
	mutex_exit(&mac_srs->srs_lock);

	if (clear)
		thread_affinity_clear(mac_srs->srs_worker);

	thread_affinity_set(mac_srs->srs_worker, cpuid);
	DTRACE_PROBE1(worker__CPU, processorid_t, cpuid);
}

static void
mac_srs_poll_bind(mac_soft_ring_set_t *mac_srs, processorid_t cpuid)
{
	cpu_t *cp;
	boolean_t clear = B_FALSE;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (!mac_srs_thread_bind || mac_srs->srs_poll_thr == NULL)
		return;

	cp = cpu_get(cpuid);
	if (cp == NULL || !cpu_is_online(cp))
		return;

	mutex_enter(&mac_srs->srs_lock);
	mac_srs->srs_state |= SRS_POLL_BOUND;
	if (mac_srs->srs_poll_cpuid != -1)
		clear = B_TRUE;
	mac_srs->srs_poll_cpuid = cpuid;
	mutex_exit(&mac_srs->srs_lock);

	if (clear)
		thread_affinity_clear(mac_srs->srs_poll_thr);

	thread_affinity_set(mac_srs->srs_poll_thr, cpuid);
	DTRACE_PROBE1(poll__CPU, processorid_t, cpuid);
}

/*
 * Re-target the interrupt to the passed CPU. If the re-target is
 * successful, set mc_rx_intr_cpu to the re-targeted CPU. Otherwise
 * set it to -1.
 */
void
mac_rx_srs_retarget_intr(mac_soft_ring_set_t *mac_srs, processorid_t cpuid)
{
	cpu_t *cp;
	mac_ring_t *ring = mac_srs->srs_ring;
	mac_intr_t *mintr = &ring->mr_info.mri_intr;
	flow_entry_t *flent = mac_srs->srs_flent;
	boolean_t primary = mac_is_primary_client(mac_srs->srs_mcip);

	ASSERT(MUTEX_HELD(&cpu_lock));

	/*
	 * Don't re-target the interrupt for these cases:
	 * 1) ring is NULL
	 * 2) the interrupt is shared (mi_ddi_shared)
	 * 3) ddi_handle is NULL and !primary
	 * 4) primary, ddi_handle is NULL but fe_rx_srs_cnt > 2
	 * Cases 3 & 4 exist because of the mac_client_intr_cpu() routine.
	 * That routine re-targets the fixed interrupt for the primary
	 * mac client if the client has only one ring. In that
	 * case, mc_rx_intr_cpu will already have the correct value.
	 */
	if (ring == NULL || mintr->mi_ddi_shared || cpuid == -1 ||
	    (mintr->mi_ddi_handle == NULL && !primary) || (primary &&
	    mintr->mi_ddi_handle == NULL && flent->fe_rx_srs_cnt > 2)) {
		mac_srs->srs_cpu.mc_rx_intr_cpu = -1;
		return;
	}

	if (mintr->mi_ddi_handle == NULL)
		return;

	cp = cpu_get(cpuid);
	if (cp == NULL || !cpu_is_online(cp))
		return;

	/* Drop the cpu_lock as ddi_intr_set_affinity() holds it */
	mutex_exit(&cpu_lock);
	if (ddi_intr_set_affinity(mintr->mi_ddi_handle, cpuid) == DDI_SUCCESS)
		mac_srs->srs_cpu.mc_rx_intr_cpu = cpuid;
	else
		mac_srs->srs_cpu.mc_rx_intr_cpu = -1;
	mutex_enter(&cpu_lock);
}

/*
 * Re-target Tx interrupts
 */
void
mac_tx_srs_retarget_intr(mac_soft_ring_set_t *mac_srs)
{
	cpu_t *cp;
	mac_ring_t *ring;
	mac_intr_t *mintr;
	mac_soft_ring_t *sringp;
	mac_srs_tx_t *srs_tx;
	mac_cpus_t *srs_cpu;
	processorid_t cpuid;
	int i;

	ASSERT(MUTEX_HELD(&cpu_lock));

	srs_cpu = &mac_srs->srs_cpu;
	if (MAC_TX_SOFT_RINGS(mac_srs)) {
		for (i = 0; i < mac_srs->srs_tx_ring_count; i++) {
			sringp = mac_srs->srs_tx_soft_rings[i];
			ring = (mac_ring_t *)sringp->s_ring_tx_arg2;
			cpuid = srs_cpu->mc_tx_intr_cpu[i];
			cp = cpu_get(cpuid);
			if (cp == NULL || !cpu_is_online(cp) ||
			    !MAC_RING_RETARGETABLE(ring)) {
				srs_cpu->mc_tx_retargeted_cpu[i] = -1;
				continue;
			}
			mintr = &ring->mr_info.mri_intr;
			/*
			 * Drop the cpu_lock as ddi_intr_set_affinity()
			 * holds it
			 */
			mutex_exit(&cpu_lock);
			if (ddi_intr_set_affinity(mintr->mi_ddi_handle,
			    cpuid) == DDI_SUCCESS) {
				srs_cpu->mc_tx_retargeted_cpu[i] = cpuid;
			} else {
				srs_cpu->mc_tx_retargeted_cpu[i] = -1;
			}
			mutex_enter(&cpu_lock);
		}
	} else {
		cpuid = srs_cpu->mc_tx_intr_cpu[0];
		cp = cpu_get(cpuid);
		if (cp == NULL || !cpu_is_online(cp)) {
			srs_cpu->mc_tx_retargeted_cpu[0] = -1;
			return;
		}
		srs_tx = &mac_srs->srs_tx;
		ring = (mac_ring_t *)srs_tx->st_arg2;
		if (MAC_RING_RETARGETABLE(ring)) {
			mintr = &ring->mr_info.mri_intr;
			mutex_exit(&cpu_lock);
			if ((ddi_intr_set_affinity(mintr->mi_ddi_handle,
			    cpuid) == DDI_SUCCESS)) {
				srs_cpu->mc_tx_retargeted_cpu[0] = cpuid;
			} else {
				srs_cpu->mc_tx_retargeted_cpu[0] = -1;
			}
			mutex_enter(&cpu_lock);
		}
	}
}

/*
 * When a CPU comes back online, bind the MAC kernel threads which
 * were previously bound to that CPU, and had to be unbound because
 * the CPU was going away.
 *
 * These functions are called with cpu_lock held and hence we can't
 * cv_wait to grab the mac perimeter. Since these functions walk the soft
 * ring list of an SRS without being in the perimeter, the list itself
 * is protected by the SRS lock.
 */
static void
mac_walk_srs_and_bind(int cpuid)
{
	mac_soft_ring_set_t *mac_srs;
	mac_soft_ring_t *soft_ring;

	rw_enter(&mac_srs_g_lock, RW_READER);

	if ((mac_srs = mac_srs_g_list) == NULL)
		goto done;

	for (; mac_srs != NULL; mac_srs = mac_srs->srs_next) {
		if (mac_srs->srs_worker_cpuid == -1 &&
		    mac_srs->srs_worker_cpuid_save == cpuid) {
			mac_srs->srs_worker_cpuid_save = -1;
			mac_srs_worker_bind(mac_srs, cpuid);
		}

		if (!(mac_srs->srs_type & SRST_TX)) {
			if (mac_srs->srs_poll_cpuid == -1 &&
			    mac_srs->srs_poll_cpuid_save == cpuid) {
				mac_srs->srs_poll_cpuid_save = -1;
				mac_srs_poll_bind(mac_srs, cpuid);
			}
		}

		/* Next tackle the soft rings associated with the srs */
		mutex_enter(&mac_srs->srs_lock);
		for (soft_ring = mac_srs->srs_soft_ring_head; soft_ring != NULL;
		    soft_ring = soft_ring->s_ring_next) {
			if (soft_ring->s_ring_cpuid == -1 &&
			    soft_ring->s_ring_cpuid_save == cpuid) {
				soft_ring->s_ring_cpuid_save = -1;
				(void) mac_soft_ring_bind(soft_ring, cpuid);
			}
		}
		mutex_exit(&mac_srs->srs_lock);
	}
done:
	rw_exit(&mac_srs_g_lock);
}

1430
1431/*
1432 * Change the priority of the SRS's poll and worker thread. Additionally,
1433 * update the priority of the worker threads for the SRS's soft rings.
1434 * Need to modify any associated squeue threads.
1435 */
1436void
1437mac_update_srs_priority(mac_soft_ring_set_t *mac_srs, pri_t prival)
1438{
1439	mac_soft_ring_t		*ringp;
1440
1441	mac_srs->srs_pri = prival;
1442	thread_lock(mac_srs->srs_worker);
1443	(void) thread_change_pri(mac_srs->srs_worker, mac_srs->srs_pri, 0);
1444	thread_unlock(mac_srs->srs_worker);
1445	if (mac_srs->srs_poll_thr != NULL) {
1446		thread_lock(mac_srs->srs_poll_thr);
1447		(void) thread_change_pri(mac_srs->srs_poll_thr,
1448		    mac_srs->srs_pri, 0);
1449		thread_unlock(mac_srs->srs_poll_thr);
1450	}
1451	if ((ringp = mac_srs->srs_soft_ring_head) == NULL)
1452		return;
1453	while (ringp != mac_srs->srs_soft_ring_tail) {
1454		thread_lock(ringp->s_ring_worker);
1455		(void) thread_change_pri(ringp->s_ring_worker,
1456		    mac_srs->srs_pri, 0);
1457		thread_unlock(ringp->s_ring_worker);
1458		ringp = ringp->s_ring_next;
1459	}
1460	ASSERT(ringp == mac_srs->srs_soft_ring_tail);
1461	thread_lock(ringp->s_ring_worker);
1462	(void) thread_change_pri(ringp->s_ring_worker, mac_srs->srs_pri, 0);
1463	thread_unlock(ringp->s_ring_worker);
1464}
1465
/*
 * Change the receive bandwidth limit.
 */
static void
mac_rx_srs_update_bwlimit(mac_soft_ring_set_t *srs, mac_resource_props_t *mrp)
{
	mac_soft_ring_t		*softring;

	mutex_enter(&srs->srs_lock);
	mutex_enter(&srs->srs_bw->mac_bw_lock);

	if (mrp->mrp_maxbw == MRP_MAXBW_RESETVAL) {
		/* Reset bandwidth limit */
		if (srs->srs_type & SRST_BW_CONTROL) {
			softring = srs->srs_soft_ring_head;
			while (softring != NULL) {
				softring->s_ring_type &= ~ST_RING_BW_CTL;
				softring = softring->s_ring_next;
			}
			srs->srs_type &= ~SRST_BW_CONTROL;
			srs->srs_drain_func = mac_rx_srs_drain;
		}
	} else {
		/* Set/Modify bandwidth limit */
		srs->srs_bw->mac_bw_limit = FLOW_BYTES_PER_TICK(mrp->mrp_maxbw);
		/*
		 * Give twice the queuing capability before
		 * dropping packets. The unit is bytes/tick.
		 */
		srs->srs_bw->mac_bw_drop_threshold =
		    srs->srs_bw->mac_bw_limit << 1;
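		/*
		 * Illustrative numbers (assuming FLOW_BYTES_PER_TICK()
		 * converts the bits/sec limit into bytes per clock tick):
		 * a 100 Mbps limit with hz = 100 works out to 125,000
		 * bytes/tick, so the drop threshold becomes 250,000 bytes.
		 */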
		if (!(srs->srs_type & SRST_BW_CONTROL)) {
			softring = srs->srs_soft_ring_head;
			while (softring != NULL) {
				softring->s_ring_type |= ST_RING_BW_CTL;
				softring = softring->s_ring_next;
			}
			srs->srs_type |= SRST_BW_CONTROL;
			srs->srs_drain_func = mac_rx_srs_drain_bw;
		}
	}
done:
	mutex_exit(&srs->srs_bw->mac_bw_lock);
	mutex_exit(&srs->srs_lock);
}

1511
1512/* Change the transmit bandwidth limit */
1513static void
1514mac_tx_srs_update_bwlimit(mac_soft_ring_set_t *srs, mac_resource_props_t *mrp)
1515{
1516	uint32_t		tx_mode, ring_info = 0;
1517	mac_srs_tx_t		*srs_tx = &srs->srs_tx;
1518	mac_client_impl_t	*mcip = srs->srs_mcip;
1519
1520	/*
1521	 * We need to quiesce/restart the client here because mac_tx() and
1522	 * srs->srs_tx->st_func do not hold srs->srs_lock while accessing
1523	 * st_mode and related fields, which are modified by the code below.
1524	 */
1525	mac_tx_client_quiesce((mac_client_handle_t)mcip);
1526
1527	mutex_enter(&srs->srs_lock);
1528	mutex_enter(&srs->srs_bw->mac_bw_lock);
1529
1530	tx_mode = srs_tx->st_mode;
1531	if (mrp->mrp_maxbw == MRP_MAXBW_RESETVAL) {
1532		/* Reset bandwidth limit */
1533		if (tx_mode == SRS_TX_BW) {
1534			if (srs_tx->st_arg2 != NULL)
1535				ring_info = mac_hwring_getinfo(srs_tx->st_arg2);
1536			if (mac_tx_serialize ||
1537			    (ring_info & MAC_RING_TX_SERIALIZE)) {
1538				srs_tx->st_mode = SRS_TX_SERIALIZE;
1539			} else {
1540				srs_tx->st_mode = SRS_TX_DEFAULT;
1541			}
1542		} else if (tx_mode == SRS_TX_BW_FANOUT) {
1543			srs_tx->st_mode = SRS_TX_FANOUT;
1544		} else if (tx_mode == SRS_TX_BW_AGGR) {
1545			srs_tx->st_mode = SRS_TX_AGGR;
1546		}
1547		srs->srs_type &= ~SRST_BW_CONTROL;
1548	} else {
1549		/* Set/Modify bandwidth limit */
1550		srs->srs_bw->mac_bw_limit = FLOW_BYTES_PER_TICK(mrp->mrp_maxbw);
1551		/*
1552		 * Give twice the queuing capability before
1553		 * dropping packets. The unit is bytes/tick.
1554		 */
1555		srs->srs_bw->mac_bw_drop_threshold =
1556		    srs->srs_bw->mac_bw_limit << 1;
1557		srs->srs_type |= SRST_BW_CONTROL;
1558		if (tx_mode != SRS_TX_BW && tx_mode != SRS_TX_BW_FANOUT &&
1559		    tx_mode != SRS_TX_BW_AGGR) {
1560			if (tx_mode == SRS_TX_SERIALIZE ||
1561			    tx_mode == SRS_TX_DEFAULT) {
1562				srs_tx->st_mode = SRS_TX_BW;
1563			} else if (tx_mode == SRS_TX_FANOUT) {
1564				srs_tx->st_mode = SRS_TX_BW_FANOUT;
1565			} else if (tx_mode == SRS_TX_AGGR) {
1566				srs_tx->st_mode = SRS_TX_BW_AGGR;
1567			} else {
1568				ASSERT(0);
1569			}
1570		}
1571	}
1572done:
1573	srs_tx->st_func = mac_tx_get_func(srs_tx->st_mode);
1574	mutex_exit(&srs->srs_bw->mac_bw_lock);
1575	mutex_exit(&srs->srs_lock);
1576
1577	mac_tx_client_restart((mac_client_handle_t)mcip);
1578}
1579
1580/*
1581 * The uber function that deals with any update to bandwidth limits.
1582 */
1583void
1584mac_srs_update_bwlimit(flow_entry_t *flent, mac_resource_props_t *mrp)
1585{
1586	int			count;
1587
1588	for (count = 0; count < flent->fe_rx_srs_cnt; count++)
1589		mac_rx_srs_update_bwlimit(flent->fe_rx_srs[count], mrp);
1590	mac_tx_srs_update_bwlimit(flent->fe_tx_srs, mrp);
1591}
1592
1593void
1594mac_srs_change_upcall(void *arg, mac_direct_rx_t rx_func, void *rx_arg1)
1595{
1596	mac_soft_ring_set_t	*mac_srs = arg;
1597	mac_srs_rx_t		*srs_rx = &mac_srs->srs_rx;
1598	mac_soft_ring_t		*softring;
1599
1600	mutex_enter(&mac_srs->srs_lock);
1601	ASSERT((mac_srs->srs_type & SRST_TX) == 0);
1602	srs_rx->sr_func = rx_func;
1603	srs_rx->sr_arg1 = rx_arg1;
1604
1605	softring = mac_srs->srs_soft_ring_head;
1606	while (softring != NULL) {
1607		mutex_enter(&softring->s_ring_lock);
1608		softring->s_ring_rx_func = rx_func;
1609		softring->s_ring_rx_arg1 = rx_arg1;
1610		mutex_exit(&softring->s_ring_lock);
1611		softring = softring->s_ring_next;
1612	}
1613
1614	mutex_exit(&mac_srs->srs_lock);
1615}
1616
1617/*
1618 * When the first sub-flow is added to a link, we disable polling on the
1619 * link and also modify the entry point to mac_rx_srs_subflow_process.
1620 * (polling is disabled because with the subflow added, accounting
1621 * for polling needs additional logic, it is assumed that when a subflow is
1622 * added, we can take some hit as a result of disabling polling rather than
1623 * adding more complexity - if this becomes a perf. issue we need to
1624 * re-rvaluate this logic).  When the last subflow is removed, we turn back
1625 * polling and also reset the entry point to mac_rx_srs_process.
1626 *
1627 * In the future if there are multiple SRS, we can simply
1628 * take one and give it to the flow rather than disabling polling and
1629 * resetting the entry point.
1630 */
1631void
1632mac_client_update_classifier(mac_client_impl_t *mcip, boolean_t enable)
1633{
1634	flow_entry_t		*flent = mcip->mci_flent;
1635	int			i;
1636	mac_impl_t		*mip = mcip->mci_mip;
1637	mac_rx_func_t		rx_func;
1638	uint_t			rx_srs_cnt;
1639	boolean_t		enable_classifier;
1640
1641	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1642
1643	enable_classifier = !FLOW_TAB_EMPTY(mcip->mci_subflow_tab) && enable;
1644
1645	rx_func = enable_classifier ? mac_rx_srs_subflow_process :
1646	    mac_rx_srs_process;
1647
1648	/* Tell mac_srs_poll_state_change to disable polling if necessary */
1649	if (mip->mi_state_flags & MIS_POLL_DISABLE)
1650		enable_classifier = B_TRUE;
1651
1652	/*
1653	 * If receive function has already been configured correctly for
1654	 * current subflow configuration, do nothing.
1655	 */
1656	if (flent->fe_cb_fn == (flow_fn_t)rx_func)
1657		return;
1658
1659	rx_srs_cnt = flent->fe_rx_srs_cnt;
1660	for (i = 0; i < rx_srs_cnt; i++) {
1661		ASSERT(flent->fe_rx_srs[i] != NULL);
1662		mac_srs_poll_state_change(flent->fe_rx_srs[i],
1663		    enable_classifier, rx_func);
1664	}
1665
1666	/*
1667	 * Change the S/W classifier so that we can land in the
1668	 * correct processing function with correct argument.
1669	 * If all subflows have been removed we can revert to
1670	 * mac_rx_srsprocess, else we need mac_rx_srs_subflow_process.
1671	 */
1672	mutex_enter(&flent->fe_lock);
1673	flent->fe_cb_fn = (flow_fn_t)rx_func;
1674	flent->fe_cb_arg1 = (void *)mip;
1675	flent->fe_cb_arg2 = flent->fe_rx_srs[0];
1676	mutex_exit(&flent->fe_lock);
1677}
1678
1679static void
1680mac_srs_update_fanout_list(mac_soft_ring_set_t *mac_srs)
1681{
1682	int tcp_count = 0, udp_count = 0, oth_count = 0, tx_count = 0;
1683	mac_soft_ring_t *softring;
1684
1685	softring = mac_srs->srs_soft_ring_head;
1686	if (softring == NULL) {
1687		ASSERT(mac_srs->srs_soft_ring_count == 0);
1688		mac_srs->srs_tcp_ring_count = 0;
1689		mac_srs->srs_udp_ring_count = 0;
1690		mac_srs->srs_oth_ring_count = 0;
1691		mac_srs->srs_tx_ring_count = 0;
1692		return;
1693	}
1694
1695	while (softring != NULL) {
1696		if (softring->s_ring_type & ST_RING_TCP) {
1697			mac_srs->srs_tcp_soft_rings[tcp_count++] = softring;
1698		} else if (softring->s_ring_type & ST_RING_UDP) {
1699			mac_srs->srs_udp_soft_rings[udp_count++] = softring;
1700		} else if (softring->s_ring_type & ST_RING_OTH) {
1701			mac_srs->srs_oth_soft_rings[oth_count++] = softring;
1702		} else {
1703			ASSERT(softring->s_ring_type & ST_RING_TX);
1704			mac_srs->srs_tx_soft_rings[tx_count++] = softring;
1705		}
1706		softring = softring->s_ring_next;
1707	}
1708
1709	ASSERT(mac_srs->srs_soft_ring_count ==
1710	    (tcp_count + udp_count + oth_count + tx_count));
1711	mac_srs->srs_tcp_ring_count = tcp_count;
1712	mac_srs->srs_udp_ring_count = udp_count;
1713	mac_srs->srs_oth_ring_count = oth_count;
1714	mac_srs->srs_tx_ring_count = tx_count;
1715}
1716
1717void
1718mac_srs_create_proto_softrings(int id, uint16_t type, pri_t pri,
1719    mac_client_impl_t *mcip, mac_soft_ring_set_t *mac_srs,
1720    processorid_t cpuid, mac_direct_rx_t rx_func, void *x_arg1,
1721    mac_resource_handle_t x_arg2, boolean_t set_bypass)
1722{
1723	mac_soft_ring_t	*softring;
1724	mac_rx_fifo_t	mrf;
1725
1726	bzero(&mrf, sizeof (mac_rx_fifo_t));
1727	mrf.mrf_type = MAC_RX_FIFO;
1728	mrf.mrf_receive = (mac_receive_t)mac_soft_ring_poll;
1729	mrf.mrf_intr_enable =
1730	    (mac_intr_enable_t)mac_soft_ring_intr_enable;
1731	mrf.mrf_intr_disable =
1732	    (mac_intr_disable_t)mac_soft_ring_intr_disable;
1733	mrf.mrf_flow_priority = pri;
1734
1735	softring = mac_soft_ring_create(id, mac_soft_ring_worker_wait,
1736	    (type|ST_RING_TCP), pri, mcip, mac_srs,
1737	    cpuid, rx_func, x_arg1, x_arg2);
1738	softring->s_ring_rx_arg2 = NULL;
1739
1740	/*
1741	 * TCP and UDP support DLS bypass. In addition, the TCP
1742	 * squeue can also poll its corresponding soft ring.
1743	 */
1744	if (set_bypass && (mcip->mci_resource_arg != NULL)) {
1745		mac_soft_ring_dls_bypass(softring,
1746		    mcip->mci_direct_rx_fn,
1747		    mcip->mci_direct_rx_arg);
1748
1749		mrf.mrf_rx_arg = softring;
1750		mrf.mrf_intr_handle = (mac_intr_handle_t)softring;
1751
1752		/*
1753		 * Make a call in IP to get a TCP squeue assigned to
1754		 * this softring to maintain full CPU locality through
1755		 * the stack and allow the squeue to be able to poll
1756		 * the softring so the flow control can be pushed
1757		 * all the way to H/W.
1758		 */
1759		softring->s_ring_rx_arg2 =
1760		    mcip->mci_resource_add((void *)mcip->mci_resource_arg,
1761		    (mac_resource_t *)&mrf);
1762	}
1763
1764	/*
1765	 * Non-TCP protocols don't support squeues. Hence we
1766	 * don't make any ring addition callbacks for non-TCP
1767	 * rings. Now create the UDP softring and allow it to
1768	 * bypass the DLS layer.
1769	 */
1770	softring = mac_soft_ring_create(id, mac_soft_ring_worker_wait,
1771	    (type|ST_RING_UDP), pri, mcip, mac_srs,
1772	    cpuid, rx_func, x_arg1, x_arg2);
1773	softring->s_ring_rx_arg2 = NULL;
1774
1775	if (set_bypass && (mcip->mci_resource_arg != NULL)) {
1776		mac_soft_ring_dls_bypass(softring,
1777		    mcip->mci_direct_rx_fn,
1778		    mcip->mci_direct_rx_arg);
1779	}
1780
1781	/* Create the OTH softring, which has to go through the DLS */
1782	softring = mac_soft_ring_create(id, mac_soft_ring_worker_wait,
1783	    (type|ST_RING_OTH), pri, mcip, mac_srs,
1784	    cpuid, rx_func, x_arg1, x_arg2);
1785	softring->s_ring_rx_arg2 = NULL;
1786}
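
/*
 * For reference, callers invoke the routine above once per fanout
 * index; e.g. the fanout-grow path below uses:
 *
 *	mac_srs_create_proto_softrings(i, soft_ring_flag,
 *	    mac_rx_srs->srs_pri, mcip, mac_rx_srs, cpuid,
 *	    rx_func, x_arg1, x_arg2, B_TRUE);
 *
 * which creates one TCP, one UDP and one OTH softring for index 'i'.
 */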
1787
1788/*
1789 * This routine associates a CPU or a set of CPUs with the processing
1790 * of incoming traffic from a mac client. If multiple CPUs are
1791 * specified, that many soft rings are created, with each soft ring
1792 * worker thread bound to a CPU in the set. Each soft ring is in turn
1793 * associated with an squeue, and the squeue is moved to the same CPU
1794 * as its soft ring.
1795 */
1796static void
1797mac_srs_fanout_modify(mac_client_impl_t *mcip, mac_direct_rx_t rx_func,
1798    void *x_arg1, mac_resource_handle_t x_arg2,
1799    mac_soft_ring_set_t *mac_rx_srs, mac_soft_ring_set_t *mac_tx_srs)
1800{
1801	mac_soft_ring_t *softring;
1802	uint32_t soft_ring_flag = 0;
1803	processorid_t cpuid = -1;
1804	int i, srings_present, new_fanout_cnt;
1805	mac_cpus_t *srs_cpu;
1806
1807	/* fanout state is REINIT. Set it back to INIT */
1808	ASSERT(mac_rx_srs->srs_fanout_state == SRS_FANOUT_REINIT);
1809	mac_rx_srs->srs_fanout_state = SRS_FANOUT_INIT;
1810
1811	/* how many are present right now */
1812	srings_present = mac_rx_srs->srs_tcp_ring_count;
1813	/* new request */
1814	srs_cpu = &mac_rx_srs->srs_cpu;
1815	new_fanout_cnt = srs_cpu->mc_rx_fanout_cnt;
1816
1817	mutex_enter(&mac_rx_srs->srs_lock);
1818	if (mac_rx_srs->srs_type & SRST_BW_CONTROL)
1819		soft_ring_flag |= ST_RING_BW_CTL;
1820	mutex_exit(&mac_rx_srs->srs_lock);
1821
1822	if (new_fanout_cnt > srings_present) {
1823		/* soft rings increased */
1824		mutex_enter(&mac_rx_srs->srs_lock);
1825		mac_rx_srs->srs_type |= SRST_FANOUT_SRC_IP;
1826		mutex_exit(&mac_rx_srs->srs_lock);
1827
1828		for (i = mac_rx_srs->srs_tcp_ring_count;
1829		    i < new_fanout_cnt; i++) {
1830			/*
1831			 * Create the protocol softrings and set the
1832			 * DLS bypass where possible.
1833			 */
1834			mac_srs_create_proto_softrings(i, soft_ring_flag,
1835			    mac_rx_srs->srs_pri, mcip, mac_rx_srs, cpuid,
1836			    rx_func, x_arg1, x_arg2, B_TRUE);
1837		}
1838		mac_srs_update_fanout_list(mac_rx_srs);
1839	} else if (new_fanout_cnt < srings_present) {
1840		/* soft rings decreased */
1841		if (new_fanout_cnt == 1) {
1842			mutex_enter(&mac_rx_srs->srs_lock);
1843			mac_rx_srs->srs_type &= ~SRST_FANOUT_SRC_IP;
1844			ASSERT(mac_rx_srs->srs_type & SRST_FANOUT_PROTO);
1845			mutex_exit(&mac_rx_srs->srs_lock);
1846		}
1847		/* Get rid of extra soft rings */
1848		for (i = new_fanout_cnt;
1849		    i < mac_rx_srs->srs_tcp_ring_count; i++) {
1850			softring = mac_rx_srs->srs_tcp_soft_rings[i];
1851			if (softring->s_ring_rx_arg2 != NULL) {
1852				mcip->mci_resource_remove(
1853				    (void *)mcip->mci_resource_arg,
1854				    softring->s_ring_rx_arg2);
1855			}
1856			mac_soft_ring_remove(mac_rx_srs,
1857			    mac_rx_srs->srs_tcp_soft_rings[i]);
1858			mac_soft_ring_remove(mac_rx_srs,
1859			    mac_rx_srs->srs_udp_soft_rings[i]);
1860			mac_soft_ring_remove(mac_rx_srs,
1861			    mac_rx_srs->srs_oth_soft_rings[i]);
1862		}
1863		mac_srs_update_fanout_list(mac_rx_srs);
1864	}
1865
1866	ASSERT(new_fanout_cnt == mac_rx_srs->srs_tcp_ring_count);
1867	mutex_enter(&cpu_lock);
1868	for (i = 0; i < mac_rx_srs->srs_tcp_ring_count; i++) {
1869		cpuid = srs_cpu->mc_rx_fanout_cpus[i];
1870		(void) mac_soft_ring_bind(mac_rx_srs->srs_udp_soft_rings[i],
1871		    cpuid);
1872		(void) mac_soft_ring_bind(mac_rx_srs->srs_oth_soft_rings[i],
1873		    cpuid);
1874		(void) mac_soft_ring_bind(mac_rx_srs->srs_tcp_soft_rings[i],
1875		    cpuid);
1876		softring = mac_rx_srs->srs_tcp_soft_rings[i];
1877		if (softring->s_ring_rx_arg2 != NULL) {
1878			mcip->mci_resource_bind((void *)mcip->mci_resource_arg,
1879			    softring->s_ring_rx_arg2, cpuid);
1880		}
1881	}
1882
1883	mac_srs_worker_bind(mac_rx_srs, srs_cpu->mc_rx_workerid);
1884	mac_srs_poll_bind(mac_rx_srs, srs_cpu->mc_rx_pollid);
1885	mac_rx_srs_retarget_intr(mac_rx_srs, srs_cpu->mc_rx_intr_cpu);
1886	/*
1887	 * Bind the Tx SRS and soft ring threads too. Let's bind the
1888	 * Tx SRS to the last cpu in the mrp list.
1889	 */
1890	if (mac_tx_srs != NULL) {
1891		BIND_TX_SRS_AND_SOFT_RINGS(mac_tx_srs, mrp);
1892		mac_tx_srs_retarget_intr(mac_tx_srs);
1893	}
1894	mutex_exit(&cpu_lock);
1895}
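
/*
 * Worked example for mac_srs_fanout_modify() above (illustrative
 * numbers only): growing the fanout from 2 to 4 CPUs creates the
 * TCP/UDP/OTH softrings for indices 2 and 3, then rebinds all four
 * indices' softrings (and their TCP squeues, via mci_resource_bind)
 * to the CPUs in mc_rx_fanout_cpus[]. Shrinking from 4 to 2 removes
 * the softrings for indices 2 and 3, releasing their squeues first
 * via mci_resource_remove, before rebinding the rest.
 */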
1896
1897/*
1898 * Bind SRS threads and soft rings to CPUs/create fanout list.
1899 */
1900void
1901mac_srs_fanout_init(mac_client_impl_t *mcip, mac_resource_props_t *mrp,
1902    mac_direct_rx_t rx_func, void *x_arg1, mac_resource_handle_t x_arg2,
1903    mac_soft_ring_set_t *mac_rx_srs, mac_soft_ring_set_t *mac_tx_srs,
1904    cpupart_t *cpupart)
1905{
1906	int		i;
1907	processorid_t	cpuid;
1908	uint32_t	soft_ring_flag = 0;
1909	int soft_ring_cnt;
1910	mac_cpus_t *srs_cpu = &mac_rx_srs->srs_cpu;
1911
1912	/*
1913	 * Remove the no-soft-rings flag; we will adjust it
1914	 * appropriately further down.
1915	 */
1916	mutex_enter(&mac_rx_srs->srs_lock);
1917	mac_rx_srs->srs_type &= ~SRST_NO_SOFT_RINGS;
1918	mutex_exit(&mac_rx_srs->srs_lock);
1919
1920	ASSERT(mac_rx_srs->srs_soft_ring_head == NULL);
1921
1922	if (mac_rx_srs->srs_type & SRST_BW_CONTROL)
1923		soft_ring_flag |= ST_RING_BW_CTL;
1924
1925	ASSERT(mac_rx_srs->srs_fanout_state == SRS_FANOUT_UNINIT);
1926	mac_rx_srs->srs_fanout_state = SRS_FANOUT_INIT;
1927	/*
1928	 * The ring count can be 0 if no fanout is required and no CPUs
1929	 * were specified. In that case leave the SRS worker and poll
1930	 * threads unbound.
1931	 */
1932	ASSERT(mrp != NULL);
1933	soft_ring_cnt = srs_cpu->mc_rx_fanout_cnt;
1934
1935	/* Step 1: srs_cpu contains the list of CPUs to bind the threads to */
1936	if (soft_ring_cnt > 0) {
1937		mutex_enter(&cpu_lock);
1938		for (i = 0; i < soft_ring_cnt; i++) {
1939			cpuid = srs_cpu->mc_rx_fanout_cpus[i];
1940			/* Create the protocol softrings */
1941			mac_srs_create_proto_softrings(i, soft_ring_flag,
1942			    mac_rx_srs->srs_pri, mcip, mac_rx_srs, cpuid,
1943			    rx_func, x_arg1, x_arg2, B_FALSE);
1944		}
1945		mac_srs_worker_bind(mac_rx_srs, srs_cpu->mc_rx_workerid);
1946		mac_srs_poll_bind(mac_rx_srs, srs_cpu->mc_rx_pollid);
1947		mac_rx_srs_retarget_intr(mac_rx_srs, srs_cpu->mc_rx_intr_cpu);
1948		/*
1949		 * Bind the Tx SRS and soft ring threads too.
1950		 * Let's bind the Tx SRS to the last cpu in
1951		 * the mrp list.
1952		 */
1953		if (mac_tx_srs == NULL) {
1954			mutex_exit(&cpu_lock);
1955			goto alldone;
1956		}
1957
1958		BIND_TX_SRS_AND_SOFT_RINGS(mac_tx_srs, mrp);
1959		mac_tx_srs_retarget_intr(mac_tx_srs);
1960		mutex_exit(&cpu_lock);
1961	} else {
1962		mutex_enter(&cpu_lock);
1963		/*
1964		 * For a subflow, mrp_rx_workerid and mrp_rx_pollid
1965		 * are not set.
1966		 */
1967		mac_srs_worker_bind(mac_rx_srs, mrp->mrp_rx_workerid);
1968		mac_srs_poll_bind(mac_rx_srs, mrp->mrp_rx_pollid);
1969		mutex_exit(&cpu_lock);
1970		goto no_softrings;
1971	}
1972
1973alldone:
1974	if (soft_ring_cnt > 1)
1975		mac_rx_srs->srs_type |= SRST_FANOUT_SRC_IP;
1976	mac_srs_update_fanout_list(mac_rx_srs);
1977	mac_srs_client_poll_enable(mcip, mac_rx_srs);
1978	return;
1979
1980no_softrings:
1981	if (mac_rx_srs->srs_type & SRST_FANOUT_PROTO) {
1982		mutex_enter(&cpu_lock);
1983		cpuid = mac_next_bind_cpu(cpupart);
1984		/* Create the protocol softrings */
1985		mac_srs_create_proto_softrings(0, soft_ring_flag,
1986		    mac_rx_srs->srs_pri, mcip, mac_rx_srs, cpuid,
1987		    rx_func, x_arg1, x_arg2, B_FALSE);
1988		mutex_exit(&cpu_lock);
1989	} else {
1990		/*
1991		 * This is the case when there is no fanout which is
1992		 * true for subflows.
1993		 */
1994		mac_rx_srs->srs_type |= SRST_NO_SOFT_RINGS;
1995	}
1996	mac_srs_update_fanout_list(mac_rx_srs);
1997	mac_srs_client_poll_enable(mcip, mac_rx_srs);
1998}
1999
2000/*
2001 * mac_fanout_setup:
2002 *
2003 * Calls mac_srs_fanout_init() or modify() depending upon whether
2004 * the SRS is getting initialized or re-initialized.
2005 */
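/*
 * Fanout state transitions handled below (a summary of the switch in
 * this function, not additional behavior):
 *
 *	SRS_FANOUT_UNINIT -> mac_srs_fanout_init()           -> SRS_FANOUT_INIT
 *	SRS_FANOUT_INIT   -> nothing to do
 *	SRS_FANOUT_REINIT -> quiesce, mac_srs_fanout_modify(),
 *			     restart                         -> SRS_FANOUT_INIT
 */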
2006void
2007mac_fanout_setup(mac_client_impl_t *mcip, flow_entry_t *flent,
2008    mac_resource_props_t *mrp, mac_direct_rx_t rx_func, void *x_arg1,
2009    mac_resource_handle_t x_arg2, cpupart_t *cpupart)
2010{
2011	mac_soft_ring_set_t *mac_rx_srs, *mac_tx_srs;
2012	int i, rx_srs_cnt;
2013
2014	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
2015	/*
2016	 * This is an aggregation port. Fanout will be set up
2017	 * over the aggregation itself.
2018	 */
2019	if (mcip->mci_state_flags & MCIS_EXCLUSIVE)
2020		return;
2021
2022	mac_rx_srs = flent->fe_rx_srs[0];
2023	/*
2024	 * Set up the fanout on the tx side only once, with the
2025	 * first rx SRS. The CPU binding, fanout, and bandwidth
2026	 * criteria are common to both RX and TX, so
2027	 * initializing them alongside avoids redundant code.
2028	 */
2029	mac_tx_srs = flent->fe_tx_srs;
2030	rx_srs_cnt = flent->fe_rx_srs_cnt;
2031
2032	/* No fanout for subflows */
2033	if (flent->fe_type & FLOW_USER) {
2034		mac_srs_fanout_init(mcip, mrp, rx_func,
2035		    x_arg1, x_arg2, mac_rx_srs, mac_tx_srs,
2036		    cpupart);
2037		return;
2038	}
2039
2040	if (mrp->mrp_mask & MRP_CPUS_USERSPEC)
2041		mac_flow_user_cpu_init(flent, mrp);
2042	else
2043		mac_flow_cpu_init(flent, cpupart);
2044
2045	mrp->mrp_rx_fanout_cnt = mac_rx_srs->srs_cpu.mc_rx_fanout_cnt;
2046
2047	/*
2048	 * Set up fanout for both SW (0th SRS) and HW classified
2049	 * SRS (the rest of Rx SRSs in flent).
2050	 */
2051	for (i = 0; i < rx_srs_cnt; i++) {
2052		mac_rx_srs = flent->fe_rx_srs[i];
2053		if (i != 0)
2054			mac_tx_srs = NULL;
2055		switch (mac_rx_srs->srs_fanout_state) {
2056		case SRS_FANOUT_UNINIT:
2057			mac_srs_fanout_init(mcip, mrp, rx_func,
2058			    x_arg1, x_arg2, mac_rx_srs, mac_tx_srs,
2059			    cpupart);
2060			break;
2061		case SRS_FANOUT_INIT:
2062			break;
2063		case SRS_FANOUT_REINIT:
2064			mac_rx_srs_quiesce(mac_rx_srs, SRS_QUIESCE);
2065			mac_srs_fanout_modify(mcip, rx_func, x_arg1,
2066			    x_arg2, mac_rx_srs, mac_tx_srs);
2067			mac_rx_srs_restart(mac_rx_srs);
2068			break;
2069		default:
2070			VERIFY(mac_rx_srs->srs_fanout_state <=
2071			    SRS_FANOUT_REINIT);
2072			break;
2073		}
2074	}
2075}
2076
2077/*
2078 * mac_srs_create:
2079 *
2080 * Create a mac_soft_ring_set_t (SRS). If srs_type includes
2081 * SRST_TX, an SRS for the Tx side is created. Otherwise an SRS for Rx side
2082 * processing is created.
2083 *
2084 * Details on Rx SRS:
2085 * Create an SRS and also add the necessary soft rings for TCP and
2086 * non-TCP based on the fanout type and count specified.
2087 *
2088 * mac_soft_ring_fanout, mac_srs_fanout_modify (?),
2089 * mac_soft_ring_stop_workers, mac_soft_ring_set_destroy, etc. need
2090 * to be heavily modified.
2091 *
2092 * mi_soft_ring_list_size, mi_soft_ring_size, etc. need to disappear.
2093 */
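/*
 * Typical invocations, taken from the setup paths later in this file:
 *
 *	Rx SRS for S/W classification:
 *		mac_srs_create(mcip, flent, fanout_type | link_type,
 *		    mac_rx_deliver, mcip, NULL, NULL);
 *	Rx SRS tied to a H/W ring (enables dynamic polling of the ring):
 *		mac_srs_create(mcip, flent, fanout_type | link_type,
 *		    mac_rx_deliver, mcip, NULL, ring);
 *	Tx SRS:
 *		mac_srs_create(mcip, flent, SRST_TX | link_type,
 *		    NULL, mcip, NULL, NULL);
 */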
2094mac_soft_ring_set_t *
2095mac_srs_create(mac_client_impl_t *mcip, flow_entry_t *flent, uint32_t srs_type,
2096    mac_direct_rx_t rx_func, void *x_arg1, mac_resource_handle_t x_arg2,
2097    mac_ring_t *ring)
2098{
2099	mac_soft_ring_set_t 	*mac_srs;
2100	mac_srs_rx_t		*srs_rx;
2101	mac_srs_tx_t		*srs_tx;
2102	mac_bw_ctl_t		*mac_bw;
2103	mac_resource_props_t	*mrp;
2104	boolean_t		is_tx_srs = ((srs_type & SRST_TX) != 0);
2105
2106	mac_srs = kmem_cache_alloc(mac_srs_cache, KM_SLEEP);
2107	bzero(mac_srs, sizeof (mac_soft_ring_set_t));
2108	srs_rx = &mac_srs->srs_rx;
2109	srs_tx = &mac_srs->srs_tx;
2110
2111	mutex_enter(&flent->fe_lock);
2112
2113	/*
2114	 * Get the bandwidth control structure from the flent. Clear
2115	 * any residual values in the control structure: in the tx bw
2116	 * struct, and also in the rx bw struct if this rx SRS is the
2117	 * 1st one being brought up (the rx bw ctl struct may be
2118	 * shared by multiple SRSs).
2119	 */
2120	if (is_tx_srs) {
2121		mac_srs->srs_bw = &flent->fe_tx_bw;
2122		bzero(mac_srs->srs_bw, sizeof (mac_bw_ctl_t));
2123		flent->fe_tx_srs = mac_srs;
2124	} else {
2125		/*
2126		 * The bw counter (stored in the flent) is shared
2127		 * by SRSs within an rx group.
2128		 */
2129		mac_srs->srs_bw = &flent->fe_rx_bw;
2130		/* First rx SRS, clear the bw structure */
2131		if (flent->fe_rx_srs_cnt == 0)
2132			bzero(mac_srs->srs_bw, sizeof (mac_bw_ctl_t));
2133
2134		/*
2135		 * It is better to panic here rather than just assert because
2136		 * on a non-debug kernel we might end up corrupting memory
2137		 * and making it difficult to debug.
2138		 */
2139		if (flent->fe_rx_srs_cnt >= MAX_RINGS_PER_GROUP) {
2140			panic("Array Overrun detected due to MAC client %p "
2141			    "having more rings than %d", (void *)mcip,
2142			    MAX_RINGS_PER_GROUP);
2143		}
2144		flent->fe_rx_srs[flent->fe_rx_srs_cnt] = mac_srs;
2145		flent->fe_rx_srs_cnt++;
2146	}
2147	mac_srs->srs_flent = flent;
2148	mutex_exit(&flent->fe_lock);
2149
2150	mac_srs->srs_state = 0;
2151	mac_srs->srs_type = (srs_type | SRST_NO_SOFT_RINGS);
2152	mac_srs->srs_worker_cpuid = mac_srs->srs_worker_cpuid_save = -1;
2153	mac_srs->srs_poll_cpuid = mac_srs->srs_poll_cpuid_save = -1;
2154	mac_srs->srs_mcip = mcip;
2155	mac_srs_fanout_list_alloc(mac_srs);
2156
2157	/*
2158	 * For a flow we use the underlying MAC client's priority range with
2159	 * the priority value to find an absolute priority value. For a MAC
2160	 * client we use the MAC client's maximum priority as the value.
2161	 */
2162	mrp = &flent->fe_effective_props;
2163	if ((mac_srs->srs_type & SRST_FLOW) != 0) {
2164		mac_srs->srs_pri = FLOW_PRIORITY(mcip->mci_min_pri,
2165		    mcip->mci_max_pri, mrp->mrp_priority);
2166	} else {
2167		mac_srs->srs_pri = mcip->mci_max_pri;
2168	}
2169	/*
2170	 * We need to insert the SRS in the global list before
2171	 * binding the SRS and SR threads. Otherwise there is a
2172	 * small window where the cpu reconfig callbacks
2173	 * may miss the SRS in the list walk and DR could fail
2174	 * as there are bound threads.
2175	 */
2176	mac_srs_add_glist(mac_srs);
2177
2178	/* Initialize bw limit */
2179	if ((mrp->mrp_mask & MRP_MAXBW) != 0) {
2180		mac_srs->srs_drain_func = mac_rx_srs_drain_bw;
2181
2182		mac_bw = mac_srs->srs_bw;
2183		mutex_enter(&mac_bw->mac_bw_lock);
2184		mac_bw->mac_bw_limit = FLOW_BYTES_PER_TICK(mrp->mrp_maxbw);
2185
2186		/*
2187		 * Give twice the queuing capability before
2188		 * dropping packets. The unit is bytes/tick.
2189		 */
2190		mac_bw->mac_bw_drop_threshold = mac_bw->mac_bw_limit << 1;
2191		mutex_exit(&mac_bw->mac_bw_lock);
2192		mac_srs->srs_type |= SRST_BW_CONTROL;
2193	} else {
2194		mac_srs->srs_drain_func = mac_rx_srs_drain;
2195	}
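
	/*
	 * Worked example (illustrative numbers only): if
	 * FLOW_BYTES_PER_TICK() computes a mac_bw_limit of 125000
	 * bytes/tick for the configured maxbw, then up to 250000 bytes
	 * (mac_bw_drop_threshold, i.e. twice the per-tick quota) may be
	 * queued before packets are dropped.
	 */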
2196
2197	/*
2198	 * We use the following policy to control Receive
2199	 * Side Dynamic Polling:
2200	 * 1) We switch to poll mode anytime the processing thread causes
2201	 *    a backlog to build up in SRS and its associated Soft Rings
2202	 *    (sr_poll_pkt_cnt > 0).
2203	 * 2) As long as the backlog stays under the low water mark
2204	 *    (sr_lowat), we poll the H/W for more packets.
2205	 * 3) If the backlog (sr_poll_pkt_cnt) exceeds low water mark, we
2206	 *    stay in poll mode but don't poll the H/W for more packets.
2207	 * 4) Anytime in polling mode, if we poll the H/W for packets and
2208	 *    find nothing plus we have an existing backlog
2209	 *    (sr_poll_pkt_cnt > 0), we stay in polling mode but don't poll
2210	 *    the H/W for packets anymore (let the polling thread go to sleep).
2211	 * 5) Once the backlog is relieved (packets are processed) we re-enable
2212	 *    polling (by signalling the poll thread) only when the backlog
2213	 *    dips below sr_poll_thres.
2214	 * 6) sr_hiwat is used exclusively when we are not polling capable
2215	 *    and is used to decide when to drop packets so the SRS queue
2216	 *    length doesn't grow infinitely.
2217	 */
2218	if (!is_tx_srs) {
2219		srs_rx->sr_hiwat = mac_soft_ring_max_q_cnt;
2220		/* Low water mark needs to be less than high water mark */
2221		srs_rx->sr_lowat = mac_soft_ring_min_q_cnt <=
2222		    mac_soft_ring_max_q_cnt ? mac_soft_ring_min_q_cnt :
2223		    (mac_soft_ring_max_q_cnt >> 2);
2224		/* Poll threshold needs to be half of the low water mark or less */
2225		srs_rx->sr_poll_thres = mac_soft_ring_poll_thres <=
2226		    (srs_rx->sr_lowat >> 1) ? mac_soft_ring_poll_thres :
2227		    (srs_rx->sr_lowat >> 1);
2228		if (mac_latency_optimize)
2229			mac_srs->srs_state |= SRS_LATENCY_OPT;
2230		else
2231			mac_srs->srs_state |= SRS_SOFTRING_QUEUE;
2232	}
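
	/*
	 * Worked example for the thresholds above (illustrative values,
	 * not necessarily the tunable defaults): with
	 * mac_soft_ring_max_q_cnt = 1024, mac_soft_ring_min_q_cnt = 256
	 * and mac_soft_ring_poll_thres = 16, we get sr_hiwat = 1024,
	 * sr_lowat = 256 (min <= max, so used as-is) and
	 * sr_poll_thres = 16 (16 <= 256/2, so also used as-is).
	 */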
2233
2234	mac_srs->srs_worker = thread_create(NULL, 0,
2235	    mac_srs_worker, mac_srs, 0, &p0, TS_RUN, mac_srs->srs_pri);
2236
2237	if (is_tx_srs) {
2238		/* Handle everything about Tx SRS and return */
2239		mac_srs->srs_drain_func = mac_tx_srs_drain;
2240		srs_tx->st_max_q_cnt = mac_tx_srs_max_q_cnt;
2241		srs_tx->st_hiwat =
2242		    (mac_tx_srs_hiwat > mac_tx_srs_max_q_cnt) ?
2243		    mac_tx_srs_max_q_cnt : mac_tx_srs_hiwat;
2244		srs_tx->st_arg1 = x_arg1;
2245		srs_tx->st_arg2 = x_arg2;
2246		goto done;
2247	}
2248
2249	if ((srs_type & SRST_FLOW) != 0 ||
2250	    FLOW_TAB_EMPTY(mcip->mci_subflow_tab))
2251		srs_rx->sr_lower_proc = mac_rx_srs_process;
2252	else
2253		srs_rx->sr_lower_proc = mac_rx_srs_subflow_process;
2254
2255	srs_rx->sr_func = rx_func;
2256	srs_rx->sr_arg1 = x_arg1;
2257	srs_rx->sr_arg2 = x_arg2;
2258
2259	if (ring != NULL) {
2260		uint_t ring_info;
2261
2262		/* Is the mac_srs created over the RX default group? */
2263		if (ring->mr_gh == (mac_group_handle_t)
2264		    MAC_DEFAULT_RX_GROUP(mcip->mci_mip)) {
2265			mac_srs->srs_type |= SRST_DEFAULT_GRP;
2266		}
2267		mac_srs->srs_ring = ring;
2268		ring->mr_srs = mac_srs;
2269		ring->mr_classify_type = MAC_HW_CLASSIFIER;
2270		ring->mr_flag |= MR_INCIPIENT;
2271
2272		if (!(mcip->mci_mip->mi_state_flags & MIS_POLL_DISABLE) &&
2273		    FLOW_TAB_EMPTY(mcip->mci_subflow_tab) && mac_poll_enable)
2274			mac_srs->srs_state |= SRS_POLLING_CAPAB;
2275
2276		mac_srs->srs_poll_thr = thread_create(NULL, 0,
2277		    mac_rx_srs_poll_ring, mac_srs, 0, &p0, TS_RUN,
2278		    mac_srs->srs_pri);
2279		/*
2280		 * Some drivers require serialization and don't send
2281		 * packet chains in interrupt context. For such
2282		 * drivers, we should always queue in the soft ring
2283		 * so that we get a chance to switch into polling
2284		 * mode under backlog.
2285		 */
2286		ring_info = mac_hwring_getinfo((mac_ring_handle_t)ring);
2287		if (ring_info & MAC_RING_RX_ENQUEUE)
2288			mac_srs->srs_state |= SRS_SOFTRING_QUEUE;
2289	}
2290done:
2291	mac_srs_stat_create(mac_srs);
2292	return (mac_srs);
2293}
2294
2295/*
2296 * Figure out the number of soft rings required. It depends on
2297 * whether protocol fanout is required (for LINKs), whether global
2298 * settings require fanout for performance (based on mac_soft_ring_enable),
2299 * or whether the user has specifically requested fanout.
2300 */
2301static uint32_t
2302mac_find_fanout(flow_entry_t *flent, uint32_t link_type)
2303{
2304	uint32_t			fanout_type;
2305	mac_resource_props_t		*mrp = &flent->fe_effective_props;
2306
2307	/* no fanout for subflows */
2308	switch (link_type) {
2309	case SRST_FLOW:
2310		fanout_type = SRST_NO_SOFT_RINGS;
2311		break;
2312	case SRST_LINK:
2313		fanout_type = SRST_FANOUT_PROTO;
2314		break;
2315	}
2316
2317	/* A primary NIC/link is being plumbed */
2318	if (flent->fe_type & FLOW_PRIMARY_MAC) {
2319		if (mac_soft_ring_enable && mac_rx_soft_ring_count > 1) {
2320			fanout_type |= SRST_FANOUT_SRC_IP;
2321		}
2322	} else if (flent->fe_type & FLOW_VNIC) {
2323		/* A VNIC is being created */
2324		if (mrp != NULL && mrp->mrp_ncpus > 0) {
2325			fanout_type |= SRST_FANOUT_SRC_IP;
2326		}
2327	}
2328
2329	return (fanout_type);
2330}
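
/*
 * For illustration, the fanout types computed above work out as
 * follows (a restatement of the code):
 *
 *	subflow (SRST_FLOW):	SRST_NO_SOFT_RINGS
 *	link (SRST_LINK):	SRST_FANOUT_PROTO, plus SRST_FANOUT_SRC_IP
 *				for a primary link when mac_soft_ring_enable
 *				is set and mac_rx_soft_ring_count > 1, or for
 *				a VNIC when mrp_ncpus > 0.
 */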
2331
2332/*
2333 * Change a group from h/w to s/w classification.
2334 */
2335void
2336mac_rx_switch_grp_to_sw(mac_group_t *group)
2337{
2338	mac_ring_t		*ring;
2339	mac_soft_ring_set_t	*mac_srs;
2340
2341	for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) {
2342		if (ring->mr_classify_type == MAC_HW_CLASSIFIER) {
2343			/*
2344			 * Remove the SRS associated with the HW ring.
2345			 * As a result, polling will be disabled.
2346			 */
2347			mac_srs = ring->mr_srs;
2348			ASSERT(mac_srs != NULL);
2349			mac_rx_srs_remove(mac_srs);
2350			ring->mr_srs = NULL;
2351		}
2352
2353		if (ring->mr_state != MR_INUSE)
2354			(void) mac_start_ring(ring);
2355
2356		/*
2357		 * We need to perform SW classification
2358		 * for packets landing in these rings
2359		 */
2360		ring->mr_flag = 0;
2361		ring->mr_classify_type = MAC_SW_CLASSIFIER;
2362	}
2363}
2364
2365/*
2366 * Create the Rx SRS for the S/W classifier and for each ring in the
2367 * group (if it is an exclusive group). Also create the Tx SRS.
2368 */
2369void
2370mac_srs_group_setup(mac_client_impl_t *mcip, flow_entry_t *flent,
2371    uint32_t link_type)
2372{
2373	cpupart_t		*cpupart;
2374	mac_resource_props_t	*mrp = MCIP_RESOURCE_PROPS(mcip);
2375	mac_resource_props_t	*emrp = MCIP_EFFECTIVE_PROPS(mcip);
2376	boolean_t		use_default = B_FALSE;
2377
2378	mac_rx_srs_group_setup(mcip, flent, link_type);
2379	mac_tx_srs_group_setup(mcip, flent, link_type);
2380
2381	pool_lock();
2382	cpupart = mac_pset_find(mrp, &use_default);
2383	mac_fanout_setup(mcip, flent, MCIP_RESOURCE_PROPS(mcip),
2384	    mac_rx_deliver, mcip, NULL, cpupart);
2385	mac_set_pool_effective(use_default, cpupart, mrp, emrp);
2386	pool_unlock();
2387}
2388
2389/*
2390 * Set up the RX SRSs. If the S/W SRS is not set, set it up. If there
2391 * is a group associated with this MAC client, set up SRSs for the
2392 * individual h/w rings.
2393 */
2394void
2395mac_rx_srs_group_setup(mac_client_impl_t *mcip, flow_entry_t *flent,
2396    uint32_t link_type)
2397{
2398	mac_impl_t		*mip = mcip->mci_mip;
2399	mac_soft_ring_set_t	*mac_srs;
2400	mac_ring_t 		*ring;
2401	uint32_t		fanout_type;
2402	mac_group_t		*rx_group = flent->fe_rx_ring_group;
2403
2404	fanout_type = mac_find_fanout(flent, link_type);
2405
2406	/* Create the SRS for S/W classification if none exists */
2407	if (flent->fe_rx_srs[0] == NULL) {
2408		ASSERT(flent->fe_rx_srs_cnt == 0);
2409		/* Setup the Rx SRS */
2410		mac_srs = mac_srs_create(mcip, flent, fanout_type | link_type,
2411		    mac_rx_deliver, mcip, NULL, NULL);
2412		mutex_enter(&flent->fe_lock);
2413		flent->fe_cb_fn = (flow_fn_t)mac_srs->srs_rx.sr_lower_proc;
2414		flent->fe_cb_arg1 = (void *)mip;
2415		flent->fe_cb_arg2 = (void *)mac_srs;
2416		mutex_exit(&flent->fe_lock);
2417	}
2418
2419	if (rx_group == NULL)
2420		return;
2421	/*
2422	 * Fanout for the default SRS is done when the default SRS is
2423	 * created above. As each ring is added to the group, we set up
2424	 * the SRS and fanout for it.
2425	 */
2426	switch (rx_group->mrg_state) {
2427	case MAC_GROUP_STATE_RESERVED:
2428		for (ring = rx_group->mrg_rings; ring != NULL;
2429		    ring = ring->mr_next) {
2430			switch (ring->mr_state) {
2431			case MR_INUSE:
2432			case MR_FREE:
2433				if (ring->mr_srs != NULL)
2434					break;
2435				if (ring->mr_state != MR_INUSE)
2436					(void) mac_start_ring(ring);
2437
2438				/*
2439				 * Since the group is exclusively ours, create
2440				 * an SRS for this ring to allow the
2441				 * individual SRS to dynamically poll the
2442				 * ring. Do this only if the client has a
2443				 * unicast address and is not a VLAN MAC
2444				 * client, since for VLANs we do s/w
2445				 * classification for the VID check.
2446				 */
2447				if ((mcip->mci_state_flags &
2448				    MCIS_NO_UNICAST_ADDR) ||
2449				    i_mac_flow_vid(mcip->mci_flent) !=
2450				    VLAN_ID_NONE) {
2451					break;
2452				}
2453				mac_srs = mac_srs_create(mcip, flent,
2454				    fanout_type | link_type,
2455				    mac_rx_deliver, mcip, NULL, ring);
2456				break;
2457			default:
2458				cmn_err(CE_PANIC,
2459				    "srs_setup: mcip = %p "
2460				    "trying to add UNKNOWN ring = %p\n",
2461				    (void *)mcip, (void *)ring);
2462				break;
2463			}
2464		}
2465		break;
2466	case MAC_GROUP_STATE_SHARED:
2467		/*
2468		 * Set all rings of this group to software classified.
2469		 *
2470		 * If the group is currently RESERVED, the existing mac
2471		 * client (the only client on this group) is using
2472		 * this group exclusively.  In that case we need to
2473		 * disable polling on the rings of the group (if it
2474		 * was enabled), and free the SRS associated with the
2475		 * rings.
2476		 */
2477		mac_rx_switch_grp_to_sw(rx_group);
2478		break;
2479	default:
2480		ASSERT(B_FALSE);
2481		break;
2482	}
2483}
2484
2485/*
2486 * Set up the TX SRS.
2487 */
2488void
2489mac_tx_srs_group_setup(mac_client_impl_t *mcip, flow_entry_t *flent,
2490    uint32_t link_type)
2491{
2492	int			cnt;
2493	int			ringcnt;
2494	mac_ring_t		*ring;
2495	mac_group_t		*grp;
2496
2497	/*
2498	 * If we are opened exclusively (like aggr does for aggr_ports),
2499	 * don't set up Tx SRS and Tx soft rings as they won't be used.
2500	 * The same thing has to be done for the Rx side as well. See bug:
2501	 * 6880080
2502	 */
2503	if (mcip->mci_state_flags & MCIS_EXCLUSIVE) {
2504		/*
2505		 * If we have rings, start them here.
2506		 */
2507		if (flent->fe_tx_ring_group == NULL)
2508			return;
2509		grp = (mac_group_t *)flent->fe_tx_ring_group;
2510		ringcnt = grp->mrg_cur_count;
2511		ring = grp->mrg_rings;
2512		for (cnt = 0; cnt < ringcnt; cnt++) {
2513			if (ring->mr_state != MR_INUSE) {
2514				(void) mac_start_ring(ring);
2515			}
2516			ring = ring->mr_next;
2517		}
2518		return;
2519	}
2520	if (flent->fe_tx_srs == NULL) {
2521		(void) mac_srs_create(mcip, flent, SRST_TX | link_type,
2522		    NULL, mcip, NULL, NULL);
2523	}
2524	mac_tx_srs_setup(mcip, flent);
2525}
2526
2527/*
2528 * Remove all the RX SRSs. If we want to remove only the SRSs associated
2529 * with h/w rings, leave the S/W SRS alone. This is used when we want to
2530 * move the MAC client from one group to another, so we need to tear
2531 * down only the h/w SRSs.
2532 */
2533void
2534mac_rx_srs_group_teardown(flow_entry_t *flent, boolean_t hwonly)
2535{
2536	mac_soft_ring_set_t	*mac_srs;
2537	int			i;
2538	int			count = flent->fe_rx_srs_cnt;
2539
2540	for (i = 0; i < count; i++) {
2541		if (i == 0 && hwonly)
2542			continue;
2543		mac_srs = flent->fe_rx_srs[i];
2544		mac_rx_srs_quiesce(mac_srs, SRS_CONDEMNED);
2545		mac_srs_free(mac_srs);
2546		flent->fe_rx_srs[i] = NULL;
2547		flent->fe_rx_srs_cnt--;
2548	}
2549	ASSERT(!hwonly || flent->fe_rx_srs_cnt == 1);
2550	ASSERT(hwonly || flent->fe_rx_srs_cnt == 0);
2551}
2552
2553/*
2554 * Remove the TX SRS.
2555 */
2556void
2557mac_tx_srs_group_teardown(mac_client_impl_t *mcip, flow_entry_t *flent,
2558    uint32_t link_type)
2559{
2560	mac_soft_ring_set_t	*tx_srs;
2561	mac_srs_tx_t		*tx;
2562
2563	if ((tx_srs = flent->fe_tx_srs) == NULL)
2564		return;
2565
2566	tx = &tx_srs->srs_tx;
2567	switch (link_type) {
2568	case SRST_FLOW:
2569		/*
2570		 * For flows, we need to work with passed
2571		 * flent to find the Rx/Tx SRS.
2572		 */
2573		mac_tx_srs_quiesce(tx_srs, SRS_CONDEMNED);
2574		break;
2575	case SRST_LINK:
2576		mac_tx_client_condemn((mac_client_handle_t)mcip);
2577		if (tx->st_arg2 != NULL) {
2578			ASSERT(tx_srs->srs_type & SRST_TX);
2579			/*
2580			 * The ring itself will be stopped when
2581			 * we release the group or in the
2582			 * mac_datapath_teardown (for the default
2583			 * group)
2584			 */
2585			tx->st_arg2 = NULL;
2586		}
2587		break;
2588	default:
2589		ASSERT(B_FALSE);
2590		break;
2591	}
2592	mac_srs_free(tx_srs);
2593	flent->fe_tx_srs = NULL;
2594}
2595
2596/*
2597 * This is the group state machine.
2598 *
2599 * The state of an Rx group is given by
2600 * the following table. The default group and its rings are started in
2601 * mac_start itself and the default group stays in SHARED state until
2602 * mac_stop, at which time the group and rings are stopped and it
2603 * reverts to the REGISTERED state.
2604 *
2605 * Typically this function is called on a group after adding or removing a
2606 * client from it, to find out what should be the new state of the group.
2607 * If the new state is RESERVED, then the client that owns this group
2608 * exclusively is also returned. Note that adding or removing a client from
2609 * a group could also impact the default group and the caller needs to
2610 * evaluate the effect on the default group.
2611 *
2612 * Group type		# of clients	mi_nactiveclients	Group State
2613 *			in the group
2614 *
2615 * Non-default		0		N.A.			REGISTERED
2616 * Non-default		1		N.A.			RESERVED
2617 *
2618 * Default		0		N.A.			SHARED
2619 * Default		1		1			RESERVED
2620 * Default		1		> 1			SHARED
2621 * Default		> 1		N.A.			SHARED
2622 *
2623 * For a TX group, the following is the state table.
2624 *
2625 * Group type		# of clients	Group State
2626 *			in the group
2627 *
2628 * Non-default		0		REGISTERED
2629 * Non-default		1		RESERVED
2630 *
2631 * Default		0		REGISTERED
2632 * Default		1		RESERVED
2633 * Default		> 1		SHARED
2634 */
2635mac_group_state_t
2636mac_group_next_state(mac_group_t *grp, mac_client_impl_t **group_only_mcip,
2637    mac_group_t *defgrp, boolean_t rx_group)
2638{
2639	mac_impl_t		*mip = (mac_impl_t *)grp->mrg_mh;
2640
2641	*group_only_mcip = NULL;
2642
2643	/* Non-default group */
2644
2645	if (grp != defgrp) {
2646		if (MAC_GROUP_NO_CLIENT(grp))
2647			return (MAC_GROUP_STATE_REGISTERED);
2648
2649		*group_only_mcip = MAC_GROUP_ONLY_CLIENT(grp);
2650		if (*group_only_mcip != NULL)
2651			return (MAC_GROUP_STATE_RESERVED);
2652
2653		return (MAC_GROUP_STATE_SHARED);
2654	}
2655
2656	/* Default group */
2657
2658	if (MAC_GROUP_NO_CLIENT(grp)) {
2659		if (rx_group)
2660			return (MAC_GROUP_STATE_SHARED);
2661		else
2662			return (MAC_GROUP_STATE_REGISTERED);
2663	}
2664	*group_only_mcip = MAC_GROUP_ONLY_CLIENT(grp);
2665	if (*group_only_mcip == NULL)
2666		return (MAC_GROUP_STATE_SHARED);
2667
2668	if (rx_group && mip->mi_nactiveclients != 1)
2669		return (MAC_GROUP_STATE_SHARED);
2670
2671	ASSERT(*group_only_mcip != NULL);
2672	return (MAC_GROUP_STATE_RESERVED);
2673}
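
/*
 * Typical caller pattern (as used by mac_datapath_setup() below):
 *
 *	mac_group_add_client(rgroup, mcip);
 *	next_state = mac_group_next_state(rgroup, &group_only_mcip,
 *	    default_rgroup, B_TRUE);
 *	mac_set_group_state(rgroup, next_state);
 */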
2674
2675/*
2676 * OVERVIEW NOTES FOR DATAPATH
2677 * ===========================
2678 *
2679 * Create an SRS and set up the corresponding flow function and args.
2680 * Add a classification rule for the flow specified by 'flent' and program
2681 * the hardware classifier when applicable.
2682 *
2683 * Rx ring assignment, SRS, polling and B/W enforcement
2684 * ----------------------------------------------------
2685 *
2686 * We try to use H/W classification on the NIC and assign traffic
2687 * for a MAC address to a particular Rx ring. There is a 1-1 mapping
2688 * between a SRS and a Rx ring. The SRS (short for soft ring set)
2689 * dynamically switches the underlying Rx ring between interrupt
2690 * and polling mode and enforces any specified B/W control.
2691 *
2692 * There is always an SRS created and tied to each H/W and S/W rule.
2693 * Whenever we create a H/W rule, we always add the same rule to the
2694 * S/W classifier and tie an SRS to it.
2695 *
2696 * In case a B/W control is specified, it is broken into bytes
2697 * per tick and as soon as the quota for a tick is exhausted,
2698 * the underlying Rx ring is forced into poll mode for the
2699 * remainder of the tick. The SRS poll thread only polls for bytes
2700 * that are allowed to come into the SRS. We typically let 4x the
2701 * configured B/W worth of packets come into the SRS (to prevent
2702 * unnecessary drops due to bursts) but only process the specified amount.
2703 *
2704 * A Link (primary NIC, VNIC, VLAN or aggr) can have 1 or more
2705 * Rx rings (and corresponding SRSs) assigned to it. The SRS
2706 * in turn can have softrings to do protocol level fanout or
2707 * softrings to do S/W based fanout or both. In case the NIC
2708 * has no Rx rings, we do S/W classification to the respective SRS.
2709 * The S/W classification rule is always set up and ready. This
2710 * allows the MAC layer to reassign Rx rings whenever needed,
2711 * while packets still continue to flow via the default path and
2712 * get S/W classified to the correct SRS.
2713 *
2714 * In other cases where a NIC or VNIC is plumbed, our goal is to use
2715 * the H/W classifier and get two Rx rings assigned for the Link. One
2716 * for TCP and one for UDP|SCTP. The respective SRSs still do the
2717 * polling on the Rx ring. For a Link that is plumbed for IP, there
2718 * is a TCP squeue which also does polling and can control the
2719 * Rx ring directly (where the SRS is just a pass through). For
2720 * the following cases, the SRS does the polling underneath.
2721 * 1) non IP based Links (Links which are not plumbed via ifconfig)
2722 *    and paths which have no IP squeues (UDP & SCTP)
2723 * 2) If B/W control is specified on the Link
2724 * 3) If S/W fanout is specified
2725 *
2726 * Note1: As of the current implementation, we try to assign only 1 Rx
2727 * ring per Link, and more than 1 Rx ring for the primary Link for
2728 * H/W based fanout. We always create the following softrings per SRS:
2729 * 1) TCP softring which is polled by the TCP squeue where possible
2730 *    (and also bypasses DLS)
2731 * 2) UDP/SCTP based softring which bypasses DLS
2732 * 3) OTH softring which goes via DLS (currently deals with IPv6
2733 *    and non-TCP/UDP/SCTP IPv4 packets).
2734 *
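 * Schematically, for a single Rx ring with protocol fanout (a sketch
 * of the common case, not every variant):
 *
 *                          +--> TCP softring --> squeue (polls, DLS bypass)
 *   H/W Rx ring <--> SRS --+--> UDP softring --> DLS bypass
 *     (intr/poll)          +--> OTH softring --> DLS
 *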
2735 * It is necessary to create 3 softrings since the SRS has to poll
2736 * the single Rx ring underneath and enforce any link level B/W
2737 * control (we can't switch the Rx ring into poll mode just based
2738 * on the TCP squeue if the same Rx ring is carrying UDP and other
2739 * traffic as well). Once polling is done and any Link level B/W
2740 * control is applied, the packets are assigned to the respective
2741 * softring based on protocol. Since TCP has an IP based squeue
2742 * which benefits from polling, we separate TCP packets into
2743 * their own softring which can be polled by the IP squeue. We
2744 * separate out UDP/SCTP into the UDP softring since it can bypass
2745 * the DLS layer, which has heavy performance advantages, and we
2746 * need a softring (OTH) for the rest.
2747 *
2748 * ToDo: The 3 softrings for protocol are needed only till we can
2749 * get rid of DLS from the datapath, make IPv4 and IPv6 paths
2750 * symmetric (deal with mac_header_info for v6 and polling for
2751 * IPv4 TCP - ip_accept_tcp is IPv4 specific although squeues
2752 * are generic), and bring SAP based classification to MAC layer
2753 *
2754 * H/W and S/W based fanout and multiple Rx rings per Link
2755 * -------------------------------------------------------
2756 *
2757 * In case fanout is requested (or determined automatically based
2758 * on Link speed and processor speed), we try to assign multiple
2759 * Rx rings per Link with their respective SRSs. In this case
2760 * the NIC should be capable of fanning out incoming packets between
2761 * the assigned Rx rings (H/W based fanout). All the SRSs
2762 * individually switch their Rx ring between interrupt and polling
2763 * mode but share a common B/W control counter in case Link
2764 * level B/W is specified.
2765 *
2766 * If S/W based fanout is specified in lieu of H/W based fanout,
2767 * the Link SRS creates the specified number of softrings for
2768 * each protocol (TCP, UDP, OTH). Incoming packets are fanned
2769 * out to the correct softring based on their protocol and
2770 * protocol specific hash function.
2771 *
2772 * Primary and non primary MAC clients
2773 * -----------------------------------
2774 *
2775 * The NICs, VNICs, Vlans, and Aggrs are typically termed as Links
2776 * and are a Layer 2 construct.
2777 *
2778 * Primary NIC:
2779 *	The Link that owns the primary MAC address and typically
2780 *	is used as the data NIC in non virtualized cases. As such,
2781 *	H/W resources are preferentially given to the primary NIC. As
2782 *	far as the code is concerned, there is no difference between
2783 *	the primary NIC and VNICs; they are all treated as Links.
2784 *	At the very first call to mac_unicast_add() we program the S/W
2785 *	classifier for the primary MAC address, get a soft ring set
2786 *	(and soft rings based on 'ip_soft_ring_cnt')
2787 *	and an Rx ring assigned and enabled for polling.
2788 *	When IP gets plumbed and negotiates polling, we can
2789 *	let the squeue do the polling on the TCP softring.
2790 *
2791 * VNICs:
2792 *	Same as any other Link. As long as the H/W resource assignments
2793 *	are equal, the data path and setup for all Links is same.
2794 *
2795 * Flows:
2796 *	Can be configured on Links. They have their own SRS and the
2797 *	S/W classifier is programmed appropriately based on the flow.
2798 *	The flows typically deal with layer 3 and above and
2799 *	create a soft ring set specific to the flow. The receive
2800 *	side function is switched from mac_rx_srs_process to
2801 *	mac_rx_srs_subflow_process which first tries to assign the
2802 *	packet to the appropriate flow SRS and, failing that, assigns it
2803 *	to the link SRS. This allows us to avoid the layered approach
2804 *	which gets complex.
2805 *
2806 * By the time mac_datapath_setup() completes, we already have the
2807 * soft ring sets, Rx rings, soft rings, etc. figured out and both H/W
2808 * and S/W classifiers programmed. IP is not plumbed yet (and might
2809 * never be for Virtual Machines guest OS path). When IP is plumbed
2810 * (for both NIC and VNIC), we do a capability negotiation for polling
2811 * and upcall functions etc.
2812 *
2813 * Rx Ring Assignment NOTES
2814 * -------------------------
2815 *
2816 * For NICs which have only 1 Rx ring (we treat NICs with no Rx rings
2817 * as NICs with a single default ring), we assign the only ring to the
2818 * primary Link. The primary Link SRS can do polling on it as long as
2819 * it is the only link in use, and we compare the MAC address for unicast
2820 * packets before accepting an incoming packet (there is no need for S/W
2821 * classification in this case). We disable polling on the only ring the
2822 * moment a 2nd link gets created (the polling remains enabled even though
2823 * broadcast and multicast flows are created).
2824 *
2825 * If the NIC has more than 1 Rx ring, we assign the default ring (the
2826 * 1st ring) to deal with broadcast, multicast and traffic for other
2827 * NICs which needs S/W classification. We assign the primary MAC
2828 * address to another ring by specifying a classification rule for
2829 * the primary unicast MAC address to the selected ring. The primary Link
2830 * (and its SRS) can continue to poll the assigned Rx ring at all times
2831 * independently.
2832 *
2833 * Note: In the future, if no fanout is specified, we try to assign 2 Rx
2834 * rings for the primary Link with the primary MAC address + TCP going
2835 * to one ring and primary MAC address + UDP|SCTP going to other ring.
2836 * Any remaining traffic for primary MAC address can go to the default
2837 * Rx ring and get S/W classified. This way the respective SRSs don't
2838 * need to do proto fanout and don't need to have softrings at all and
2839 * can poll their respective Rx rings.
2840 *
2841 * As an optimization, when a new NIC or VNIC is created, we can get
2842 * only one Rx ring and make it a TCP specific Rx ring and use the
2843 * H/W default Rx ring for the rest (this Rx ring is never polled).
2844 *
2845 * For clients that don't have a MAC address, but want to receive and
2846 * transmit packets (e.g., bpf, gvrp, etc.), we need to set up the datapath.
2847 * For such clients (identified by the MCIS_NO_UNICAST_ADDR flag) we
2848 * always give the default group and use software classification (i.e.
2849 * even if this is the only client in the default group, we will
2850 * leave group as shared).
2851 */
2852int
2853mac_datapath_setup(mac_client_impl_t *mcip, flow_entry_t *flent,
2854    uint32_t link_type)
2855{
2856	mac_impl_t		*mip = mcip->mci_mip;
2857	mac_group_t		*rgroup = NULL;
2858	mac_group_t		*tgroup = NULL;
2859	mac_group_t		*default_rgroup;
2860	mac_group_t		*default_tgroup;
2861	int			err;
2862	uint8_t 		*mac_addr;
2863	mac_group_state_t	next_state;
2864	mac_client_impl_t	*group_only_mcip;
2865	mac_resource_props_t	*mrp = MCIP_RESOURCE_PROPS(mcip);
2866	mac_resource_props_t	*emrp = MCIP_EFFECTIVE_PROPS(mcip);
2867	boolean_t		rxhw;
2868	boolean_t		txhw;
2869	boolean_t		use_default = B_FALSE;
2870	cpupart_t		*cpupart;
2871	boolean_t		no_unicast;
2872	boolean_t		isprimary = flent->fe_type & FLOW_PRIMARY_MAC;
2873	mac_client_impl_t	*reloc_pmcip = NULL;
2874
2875	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
2876
2877	switch (link_type) {
2878	case SRST_FLOW:
2879		mac_srs_group_setup(mcip, flent, link_type);
2880		return (0);
2881
2882	case SRST_LINK:
2883		no_unicast = mcip->mci_state_flags & MCIS_NO_UNICAST_ADDR;
2884		mac_addr = flent->fe_flow_desc.fd_dst_mac;
2885
2886		/* Default RX group */
2887		default_rgroup = MAC_DEFAULT_RX_GROUP(mip);
2888
2889		/* Default TX group */
2890		default_tgroup = MAC_DEFAULT_TX_GROUP(mip);
2891
2892		if (no_unicast) {
2893			rgroup = default_rgroup;
2894			tgroup = default_tgroup;
2895			goto grp_found;
2896		}
2897		rxhw = (mrp->mrp_mask & MRP_RX_RINGS) &&
2898		    (mrp->mrp_nrxrings > 0 ||
2899		    (mrp->mrp_mask & MRP_RXRINGS_UNSPEC));
2900		txhw = (mrp->mrp_mask & MRP_TX_RINGS) &&
2901		    (mrp->mrp_ntxrings > 0 ||
2902		    (mrp->mrp_mask & MRP_TXRINGS_UNSPEC));
2903
2904		/*
2905		 * By default we have given the primary all the rings
2906		 * i.e. the default group. Let's see if the primary
2907		 * needs to be relocated so that the addition of this
2908		 * client doesn't impact the primary's performance,
2909		 * i.e. if the primary is in the default group and
2910		 * we add this client, the primary will lose polling.
2911		 * We do this only for NICs supporting dynamic ring
2912		 * grouping and only when this is the first client
2913		 * after the primary (i.e. nactiveclients is 2)
2914		 */
2915		if (!isprimary && mip->mi_nactiveclients == 2 &&
2916		    (group_only_mcip = mac_primary_client_handle(mip)) !=
2917		    NULL && mip->mi_rx_group_type == MAC_GROUP_TYPE_DYNAMIC) {
2918			reloc_pmcip = mac_check_primary_relocation(
2919			    group_only_mcip, rxhw);
2920		}
2921		/*
2922		 * Check to see if we can get an exclusive group for
2923		 * this mac address or if there already exists a
2924		 * group that has this mac address (case of VLANs).
2925		 * If no groups are available, use the default group.
2926		 */
2927		rgroup = mac_reserve_rx_group(mcip, mac_addr, B_FALSE);
2928		if (rgroup == NULL && rxhw) {
2929			err = ENOSPC;
2930			goto setup_failed;
2931		} else if (rgroup == NULL) {
2932			rgroup = default_rgroup;
2933		}
2934		/*
2935		 * Check to see if we can get an exclusive group for
2936		 * this mac client. If no groups are available, use
2937		 * the default group.
2938		 */
2939		tgroup = mac_reserve_tx_group(mcip, B_FALSE);
2940		if (tgroup == NULL && txhw) {
2941			if (rgroup != NULL && rgroup != default_rgroup)
2942				mac_release_rx_group(mcip, rgroup);
2943			err = ENOSPC;
2944			goto setup_failed;
2945		} else if (tgroup == NULL) {
2946			tgroup = default_tgroup;
2947		}
2948
2949		/*
2950		 * Some NICs don't support any Rx rings, so there may not
2951		 * even be a default group.
2952		 */
2953	grp_found:
2954		if (rgroup != NULL) {
2955			if (rgroup != default_rgroup &&
2956			    MAC_GROUP_NO_CLIENT(rgroup) &&
2957			    (rxhw || mcip->mci_share != NULL)) {
2958				MAC_RX_GRP_RESERVED(mip);
2959				if (mip->mi_rx_group_type ==
2960				    MAC_GROUP_TYPE_DYNAMIC) {
2961					MAC_RX_RING_RESERVED(mip,
2962					    rgroup->mrg_cur_count);
2963				}
2964			}
2965			flent->fe_rx_ring_group = rgroup;
2966			/*
2967			 * Add the client to the group. This could cause
2968			 * either this group to move to the shared state or
2969			 * cause the default group to move to the shared state.
2970			 * The actions on this group are done here, while the
2971			 * actions on the default group are postponed to
2972			 * the end of this function.
2973			 */
2974			mac_group_add_client(rgroup, mcip);
2975			next_state = mac_group_next_state(rgroup,
2976			    &group_only_mcip, default_rgroup, B_TRUE);
2977			mac_set_group_state(rgroup, next_state);
2978		}
2979
2980		if (tgroup != NULL) {
2981			if (tgroup != default_tgroup &&
2982			    MAC_GROUP_NO_CLIENT(tgroup) &&
2983			    (txhw || mcip->mci_share != NULL)) {
2984				MAC_TX_GRP_RESERVED(mip);
2985				if (mip->mi_tx_group_type ==
2986				    MAC_GROUP_TYPE_DYNAMIC) {
2987					MAC_TX_RING_RESERVED(mip,
2988					    tgroup->mrg_cur_count);
2989				}
2990			}
2991			flent->fe_tx_ring_group = tgroup;
2992			mac_group_add_client(tgroup, mcip);
2993			next_state = mac_group_next_state(tgroup,
2994			    &group_only_mcip, default_tgroup, B_FALSE);
2995			tgroup->mrg_state = next_state;
2996		}
2997		/*
2998		 * Set up the Rx and Tx SRSes. If we got a pristine group
2999		 * exclusively above, mac_srs_group_setup would simply create
3000		 * the required SRSes. If we ended up sharing a previously
3001		 * reserved group, mac_srs_group_setup would also dismantle the
3002		 * SRSes of the previously exclusive group.
3003		 */
3004		mac_srs_group_setup(mcip, flent, link_type);
3005
3006		/* We are setting up minimal datapath only */
3007		if (no_unicast)
3008			break;
3009		/* Program the S/W Classifier */
3010		if ((err = mac_flow_add(mip->mi_flow_tab, flent)) != 0)
3011			goto setup_failed;
3012
3013		/* Program the H/W Classifier */
3014		if ((err = mac_add_macaddr(mip, rgroup, mac_addr,
3015		    (mcip->mci_state_flags & MCIS_UNICAST_HW) != 0)) != 0)
3016			goto setup_failed;
3017		mcip->mci_unicast = mac_find_macaddr(mip, mac_addr);
3018		ASSERT(mcip->mci_unicast != NULL);
3019		/* Initialize the v6 local addr used by link protection */
3020		mac_protect_update_v6_local_addr(mcip);
3021		break;
3022
3023	default:
3024		ASSERT(B_FALSE);
3025		break;
3026	}
3027
3028	/*
3029	 * All broadcast and multicast traffic is received only on the default
3030	 * group. If we have set up the datapath for a non-default group above
3031	 * then move the default group to shared state to allow distribution of
3032	 * incoming broadcast traffic to the other groups and dismantle the
3033	 * SRSes over the default group.
3034	 */
3035	if (rgroup != NULL) {
3036		if (rgroup != default_rgroup) {
3037			if (default_rgroup->mrg_state ==
3038			    MAC_GROUP_STATE_RESERVED) {
3039				group_only_mcip = MAC_GROUP_ONLY_CLIENT(
3040				    default_rgroup);
3041				ASSERT(group_only_mcip != NULL &&
3042				    mip->mi_nactiveclients > 1);
3043
3044				mac_set_group_state(default_rgroup,
3045				    MAC_GROUP_STATE_SHARED);
3046				mac_rx_srs_group_setup(group_only_mcip,
3047				    group_only_mcip->mci_flent, SRST_LINK);
3048				pool_lock();
3049				cpupart = mac_pset_find(mrp, &use_default);
3050				mac_fanout_setup(group_only_mcip,
3051				    group_only_mcip->mci_flent,
3052				    MCIP_RESOURCE_PROPS(group_only_mcip),
3053				    mac_rx_deliver, group_only_mcip, NULL,
3054				    cpupart);
3055				mac_set_pool_effective(use_default, cpupart,
3056				    mrp, emrp);
3057				pool_unlock();
3058			}
3059			ASSERT(default_rgroup->mrg_state ==
3060			    MAC_GROUP_STATE_SHARED);
3061		}
3062		/*
3063		 * If we get an exclusive group for a VLAN MAC client, we
3064		 * need to take the s/w path to make the additional check for
3065		 * the vid. Disable polling and set it to s/w classification.
3066		 * Similarly for clients that don't have a unicast address.
3067		 */
3068		if (rgroup->mrg_state == MAC_GROUP_STATE_RESERVED &&
3069		    (i_mac_flow_vid(flent) != VLAN_ID_NONE || no_unicast)) {
3070			mac_rx_switch_grp_to_sw(rgroup);
3071		}
3072	}
3073	mac_set_rings_effective(mcip);
3074	return (0);
3075
3076setup_failed:
3077	/* Switch the primary back to default group */
3078	if (reloc_pmcip != NULL) {
3079		(void) mac_rx_switch_group(reloc_pmcip,
3080		    reloc_pmcip->mci_flent->fe_rx_ring_group, default_rgroup);
3081	}
3082	mac_datapath_teardown(mcip, flent, link_type);
3083	return (err);
3084}
3085
3086void
3087mac_datapath_teardown(mac_client_impl_t *mcip, flow_entry_t *flent,
3088    uint32_t link_type)
3089{
3090	mac_impl_t		*mip = mcip->mci_mip;
3091	mac_group_t		*group = NULL;
3092	mac_client_impl_t	*grp_only_mcip;
3093	flow_entry_t		*group_only_flent;
3094	mac_group_t		*default_group;
3095	boolean_t		check_default_group = B_FALSE;
3096	mac_group_state_t	next_state;
3097	mac_resource_props_t	*mrp = MCIP_RESOURCE_PROPS(mcip);
3098
3099	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
3100
3101	switch (link_type) {
3102	case SRST_FLOW:
3103		mac_rx_srs_group_teardown(flent, B_FALSE);
3104		mac_tx_srs_group_teardown(mcip, flent, SRST_FLOW);
3105		return;
3106
3107	case SRST_LINK:
3108		/* Stop sending packets */
3109		mac_tx_client_block(mcip);
3110
3111		/* Stop the packets coming from the H/W */
3112		if (mcip->mci_unicast != NULL) {
3113			int err;
3114			err = mac_remove_macaddr(mcip->mci_unicast);
3115			if (err != 0) {
3116				cmn_err(CE_WARN, "%s: failed to remove a MAC"
3117				    " address because of error 0x%x",
3118				    mip->mi_name, err);
3119			}
3120			mcip->mci_unicast = NULL;
3121		}
3122
3123		/* Stop the packets coming from the S/W classifier */
3124		mac_flow_remove(mip->mi_flow_tab, flent, B_FALSE);
3125		mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
3126
3127		/* Now quiesce and destroy all SRS and soft rings */
3128		mac_rx_srs_group_teardown(flent, B_FALSE);
3129		mac_tx_srs_group_teardown(mcip, flent, SRST_LINK);
3130
3131		ASSERT((mcip->mci_flent == flent) &&
3132		    (flent->fe_next == NULL));
3133
3134		/*
3135		 * Release our hold on the group as well. We need
3136		 * to check if the shared group has only one client
3137		 * left who can use it exclusively. Also, if we
3138		 * were the last client, release the group.
3139		 */
3140		group = flent->fe_rx_ring_group;
3141		default_group = MAC_DEFAULT_RX_GROUP(mip);
3142		if (group != NULL) {
3143			mac_group_remove_client(group, mcip);
3144			next_state = mac_group_next_state(group,
3145			    &grp_only_mcip, default_group, B_TRUE);
3146			if (next_state == MAC_GROUP_STATE_RESERVED) {
3147				/*
3148				 * Only one client left on this RX group.
3149				 */
3150				ASSERT(grp_only_mcip != NULL);
3151				mac_set_group_state(group,
3152				    MAC_GROUP_STATE_RESERVED);
3153				group_only_flent = grp_only_mcip->mci_flent;
3154
3155				/*
3156				 * The only remaining client has exclusive
3157				 * access on the group. Allow it to
3158				 * dynamically poll the H/W rings etc.
3159				 */
3160				mac_rx_srs_group_setup(grp_only_mcip,
3161				    group_only_flent, SRST_LINK);
3162				mac_fanout_setup(grp_only_mcip,
3163				    group_only_flent,
3164				    MCIP_RESOURCE_PROPS(grp_only_mcip),
3165				    mac_rx_deliver, grp_only_mcip, NULL, NULL);
3166				mac_rx_group_unmark(group, MR_INCIPIENT);
3167				mac_set_rings_effective(grp_only_mcip);
3168			} else if (next_state == MAC_GROUP_STATE_REGISTERED) {
3169				/*
3170				 * This is a non-default group being freed up.
3171				 * We need to reevaluate the default group
3172				 * to see if the primary client can get
3173				 * exclusive access to the default group.
3174				 */
3175				ASSERT(group != MAC_DEFAULT_RX_GROUP(mip));
3176				if (mrp->mrp_mask & MRP_RX_RINGS) {
3177					MAC_RX_GRP_RELEASED(mip);
3178					if (mip->mi_rx_group_type ==
3179					    MAC_GROUP_TYPE_DYNAMIC) {
3180						MAC_RX_RING_RELEASED(mip,
3181						    group->mrg_cur_count);
3182					}
3183				}
3184				mac_release_rx_group(mcip, group);
3185				mac_set_group_state(group,
3186				    MAC_GROUP_STATE_REGISTERED);
3187				check_default_group = B_TRUE;
3188			} else {
3189				ASSERT(next_state == MAC_GROUP_STATE_SHARED);
3190				mac_set_group_state(group,
3191				    MAC_GROUP_STATE_SHARED);
3192				mac_rx_group_unmark(group, MR_CONDEMNED);
3193			}
3194			flent->fe_rx_ring_group = NULL;
3195		}
3196		/*
3197		 * Remove the client from the TX group. Additionally, if
3198		 * this is a non-default group, then we also need to release
3199		 * the group.
3200		 */
3201		group = flent->fe_tx_ring_group;
3202		default_group = MAC_DEFAULT_TX_GROUP(mip);
3203		if (group != NULL) {
3204			mac_group_remove_client(group, mcip);
3205			next_state = mac_group_next_state(group,
3206			    &grp_only_mcip, default_group, B_FALSE);
3207			if (next_state == MAC_GROUP_STATE_REGISTERED) {
3208				if (group != default_group) {
3209					if (mrp->mrp_mask & MRP_TX_RINGS) {
3210						MAC_TX_GRP_RELEASED(mip);
3211						if (mip->mi_tx_group_type ==
3212						    MAC_GROUP_TYPE_DYNAMIC) {
3213							MAC_TX_RING_RELEASED(
3214							    mip, group->
3215							    mrg_cur_count);
3216						}
3217					}
3218					mac_release_tx_group(mcip, group);
3219					/*
3220					 * If the default group is reserved,
3221					 * then we need to set the effective
3222					 * rings as we would have given
3223					 * back some rings when the group
3224					 * was released
3225					 */
3226					if (mip->mi_tx_group_type ==
3227					    MAC_GROUP_TYPE_DYNAMIC &&
3228					    default_group->mrg_state ==
3229					    MAC_GROUP_STATE_RESERVED) {
3230						grp_only_mcip =
3231						    MAC_GROUP_ONLY_CLIENT
3232						    (default_group);
3233						mac_set_rings_effective(
3234						    grp_only_mcip);
3235					}
3236				} else {
3237					mac_ring_t	*ring;
3238					int		cnt;
3239					int		ringcnt;
3240
3241					/*
3242					 * Stop all the rings except the
3243					 * default ring.
3244					 */
3245					ringcnt = group->mrg_cur_count;
3246					ring = group->mrg_rings;
3247					for (cnt = 0; cnt < ringcnt; cnt++) {
3248						if (ring->mr_state ==
3249						    MR_INUSE && ring !=
3250						    (mac_ring_t *)
3251						    mip->mi_default_tx_ring) {
3252							mac_stop_ring(ring);
3253							ring->mr_flag = 0;
3254						}
3255						ring = ring->mr_next;
3256					}
3257				}
3258			} else if (next_state == MAC_GROUP_STATE_RESERVED) {
3259				mac_set_rings_effective(grp_only_mcip);
3260			}
3261			flent->fe_tx_ring_group = NULL;
3262			group->mrg_state = next_state;
3263		}
3264		break;
3265	default:
3266		ASSERT(B_FALSE);
3267		break;
3268	}
3269
3270	/*
3271	 * The mac client using the default group gets exclusive access to the
3272	 * default group if and only if it is the sole client on the entire
3273	 * mip. If so set the group state to reserved, and set up the SRSes
3274	 * over the default group.
3275	 */
3276	if (check_default_group) {
3277		default_group = MAC_DEFAULT_RX_GROUP(mip);
3278		ASSERT(default_group->mrg_state == MAC_GROUP_STATE_SHARED);
3279		next_state = mac_group_next_state(default_group,
3280		    &grp_only_mcip, default_group, B_TRUE);
3281		if (next_state == MAC_GROUP_STATE_RESERVED) {
3282			ASSERT(grp_only_mcip != NULL &&
3283			    mip->mi_nactiveclients == 1);
3284			mac_set_group_state(default_group,
3285			    MAC_GROUP_STATE_RESERVED);
3286			mac_rx_srs_group_setup(grp_only_mcip,
3287			    grp_only_mcip->mci_flent, SRST_LINK);
3288			mac_fanout_setup(grp_only_mcip,
3289			    grp_only_mcip->mci_flent,
3290			    MCIP_RESOURCE_PROPS(grp_only_mcip), mac_rx_deliver,
3291			    grp_only_mcip, NULL, NULL);
3292			mac_rx_group_unmark(default_group, MR_INCIPIENT);
3293			mac_set_rings_effective(grp_only_mcip);
3294		}
3295	}
3296
3297	/*
3298	 * If the primary is the only one left and the MAC supports
3299	 * dynamic grouping, we need to see if the primary needs to
3300	 * be moved to the default group so that it can use all the
3301	 * H/W rings.
3302	 */
3303	if (!(flent->fe_type & FLOW_PRIMARY_MAC) &&
3304	    mip->mi_nactiveclients == 1 &&
3305	    mip->mi_rx_group_type == MAC_GROUP_TYPE_DYNAMIC) {
3306		default_group = MAC_DEFAULT_RX_GROUP(mip);
3307		grp_only_mcip = mac_primary_client_handle(mip);
3308		if (grp_only_mcip == NULL)
3309			return;
3310		group_only_flent = grp_only_mcip->mci_flent;
3311		mrp = MCIP_RESOURCE_PROPS(grp_only_mcip);
3312		/*
3313		 * If the primary has an explicit Rx ring property set, leave
3314		 * it alone.
3315		 */
3316		if (mrp->mrp_mask & MRP_RX_RINGS)
3317			return;
3318		/*
3319		 * Switch the primary to the default group.
3320		 */
3321		(void) mac_rx_switch_group(grp_only_mcip,
3322		    group_only_flent->fe_rx_ring_group, default_group);
3323	}
3324}
3325
3326/* DATAPATH TEAR DOWN ROUTINES (SRS and FANOUT teardown) */
3327
3328static void
3329mac_srs_fanout_list_free(mac_soft_ring_set_t *mac_srs)
3330{
3331	if (mac_srs->srs_type & SRST_TX) {
3332		mac_srs_tx_t *tx;
3333
3334		ASSERT(mac_srs->srs_tcp_soft_rings == NULL);
3335		ASSERT(mac_srs->srs_udp_soft_rings == NULL);
3336		ASSERT(mac_srs->srs_oth_soft_rings == NULL);
3337		ASSERT(mac_srs->srs_tx_soft_rings != NULL);
3338		kmem_free(mac_srs->srs_tx_soft_rings,
3339		    sizeof (mac_soft_ring_t *) * MAX_RINGS_PER_GROUP);
3340		mac_srs->srs_tx_soft_rings = NULL;
3341		tx = &mac_srs->srs_tx;
3342		if (tx->st_soft_rings != NULL) {
3343			kmem_free(tx->st_soft_rings,
3344			    sizeof (mac_soft_ring_t *) * MAX_RINGS_PER_GROUP);
3345		}
3346	} else {
3347		ASSERT(mac_srs->srs_tx_soft_rings == NULL);
3348		ASSERT(mac_srs->srs_tcp_soft_rings != NULL);
3349		kmem_free(mac_srs->srs_tcp_soft_rings,
3350		    sizeof (mac_soft_ring_t *) * MAX_SR_FANOUT);
3351		mac_srs->srs_tcp_soft_rings = NULL;
3352		ASSERT(mac_srs->srs_udp_soft_rings != NULL);
3353		kmem_free(mac_srs->srs_udp_soft_rings,
3354		    sizeof (mac_soft_ring_t *) * MAX_SR_FANOUT);
3355		mac_srs->srs_udp_soft_rings = NULL;
3356		ASSERT(mac_srs->srs_oth_soft_rings != NULL);
3357		kmem_free(mac_srs->srs_oth_soft_rings,
3358		    sizeof (mac_soft_ring_t *) * MAX_SR_FANOUT);
3359		mac_srs->srs_oth_soft_rings = NULL;
3360	}
3361}
3362
3363/*
3364 * An RX SRS is attached to at most one mac_ring.
3365 * A TX SRS has no rings.
3366 */
3367static void
3368mac_srs_ring_free(mac_soft_ring_set_t *mac_srs)
3369{
3370	mac_client_impl_t	*mcip;
3371	mac_ring_t		*ring;
3372	flow_entry_t		*flent;
3373
3374	ring = mac_srs->srs_ring;
3375	if (mac_srs->srs_type & SRST_TX) {
3376		ASSERT(ring == NULL);
3377		return;
3378	}
3379
3380	if (ring == NULL)
3381		return;
3382
3383	/*
3384	 * Broadcast flows don't have a client impl association, but they
3385	 * use only soft rings, so they never reach here with a ring.
3386	 */
3387	flent = mac_srs->srs_flent;
3388	mcip = flent->fe_mcip;
3389	ASSERT(mcip != NULL);
3390
3391	ring->mr_classify_type = MAC_NO_CLASSIFIER;
3392	ring->mr_srs = NULL;
3393}
3394
3395/*
3396 * Physical unlink and free of the data structures happen below. This is
3397 * driven from mac_flow_destroy(), on the last refrele of a flow.
3398 *
3399 * Assumes an Rx SRS is mapped 1-1 with a ring.
3400 */
3401void
3402mac_srs_free(mac_soft_ring_set_t *mac_srs)
3403{
3404	ASSERT(mac_srs->srs_mcip == NULL ||
3405	    MAC_PERIM_HELD((mac_handle_t)mac_srs->srs_mcip->mci_mip));
3406	ASSERT((mac_srs->srs_state & (SRS_CONDEMNED | SRS_CONDEMNED_DONE |
3407	    SRS_PROC | SRS_PROC_FAST)) == (SRS_CONDEMNED | SRS_CONDEMNED_DONE));
3408
3409	mac_pkt_drop(NULL, NULL, mac_srs->srs_first, B_FALSE);
3410	mac_srs_ring_free(mac_srs);
3411	mac_srs_soft_rings_free(mac_srs);
3412	mac_srs_fanout_list_free(mac_srs);
3413
3414	mac_srs->srs_bw = NULL;
3415	mac_srs_stat_delete(mac_srs);
3416	kmem_cache_free(mac_srs_cache, mac_srs);
3417}
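
/*
 * Illustrative teardown sequence leading to mac_srs_free() (a sketch of
 * the handshake implied by the asserts above, not a new code path):
 *
 *	mac_srs_signal(srs, SRS_CONDEMNED);
 *		unbinds the SRS and soft ring threads, removes the SRS
 *		from the global list, and wakes the worker thread
 *	(the worker then quiesces everything via mac_srs_worker_quiesce()
 *	below, which marks the SRS with SRS_CONDEMNED_DONE)
 *	mac_srs_free(srs);
 */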
3418
3419static void
3420mac_srs_soft_rings_quiesce(mac_soft_ring_set_t *mac_srs, uint_t s_ring_flag)
3421{
3422	mac_soft_ring_t	*softring;
3423
3424	ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
3425
3426	mac_srs_soft_rings_signal(mac_srs, s_ring_flag);
3427	if (s_ring_flag == S_RING_CONDEMNED) {
3428		while (mac_srs->srs_soft_ring_condemned_count !=
3429		    mac_srs->srs_soft_ring_count)
3430			cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock);
3431	} else {
3432		while (mac_srs->srs_soft_ring_quiesced_count !=
3433		    mac_srs->srs_soft_ring_count)
3434			cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock);
3435	}
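	/*
	 * Drop srs_lock across the untimeout() calls below. untimeout()
	 * may wait for a pending timer handler to complete, and the SRS
	 * and soft ring timer handlers take these locks themselves, so
	 * holding srs_lock here could deadlock (this is the presumed
	 * rationale for releasing and reacquiring the lock).
	 */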
3436	mutex_exit(&mac_srs->srs_lock);
3437
3438	for (softring = mac_srs->srs_soft_ring_head; softring != NULL;
3439	    softring = softring->s_ring_next)
3440		(void) untimeout(softring->s_ring_tid);
3441
3442	(void) untimeout(mac_srs->srs_tid);
3443
3444	mutex_enter(&mac_srs->srs_lock);
3445}
3446
3447/*
3448 * The block comment above mac_rx_classify_flow_state_change explains the
3449 * background. At this point upcalls from the driver (both hardware classified
3450 * and software classified) have been cut off. We now need to quiesce the
3451 * SRS worker, poll, and softring threads. The SRS worker thread serves as
3452 * the master controller. The steps involved are described in the function below.
3453 */
3454void
3455mac_srs_worker_quiesce(mac_soft_ring_set_t *mac_srs)
3456{
3457	uint_t			s_ring_flag;
3458	uint_t			srs_poll_wait_flag;
3459
3460	ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
3461	ASSERT(mac_srs->srs_state & (SRS_CONDEMNED | SRS_QUIESCE));
3462
3463	if (mac_srs->srs_state & SRS_CONDEMNED) {
3464		s_ring_flag = S_RING_CONDEMNED;
3465		srs_poll_wait_flag = SRS_POLL_THR_EXITED;
3466	} else {
3467		s_ring_flag = S_RING_QUIESCE;
3468		srs_poll_wait_flag = SRS_POLL_THR_QUIESCED;
3469	}
3470
3471	/*
3472	 * In the case of an Rx SRS, wait till the poll thread is done.
3473	 */
3474	if ((mac_srs->srs_type & SRST_TX) == 0 &&
3475	    mac_srs->srs_poll_thr != NULL) {
3476		while (!(mac_srs->srs_state & srs_poll_wait_flag))
3477			cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock);
3478
3479		/*
3480		 * Turn off polling as part of the quiesce operation.
3481		 */
3482		MAC_SRS_POLLING_OFF(mac_srs);
3483		mac_srs->srs_state &= ~(SRS_POLLING | SRS_GET_PKTS);
3484	}
3485
3486	/*
3487	 * Then signal the soft ring worker threads to quiesce or quit
3488	 * as needed and then wait till that happens.
3489	 */
3490	mac_srs_soft_rings_quiesce(mac_srs, s_ring_flag);
3491
3492	if (mac_srs->srs_state & SRS_CONDEMNED)
3493		mac_srs->srs_state |= (SRS_QUIESCE_DONE | SRS_CONDEMNED_DONE);
3494	else
3495		mac_srs->srs_state |= SRS_QUIESCE_DONE;
3496	cv_signal(&mac_srs->srs_quiesce_done_cv);
3497}
3498
3499/*
3500 * Signal an SRS to start a temporary quiesce, or permanent removal, or restart
3501 * a quiesced SRS by setting the appropriate flags and signaling the SRS worker
3502 * or poll thread. This function is internal to the quiescing logic and is
3503 * called from the higher-level SRS quiesce, flow quiesce, or client
3504 * quiesce functions.
3505 */
3506void
3507mac_srs_signal(mac_soft_ring_set_t *mac_srs, uint_t srs_flag)
3508{
3509	mac_ring_t	*ring;
3510
3511	ring = mac_srs->srs_ring;
3512	ASSERT(ring == NULL || ring->mr_refcnt == 0);
3513
3514	if (srs_flag == SRS_CONDEMNED) {
3515		/*
3516		 * The SRS is going away. We need to unbind the SRS and SR
3517		 * threads before removing from the global SRS list. Otherwise
3518		 * there is a small window where the cpu reconfig callbacks
3519		 * may miss the SRS in the list walk and DR could fail since
3520		 * there are still bound threads.
3521		 */
3522		mac_srs_threads_unbind(mac_srs);
3523		mac_srs_remove_glist(mac_srs);
3524	}
3525	/*
3526	 * Wake up the SRS worker and poll threads.
3527	 */
3528	mutex_enter(&mac_srs->srs_lock);
3529	mac_srs->srs_state |= srs_flag;
3530	cv_signal(&mac_srs->srs_async);
3531	cv_signal(&mac_srs->srs_cv);
3532	mutex_exit(&mac_srs->srs_lock);
3533}
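
/*
 * A minimal usage sketch (illustrative only): a higher-level quiesce
 * pairs mac_srs_signal() with a wait on srs_quiesce_done_cv, along the
 * lines of:
 *
 *	mac_srs_signal(srs, SRS_QUIESCE);
 *	mutex_enter(&srs->srs_lock);
 *	while (!(srs->srs_state & SRS_QUIESCE_DONE))
 *		cv_wait(&srs->srs_quiesce_done_cv, &srs->srs_lock);
 *	mutex_exit(&srs->srs_lock);
 *
 * The worker thread does the actual work in mac_srs_worker_quiesce()
 * above, which sets SRS_QUIESCE_DONE and signals srs_quiesce_done_cv.
 */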
3534
3535/*
3536 * On the Rx side, the quiescing is done bottom up. After the Rx upcalls
3537 * from the driver are done, the Rx SRS is quiesced and only then can
3538 * we signal the soft rings. Thus this function can't be called arbitrarily
3539 * without satisfying the prerequisites. On the Tx side, the threads from
3540 * the top need to be quiesced first, then the Tx SRS, and only then can
3541 * we signal the Tx soft rings.
3542 */
3543static void
3544mac_srs_soft_rings_signal(mac_soft_ring_set_t *mac_srs, uint_t sr_flag)
3545{
3546	mac_soft_ring_t		*softring;
3547
3548	for (softring = mac_srs->srs_soft_ring_head; softring != NULL;
3549	    softring = softring->s_ring_next)
3550		mac_soft_ring_signal(softring, sr_flag);
3551}
3552
3553/*
3554 * The block comment above mac_rx_classify_flow_state_change explains the
3555 * background. At this point the SRS is quiesced and we need to restart the
3556 * SRS worker, poll, and softring threads. The SRS worker thread serves as
3557 * the master controller. The steps involved are described in the function below.
3558 */
3559void
3560mac_srs_worker_restart(mac_soft_ring_set_t *mac_srs)
3561{
3562	boolean_t	iam_rx_srs;
3563	mac_soft_ring_t	*softring;
3564
3565	ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
3566	if ((mac_srs->srs_type & SRST_TX) != 0) {
3567		iam_rx_srs = B_FALSE;
3568		ASSERT((mac_srs->srs_state &
3569		    (SRS_POLL_THR_QUIESCED | SRS_QUIESCE_DONE | SRS_QUIESCE)) ==
3570		    (SRS_QUIESCE_DONE | SRS_QUIESCE));
3571	} else {
3572		iam_rx_srs = B_TRUE;
3573		ASSERT((mac_srs->srs_state &
3574		    (SRS_QUIESCE_DONE | SRS_QUIESCE)) ==
3575		    (SRS_QUIESCE_DONE | SRS_QUIESCE));
3576		if (mac_srs->srs_poll_thr != NULL) {
3577			ASSERT((mac_srs->srs_state & SRS_POLL_THR_QUIESCED) ==
3578			    SRS_POLL_THR_QUIESCED);
3579		}
3580	}
3581
3582	/*
3583	 * Signal any quiesced soft ring workers to restart and wait for the
3584	 * soft ring quiesced count to come down to zero.
3585	 */
3586	if (mac_srs->srs_soft_ring_quiesced_count != 0) {
3587		for (softring = mac_srs->srs_soft_ring_head; softring != NULL;
3588		    softring = softring->s_ring_next) {
3589			if (!(softring->s_ring_state & S_RING_QUIESCE))
3590				continue;
3591			mac_soft_ring_signal(softring, S_RING_RESTART);
3592		}
3593		while (mac_srs->srs_soft_ring_quiesced_count != 0)
3594			cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock);
3595	}
3596
3597	mac_srs->srs_state &= ~(SRS_QUIESCE_DONE | SRS_QUIESCE | SRS_RESTART);
3598	if (iam_rx_srs && mac_srs->srs_poll_thr != NULL) {
3599		/*
3600		 * Signal the poll thread and ask it to restart. Wait till it
3601		 * actually restarts and the SRS_POLL_THR_QUIESCED flag gets
3602		 * cleared.
3603		 */
3604		mac_srs->srs_state |= SRS_POLL_THR_RESTART;
3605		cv_signal(&mac_srs->srs_cv);
3606		while (mac_srs->srs_state & SRS_POLL_THR_QUIESCED)
3607			cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock);
3608		ASSERT(!(mac_srs->srs_state & SRS_POLL_THR_RESTART));
3609	}
3610	/* Wake up anyone waiting for the restart to complete */
3611	mac_srs->srs_state |= SRS_RESTART_DONE;
3612	cv_signal(&mac_srs->srs_quiesce_done_cv);
3613}
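
/*
 * The restart side mirrors the quiesce side (again just a sketch): the
 * caller signals SRS_RESTART and may wait for SRS_RESTART_DONE:
 *
 *	mac_srs_signal(srs, SRS_RESTART);
 *	mutex_enter(&srs->srs_lock);
 *	while (!(srs->srs_state & SRS_RESTART_DONE))
 *		cv_wait(&srs->srs_quiesce_done_cv, &srs->srs_lock);
 *	mutex_exit(&srs->srs_lock);
 *
 * mac_srs_worker_restart() above restarts the soft rings and the poll
 * thread before setting SRS_RESTART_DONE.
 */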
3614
3615static void
3616mac_srs_worker_unbind(mac_soft_ring_set_t *mac_srs)
3617{
3618	mutex_enter(&mac_srs->srs_lock);
3619	if (!(mac_srs->srs_state & SRS_WORKER_BOUND)) {
3620		ASSERT(mac_srs->srs_worker_cpuid == -1);
3621		mutex_exit(&mac_srs->srs_lock);
3622		return;
3623	}
3624
3625	mac_srs->srs_worker_cpuid = -1;
3626	mac_srs->srs_state &= ~SRS_WORKER_BOUND;
3627	thread_affinity_clear(mac_srs->srs_worker);
3628	mutex_exit(&mac_srs->srs_lock);
3629}
3630
3631static void
3632mac_srs_poll_unbind(mac_soft_ring_set_t *mac_srs)
3633{
3634	mutex_enter(&mac_srs->srs_lock);
3635	if (mac_srs->srs_poll_thr == NULL ||
3636	    (mac_srs->srs_state & SRS_POLL_BOUND) == 0) {
3637		ASSERT(mac_srs->srs_poll_cpuid == -1);
3638		mutex_exit(&mac_srs->srs_lock);
3639		return;
3640	}
3641
3642	mac_srs->srs_poll_cpuid = -1;
3643	mac_srs->srs_state &= ~SRS_POLL_BOUND;
3644	thread_affinity_clear(mac_srs->srs_poll_thr);
3645	mutex_exit(&mac_srs->srs_lock);
3646}
3647
3648static void
3649mac_srs_threads_unbind(mac_soft_ring_set_t *mac_srs)
3650{
3651	mac_soft_ring_t	*soft_ring;
3652
3653	ASSERT(MAC_PERIM_HELD((mac_handle_t)mac_srs->srs_mcip->mci_mip));
3654
3655	mutex_enter(&cpu_lock);
3656	mac_srs_worker_unbind(mac_srs);
3657	if (!(mac_srs->srs_type & SRST_TX))
3658		mac_srs_poll_unbind(mac_srs);
3659
3660	for (soft_ring = mac_srs->srs_soft_ring_head; soft_ring != NULL;
3661	    soft_ring = soft_ring->s_ring_next) {
3662		mac_soft_ring_unbind(soft_ring);
3663	}
3664	mutex_exit(&cpu_lock);
3665}
3666
3667/*
3668 * When a CPU is going away, unbind all MAC threads which are bound
3669 * to that CPU. The affinity of the thread to the CPU is saved to allow
3670 * the thread to be rebound to the CPU if it comes back online.
3671 */
3672static void
3673mac_walk_srs_and_unbind(int cpuid)
3674{
3675	mac_soft_ring_set_t *mac_srs;
3676	mac_soft_ring_t *soft_ring;
3677
3678	rw_enter(&mac_srs_g_lock, RW_READER);
3679
3680	if ((mac_srs = mac_srs_g_list) == NULL)
3681		goto done;
3682
3683	for (; mac_srs != NULL; mac_srs = mac_srs->srs_next) {
3684		if (mac_srs->srs_worker_cpuid == cpuid) {
3685			mac_srs->srs_worker_cpuid_save = cpuid;
3686			mac_srs_worker_unbind(mac_srs);
3687		}
3688
3689		if (!(mac_srs->srs_type & SRST_TX)) {
3690			if (mac_srs->srs_poll_cpuid == cpuid) {
3691				mac_srs->srs_poll_cpuid_save = cpuid;
3692				mac_srs_poll_unbind(mac_srs);
3693			}
3694		}
3695
3696		/* Next tackle the soft rings associated with the SRS */
3697		mutex_enter(&mac_srs->srs_lock);
3698		for (soft_ring = mac_srs->srs_soft_ring_head; soft_ring != NULL;
3699		    soft_ring = soft_ring->s_ring_next) {
3700			if (soft_ring->s_ring_cpuid == cpuid) {
3701				soft_ring->s_ring_cpuid_save = cpuid;
3702				mac_soft_ring_unbind(soft_ring);
3703			}
3704		}
3705		mutex_exit(&mac_srs->srs_lock);
3706	}
3707done:
3708	rw_exit(&mac_srs_g_lock);
3709}
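
/*
 * Sketch of how the walks above are presumably driven: the CPU DR
 * framework calls back (under cpu_lock) through the handler registered
 * with register_cpu_setup_func(), roughly:
 *
 *	static int
 *	mac_srs_cpu_setup(cpu_setup_t what, int id, void *arg)
 *	{
 *		ASSERT(MUTEX_HELD(&cpu_lock));
 *		switch (what) {
 *		case CPU_CONFIG:
 *		case CPU_ON:
 *		case CPU_CPUPART_IN:
 *			mac_walk_srs_and_bind(id);
 *			break;
 *		case CPU_UNCONFIG:
 *		case CPU_OFF:
 *		case CPU_CPUPART_OUT:
 *			mac_walk_srs_and_unbind(id);
 *			break;
 *		default:
 *			break;
 *		}
 *		return (0);
 *	}
 */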
3710
3711/* TX SETUP and TEARDOWN ROUTINES */
3712
3713/*
3714 * XXXHIO need to make sure the two mac_tx_srs_{add,del}_ring()
3715 * handle the case where the number of rings is one. I.e. there is
3716 * a ring pointed to by mac_srs->srs_tx_arg2.
3717 */
3718void
3719mac_tx_srs_add_ring(mac_soft_ring_set_t *mac_srs, mac_ring_t *tx_ring)
3720{
3721	mac_client_impl_t *mcip = mac_srs->srs_mcip;
3722	mac_soft_ring_t *soft_ring;
3723	int count = mac_srs->srs_tx_ring_count;
3724	uint32_t soft_ring_type = ST_RING_TX;
3725	uint_t ring_info;
3726
3727	ASSERT(mac_srs->srs_state & SRS_QUIESCE);
3728	ring_info = mac_hwring_getinfo((mac_ring_handle_t)tx_ring);
3729	if (mac_tx_serialize || (ring_info & MAC_RING_TX_SERIALIZE))
3730		soft_ring_type |= ST_RING_WORKER_ONLY;
3731	soft_ring = mac_soft_ring_create(count, 0,
3732	    soft_ring_type, maxclsyspri, mcip, mac_srs, -1,
3733	    NULL, mcip, (mac_resource_handle_t)tx_ring);
3734	mac_srs->srs_tx_ring_count++;
3735	mac_srs_update_fanout_list(mac_srs);
3736	/*
3737	 * Put this soft ring in quiesce mode too, so that when we restart,
3738	 * all soft rings in the SRS are in the same state.
3739	 */
3740	mac_soft_ring_signal(soft_ring, S_RING_QUIESCE);
3741}
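
/*
 * Note the SRS_QUIESCE assert in mac_tx_srs_add_ring(): callers are
 * expected to add rings only to a quiesced Tx SRS, roughly (a sketch):
 *
 *	<quiesce the Tx SRS and wait for SRS_QUIESCE_DONE>
 *	mac_tx_srs_add_ring(srs, tx_ring);
 *	mac_srs_signal(srs, SRS_RESTART);
 *
 * The new soft ring is itself signalled with S_RING_QUIESCE above so
 * that the subsequent restart finds every soft ring in the same state.
 */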
3742
3743static void
3744mac_soft_ring_remove(mac_soft_ring_set_t *mac_srs, mac_soft_ring_t *softring)
3745{
3746	int sringcnt;
3747
3748	mutex_enter(&mac_srs->srs_lock);
3749	sringcnt = mac_srs->srs_soft_ring_count;
3750	ASSERT(sringcnt > 0);
3751	mac_soft_ring_signal(softring, S_RING_CONDEMNED);
3752
3753	ASSERT(mac_srs->srs_soft_ring_condemned_count == 0);
3754	while (mac_srs->srs_soft_ring_condemned_count != 1)
3755		cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock);
3756
3757	if (softring == mac_srs->srs_soft_ring_head) {
3758		mac_srs->srs_soft_ring_head = softring->s_ring_next;
3759		if (mac_srs->srs_soft_ring_head != NULL) {
3760			mac_srs->srs_soft_ring_head->s_ring_prev = NULL;
3761		} else {
3762			mac_srs->srs_soft_ring_tail = NULL;
3763		}
3764	} else {
3765		softring->s_ring_prev->s_ring_next =
3766		    softring->s_ring_next;
3767		if (softring->s_ring_next != NULL) {
3768			softring->s_ring_next->s_ring_prev =
3769			    softring->s_ring_prev;
3770		} else {
3771			mac_srs->srs_soft_ring_tail =
3772			    softring->s_ring_prev;
3773		}
3774	}
3775	mac_srs->srs_soft_ring_count--;
3776
3777	mac_srs->srs_soft_ring_condemned_count--;
3778	mutex_exit(&mac_srs->srs_lock);
3779
3780	mac_soft_ring_free(softring);
3781}
3782
3783void
3784mac_tx_srs_del_ring(mac_soft_ring_set_t *mac_srs, mac_ring_t *tx_ring)
3785{
3786	int i;
3787	mac_soft_ring_t *soft_ring, *remove_sring;
3788	mac_client_impl_t *mcip = mac_srs->srs_mcip;
3789
3790	mutex_enter(&mac_srs->srs_lock);
3791	for (i = 0; i < mac_srs->srs_tx_ring_count; i++) {
3792		soft_ring =  mac_srs->srs_tx_soft_rings[i];
3793		if (soft_ring->s_ring_tx_arg2 == tx_ring)
3794			break;
3795	}
3796	mutex_exit(&mac_srs->srs_lock);
3797	ASSERT(i < mac_srs->srs_tx_ring_count);
3798	remove_sring = soft_ring;
3799	/*
3800	 * In the case of aggr, the soft ring associated with a Tx ring
3801	 * is also stored in st_soft_rings[] array. That entry should
3802	 * be removed.
3803	 */
3804	if (mcip->mci_state_flags & MCIS_IS_AGGR) {
3805		mac_srs_tx_t *tx = &mac_srs->srs_tx;
3806
3807		ASSERT(tx->st_soft_rings[tx_ring->mr_index] == remove_sring);
3808		tx->st_soft_rings[tx_ring->mr_index] = NULL;
3809	}
3810	mac_soft_ring_remove(mac_srs, remove_sring);
3811	mac_srs_update_fanout_list(mac_srs);
3812}
3813
3814/*
3815 * mac_tx_srs_setup():
3816 * Used to set up Tx rings. If no free Tx ring is available, the default
3817 * Tx ring is used.
3818 */
3819void
3820mac_tx_srs_setup(mac_client_impl_t *mcip, flow_entry_t *flent)
3821{
3822	mac_impl_t		*mip = mcip->mci_mip;
3823	mac_soft_ring_set_t	*tx_srs = flent->fe_tx_srs;
3824	int			i;
3825	int			tx_ring_count = 0;
3826	uint32_t		soft_ring_type;
3827	mac_group_t		*grp = NULL;
3828	mac_ring_t		*ring;
3829	mac_srs_tx_t		*tx = &tx_srs->srs_tx;
3830	boolean_t		is_aggr;
3831	uint_t			ring_info = 0;
3832
3833	is_aggr = (mcip->mci_state_flags & MCIS_IS_AGGR) != 0;
3834	grp = flent->fe_tx_ring_group;
3835	if (grp == NULL) {
3836		ring = (mac_ring_t *)mip->mi_default_tx_ring;
3837		goto no_group;
3838	}
3839	tx_ring_count = grp->mrg_cur_count;
3840	ring = grp->mrg_rings;
3841	/*
3842	 * An attempt is made to reserve 'tx_ring_count' Tx rings.
3843	 * If tx_ring_count is 0, the default Tx ring is used. If
3844	 * it is 1, an attempt is made to reserve one Tx ring. In
3845	 * both cases, the ring information is stored in the Tx
3846	 * SRS. If multiple Tx rings are specified, then each Tx
3847	 * ring will have a Tx-side soft ring. All these soft
3848	 * rings will hang off the Tx SRS.
3849	 */
3850	switch (grp->mrg_state) {
3851		case MAC_GROUP_STATE_SHARED:
3852		case MAC_GROUP_STATE_RESERVED:
3853			if (tx_ring_count <= 1 && !is_aggr) {
3854no_group:
3855				if (ring != NULL &&
3856				    ring->mr_state != MR_INUSE) {
3857					(void) mac_start_ring(ring);
3858					ring_info = mac_hwring_getinfo(
3859					    (mac_ring_handle_t)ring);
3860				}
3861				tx->st_arg2 = (void *)ring;
3862				mac_tx_srs_stat_recreate(tx_srs, B_FALSE);
3863				if (tx_srs->srs_type & SRST_BW_CONTROL) {
3864					tx->st_mode = SRS_TX_BW;
3865				} else if (mac_tx_serialize ||
3866				    (ring_info & MAC_RING_TX_SERIALIZE)) {
3867					tx->st_mode = SRS_TX_SERIALIZE;
3868				} else {
3869					tx->st_mode = SRS_TX_DEFAULT;
3870				}
3871				break;
3872			}
3873			soft_ring_type = ST_RING_TX;
3874			if (tx_srs->srs_type & SRST_BW_CONTROL) {
3875				tx->st_mode = is_aggr ?
3876				    SRS_TX_BW_AGGR : SRS_TX_BW_FANOUT;
3877			} else {
3878				tx->st_mode = is_aggr ? SRS_TX_AGGR :
3879				    SRS_TX_FANOUT;
3880			}
3881			for (i = 0; i < tx_ring_count; i++) {
3882				ASSERT(ring != NULL);
3883				switch (ring->mr_state) {
3884				case MR_INUSE:
3885				case MR_FREE:
3886					ASSERT(ring->mr_srs == NULL);
3887
3888					if (ring->mr_state != MR_INUSE)
3889						(void) mac_start_ring(ring);
3890					ring_info = mac_hwring_getinfo(
3891					    (mac_ring_handle_t)ring);
3892					if (mac_tx_serialize || (ring_info &
3893					    MAC_RING_TX_SERIALIZE)) {
3894						soft_ring_type |=
3895						    ST_RING_WORKER_ONLY;
3896					}
3897					(void) mac_soft_ring_create(i, 0,
3898					    soft_ring_type, maxclsyspri,
3899					    mcip, tx_srs, -1, NULL, mcip,
3900					    (mac_resource_handle_t)ring);
3901					break;
3902				default:
3903					cmn_err(CE_PANIC,
3904					    "srs_setup: mcip = %p "
3905					    "trying to add UNKNOWN ring = %p\n",
3906					    (void *)mcip, (void *)ring);
3907					break;
3908				}
3909				ring = ring->mr_next;
3910			}
3911			mac_srs_update_fanout_list(tx_srs);
3912			break;
3913		default:
3914			ASSERT(B_FALSE);
3915			break;
3916	}
3917	tx->st_func = mac_tx_get_func(tx->st_mode);
3918	if (is_aggr) {
3919		VERIFY(i_mac_capab_get((mac_handle_t)mip,
3920		    MAC_CAPAB_AGGR, &tx->st_capab_aggr));
3921	}
3922	DTRACE_PROBE3(tx__srs___setup__return, mac_soft_ring_set_t *, tx_srs,
3923	    int, tx->st_mode, int, tx_srs->srs_tx_ring_count);
3924}
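
/*
 * Summary of the st_mode selection above, derived from the code for
 * quick reference:
 *
 *	<= 1 Tx ring, !aggr, no b/w control:	SRS_TX_DEFAULT, or
 *						SRS_TX_SERIALIZE if
 *						mac_tx_serialize is set or
 *						the ring advertises
 *						MAC_RING_TX_SERIALIZE
 *	<= 1 Tx ring, !aggr, b/w control:	SRS_TX_BW
 *	> 1 Tx ring or aggr, no b/w control:	SRS_TX_FANOUT, or
 *						SRS_TX_AGGR for aggr
 *	> 1 Tx ring or aggr, b/w control:	SRS_TX_BW_FANOUT, or
 *						SRS_TX_BW_AGGR for aggr
 */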
3925
3926/*
3927 * Update the fanout of a client if its recorded link speed doesn't match
3928 * its current link speed.
3929 */
3930void
3931mac_fanout_recompute_client(mac_client_impl_t *mcip, cpupart_t *cpupart)
3932{
3933	uint64_t link_speed;
3934	mac_resource_props_t *mcip_mrp;
3935	flow_entry_t *flent = mcip->mci_flent;
3936	mac_soft_ring_set_t *rx_srs;
3937	mac_cpus_t *srs_cpu;
3938	int soft_ring_count, maxcpus;
3939
3940	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
3941
3942	link_speed = mac_client_stat_get(mcip->mci_flent->fe_mcip,
3943	    MAC_STAT_IFSPEED);
3944
3945	if ((link_speed != 0) &&
3946	    (link_speed != mcip->mci_flent->fe_nic_speed)) {
3947		mcip_mrp = MCIP_RESOURCE_PROPS(mcip);
3948		/*
3949		 * Before calling mac_fanout_setup(), check to see if
3950		 * the SRSes already have the right number of soft
3951		 * rings. mac_fanout_setup() is a heavy duty operation
3952		 * where new cpu bindings are done for SRS and soft
3953		 * ring threads and interrupts re-targeted.
3954		 */
3955		maxcpus = (cpupart != NULL) ? cpupart->cp_ncpus : ncpus;
3956		soft_ring_count = mac_compute_soft_ring_count(flent,
3957		    flent->fe_rx_srs_cnt - 1, maxcpus);
3958		/*
3959		 * If soft_ring_count returned by
3960		 * mac_compute_soft_ring_count() is 0, bump it
3961		 * up by 1 because we always have at least one
3962		 * TCP, UDP, and OTH soft ring associated with
3963		 * an SRS.
3964		 */
3965		soft_ring_count = (soft_ring_count == 0) ?
3966		    1 : soft_ring_count;
3967		rx_srs = flent->fe_rx_srs[0];
3968		srs_cpu = &rx_srs->srs_cpu;
3969		if (soft_ring_count != srs_cpu->mc_rx_fanout_cnt) {
3970			mac_fanout_setup(mcip, flent, mcip_mrp,
3971			    mac_rx_deliver, mcip, NULL, cpupart);
3972		}
3973	}
3974}
3975
3976/*
3977 * Walk through the list of mac clients for the MAC.
3978 * For each active mac client, recompute the number of soft rings
3979 * associated with it, but only if the current speed differs from the
3980 * speed that was previously used for the soft ring computation. If
3981 * the cable is disconnected while the NIC is started, we would get a
3982 * notification with the speed set to 0. We do not recompute in that case.
3983 */
3984void
3985mac_fanout_recompute(mac_impl_t *mip)
3986{
3987	mac_client_impl_t	*mcip;
3988	cpupart_t		*cpupart;
3989	boolean_t		use_default;
3990	mac_resource_props_t	*mrp, *emrp;
3991
3992	i_mac_perim_enter(mip);
3993	if ((mip->mi_state_flags & MIS_IS_VNIC) != 0 ||
3994	    mip->mi_linkstate != LINK_STATE_UP) {
3995		i_mac_perim_exit(mip);
3996		return;
3997	}
3998
3999	for (mcip = mip->mi_clients_list; mcip != NULL;
4000	    mcip = mcip->mci_client_next) {
4001		if ((mcip->mci_state_flags & MCIS_SHARE_BOUND) != 0 ||
4002		    !MCIP_DATAPATH_SETUP(mcip))
4003			continue;
4004		mrp = MCIP_RESOURCE_PROPS(mcip);
4005		emrp = MCIP_EFFECTIVE_PROPS(mcip);
4006		use_default = B_FALSE;
4007		pool_lock();
4008		cpupart = mac_pset_find(mrp, &use_default);
4009		mac_fanout_recompute_client(mcip, cpupart);
4010		mac_set_pool_effective(use_default, cpupart, mrp, emrp);
4011		pool_unlock();
4012	}
4013	i_mac_perim_exit(mip);
4014}
4015
4016/*
4017 * Given a MAC, change the polling state for all its MAC clients.  'enable' is
4018 * B_TRUE to enable polling or B_FALSE to disable.  Polling is enabled by
4019 * default.
4020 */
4021void
4022mac_poll_state_change(mac_handle_t mh, boolean_t enable)
4023{
4024	mac_impl_t *mip = (mac_impl_t *)mh;
4025	mac_client_impl_t *mcip;
4026
4027	i_mac_perim_enter(mip);
4028	if (enable)
4029		mip->mi_state_flags &= ~MIS_POLL_DISABLE;
4030	else
4031		mip->mi_state_flags |= MIS_POLL_DISABLE;
4032	for (mcip = mip->mi_clients_list; mcip != NULL;
4033	    mcip = mcip->mci_client_next)
4034		mac_client_update_classifier(mcip, B_TRUE);
4035	i_mac_perim_exit(mip);
4036}
4037
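/*
 * A minimal usage sketch (hypothetical caller): a path that cannot
 * tolerate polling could turn it off for all clients of a MAC and
 * re-enable it afterwards:
 *
 *	mac_poll_state_change(mh, B_FALSE);	disable polling
 *	...
 *	mac_poll_state_change(mh, B_TRUE);	re-enable polling
 *
 * This sets or clears MIS_POLL_DISABLE and reevaluates each client's
 * classifier via mac_client_update_classifier().
 */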