tcp_misc.c revision 12016:0248e987199b
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#include <sys/types.h>
28#include <sys/strlog.h>
29#include <sys/policy.h>
30#include <sys/strsun.h>
31#include <sys/squeue_impl.h>
32#include <sys/squeue.h>
33
34#include <inet/common.h>
35#include <inet/ip.h>
36#include <inet/tcp.h>
37#include <inet/tcp_impl.h>
38
/*
 * Control whether TCP can enter defensive mode when under memory pressure.
 * When this is B_FALSE, tcp_conn_reclaim() returns immediately without
 * examining any TCP stack.
 */
static boolean_t tcp_do_reclaim = B_TRUE;
41
42/*
43 * Routines related to the TCP_IOC_ABORT_CONN ioctl command.
44 *
45 * TCP_IOC_ABORT_CONN is a non-transparent ioctl command used for aborting
46 * TCP connections. To invoke this ioctl, a tcp_ioc_abort_conn_t structure
47 * (defined in tcp.h) needs to be filled in and passed into the kernel
48 * via an I_STR ioctl command (see streamio(7I)). The tcp_ioc_abort_conn_t
49 * structure contains the four-tuple of a TCP connection and a range of TCP
50 * states (specified by ac_start and ac_end). The use of wildcard addresses
51 * and ports is allowed. Connections with a matching four tuple and a state
52 * within the specified range will be aborted. The valid states for the
53 * ac_start and ac_end fields are in the range TCPS_SYN_SENT to TCPS_TIME_WAIT,
54 * inclusive.
55 *
56 * An application which has its connection aborted by this ioctl will receive
57 * an error that is dependent on the connection state at the time of the abort.
58 * If the connection state is < TCPS_TIME_WAIT, an application should behave as
59 * though a RST packet has been received.  If the connection state is equal to
60 * TCPS_TIME_WAIT, the 2MSL timeout will immediately be canceled by the kernel
61 * and all resources associated with the connection will be freed.
62 */
63static mblk_t	*tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *, tcp_t *);
64static void	tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *);
65static void	tcp_ioctl_abort_handler(void *arg, mblk_t *mp, void *arg2,
66    ip_recv_attr_t *dummy);
67static int	tcp_ioctl_abort(tcp_ioc_abort_conn_t *, tcp_stack_t *tcps);
68void	tcp_ioctl_abort_conn(queue_t *, mblk_t *);
69static int	tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *, int, int *,
70    boolean_t, tcp_stack_t *);
71
/*
 * Accessors for the socket addresses embedded in a tcp_ioc_abort_conn_t.
 * ac_local and ac_remote are viewed either as a sin_t (IPv4) or as a
 * sin6_t (IPv6); the caller picks the flavor that matches the address
 * family stored in the structure.  The *LADDR/*RADDR forms yield a
 * pointer to the sockaddr; the address and port forms expand to the
 * member itself and may be used as lvalues.
 */

/* IPv4 view, local endpoint. */
#define	TCP_AC_V4LADDR(acp)	((sin_t *)&(acp)->ac_local)
#define	TCP_AC_V4LOCAL(acp)	(TCP_AC_V4LADDR(acp)->sin_addr.s_addr)
#define	TCP_AC_V4LPORT(acp)	(TCP_AC_V4LADDR(acp)->sin_port)

/* IPv4 view, remote endpoint. */
#define	TCP_AC_V4RADDR(acp)	((sin_t *)&(acp)->ac_remote)
#define	TCP_AC_V4REMOTE(acp)	(TCP_AC_V4RADDR(acp)->sin_addr.s_addr)
#define	TCP_AC_V4RPORT(acp)	(TCP_AC_V4RADDR(acp)->sin_port)

/* IPv6 view, local endpoint. */
#define	TCP_AC_V6LADDR(acp)	((sin6_t *)&(acp)->ac_local)
#define	TCP_AC_V6LOCAL(acp)	(TCP_AC_V6LADDR(acp)->sin6_addr)
#define	TCP_AC_V6LPORT(acp)	(TCP_AC_V6LADDR(acp)->sin6_port)

/* IPv6 view, remote endpoint. */
#define	TCP_AC_V6RADDR(acp)	((sin6_t *)&(acp)->ac_remote)
#define	TCP_AC_V6REMOTE(acp)	(TCP_AC_V6RADDR(acp)->sin6_addr)
#define	TCP_AC_V6RPORT(acp)	(TCP_AC_V6RADDR(acp)->sin6_port)
88
/*
 * Return the correct error code to mimic the behavior of a connection
 * reset.  The TCP state in (state) is mapped to an errno value that is
 * stored in (err):
 *
 *	SYN_SENT, SYN_RCVD				ECONNREFUSED
 *	ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT	ECONNRESET
 *	CLOSING, LAST_ACK, TIME_WAIT			0
 *	any other state					ENXIO
 *
 * The body is wrapped in do { } while (0) so that the macro expands to
 * exactly one statement and stays correct when used as the unbraced
 * body of an if/else.  It must be invoked with a trailing semicolon.
 */
#define	TCP_AC_GET_ERRCODE(state, err)		\
	do {					\
		switch ((state)) {		\
		case TCPS_SYN_SENT:		\
		case TCPS_SYN_RCVD:		\
			(err) = ECONNREFUSED;	\
			break;			\
		case TCPS_ESTABLISHED:		\
		case TCPS_FIN_WAIT_1:		\
		case TCPS_FIN_WAIT_2:		\
		case TCPS_CLOSE_WAIT:		\
			(err) = ECONNRESET;	\
			break;			\
		case TCPS_CLOSING:		\
		case TCPS_LAST_ACK:		\
		case TCPS_TIME_WAIT:		\
			(err) = 0;		\
			break;			\
		default:			\
			(err) = ENXIO;		\
			break;			\
		}				\
		/* CONSTCOND */			\
	} while (0)
114
/*
 * Check if a connection (connp and its tcp) matches the info in acp.
 *
 * The family of acp->ac_local selects the comparison: AF_INET compares
 * the IPv4 fields, anything else compares the IPv6 fields.  Within a
 * family, an unspecified address or a zero port in acp acts as a
 * wildcard; any other value must equal the corresponding conn_t field.
 * In addition the connection state must lie in [ac_start, ac_end].
 *
 * The checks short-circuit in the order local address, remote address,
 * local port, remote port, state range.  Arguments are evaluated more
 * than once.
 */

/* The connection state falls inside the requested range. */
#define	TCP_AC_STATE_MATCH(acp, tcp)				\
	((acp)->ac_start <= (tcp)->tcp_state &&			\
	(acp)->ac_end >= (tcp)->tcp_state)

/* IPv4 four-tuple comparison with wildcard support. */
#define	TCP_AC_V4_TUPLE_MATCH(acp, connp)			\
	((TCP_AC_V4LOCAL((acp)) == INADDR_ANY ||		\
	TCP_AC_V4LOCAL((acp)) == (connp)->conn_laddr_v4) &&	\
	(TCP_AC_V4REMOTE((acp)) == INADDR_ANY ||		\
	TCP_AC_V4REMOTE((acp)) == (connp)->conn_faddr_v4) &&	\
	(TCP_AC_V4LPORT((acp)) == 0 ||				\
	TCP_AC_V4LPORT((acp)) == (connp)->conn_lport) &&	\
	(TCP_AC_V4RPORT((acp)) == 0 ||				\
	TCP_AC_V4RPORT((acp)) == (connp)->conn_fport))

/* IPv6 four-tuple comparison with wildcard support. */
#define	TCP_AC_V6_TUPLE_MATCH(acp, connp)			\
	((IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6LOCAL((acp))) ||	\
	IN6_ARE_ADDR_EQUAL(&TCP_AC_V6LOCAL((acp)),		\
	&(connp)->conn_laddr_v6)) &&				\
	(IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6REMOTE((acp))) ||	\
	IN6_ARE_ADDR_EQUAL(&TCP_AC_V6REMOTE((acp)),		\
	&(connp)->conn_faddr_v6)) &&				\
	(TCP_AC_V6LPORT((acp)) == 0 ||				\
	TCP_AC_V6LPORT((acp)) == (connp)->conn_lport) &&	\
	(TCP_AC_V6RPORT((acp)) == 0 ||				\
	TCP_AC_V6RPORT((acp)) == (connp)->conn_fport))

#define	TCP_AC_ADDR_MATCH(acp, connp, tcp)			\
	(((acp)->ac_local.ss_family == AF_INET) ?		\
	(TCP_AC_V4_TUPLE_MATCH(acp, connp) &&			\
	TCP_AC_STATE_MATCH(acp, tcp)) :				\
	(TCP_AC_V6_TUPLE_MATCH(acp, connp) &&			\
	TCP_AC_STATE_MATCH(acp, tcp)))
142
/*
 * Full match test: the request must apply to the connection's zone
 * (ac_zoneid is ALL_ZONES or equals conn_zoneid) and the address, port
 * and state criteria of TCP_AC_ADDR_MATCH must hold.  Evaluates to 0
 * or 1; the address comparison is skipped when the zone does not match.
 */
#define	TCP_AC_MATCH(acp, connp, tcp)				\
	(((acp)->ac_zoneid == ALL_ZONES ||			\
	(acp)->ac_zoneid == (connp)->conn_zoneid) &&		\
	TCP_AC_ADDR_MATCH(acp, connp, tcp))
147
148/*
149 * Build a message containing a tcp_ioc_abort_conn_t structure
150 * which is filled in with information from acp and tp.
151 */
152static mblk_t *
153tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *acp, tcp_t *tp)
154{
155	mblk_t *mp;
156	tcp_ioc_abort_conn_t *tacp;
157
158	mp = allocb(sizeof (uint32_t) + sizeof (*acp), BPRI_LO);
159	if (mp == NULL)
160		return (NULL);
161
162	*((uint32_t *)mp->b_rptr) = TCP_IOC_ABORT_CONN;
163	tacp = (tcp_ioc_abort_conn_t *)((uchar_t *)mp->b_rptr +
164	    sizeof (uint32_t));
165
166	tacp->ac_start = acp->ac_start;
167	tacp->ac_end = acp->ac_end;
168	tacp->ac_zoneid = acp->ac_zoneid;
169
170	if (acp->ac_local.ss_family == AF_INET) {
171		tacp->ac_local.ss_family = AF_INET;
172		tacp->ac_remote.ss_family = AF_INET;
173		TCP_AC_V4LOCAL(tacp) = tp->tcp_connp->conn_laddr_v4;
174		TCP_AC_V4REMOTE(tacp) = tp->tcp_connp->conn_faddr_v4;
175		TCP_AC_V4LPORT(tacp) = tp->tcp_connp->conn_lport;
176		TCP_AC_V4RPORT(tacp) = tp->tcp_connp->conn_fport;
177	} else {
178		tacp->ac_local.ss_family = AF_INET6;
179		tacp->ac_remote.ss_family = AF_INET6;
180		TCP_AC_V6LOCAL(tacp) = tp->tcp_connp->conn_laddr_v6;
181		TCP_AC_V6REMOTE(tacp) = tp->tcp_connp->conn_faddr_v6;
182		TCP_AC_V6LPORT(tacp) = tp->tcp_connp->conn_lport;
183		TCP_AC_V6RPORT(tacp) = tp->tcp_connp->conn_fport;
184	}
185	mp->b_wptr = (uchar_t *)mp->b_rptr + sizeof (uint32_t) + sizeof (*acp);
186	return (mp);
187}
188
189/*
190 * Print a tcp_ioc_abort_conn_t structure.
191 */
192static void
193tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *acp)
194{
195	char lbuf[128];
196	char rbuf[128];
197	sa_family_t af;
198	in_port_t lport, rport;
199	ushort_t logflags;
200
201	af = acp->ac_local.ss_family;
202
203	if (af == AF_INET) {
204		(void) inet_ntop(af, (const void *)&TCP_AC_V4LOCAL(acp),
205		    lbuf, 128);
206		(void) inet_ntop(af, (const void *)&TCP_AC_V4REMOTE(acp),
207		    rbuf, 128);
208		lport = ntohs(TCP_AC_V4LPORT(acp));
209		rport = ntohs(TCP_AC_V4RPORT(acp));
210	} else {
211		(void) inet_ntop(af, (const void *)&TCP_AC_V6LOCAL(acp),
212		    lbuf, 128);
213		(void) inet_ntop(af, (const void *)&TCP_AC_V6REMOTE(acp),
214		    rbuf, 128);
215		lport = ntohs(TCP_AC_V6LPORT(acp));
216		rport = ntohs(TCP_AC_V6RPORT(acp));
217	}
218
219	logflags = SL_TRACE | SL_NOTE;
220	/*
221	 * Don't print this message to the console if the operation was done
222	 * to a non-global zone.
223	 */
224	if (acp->ac_zoneid == GLOBAL_ZONEID || acp->ac_zoneid == ALL_ZONES)
225		logflags |= SL_CONSOLE;
226	(void) strlog(TCP_MOD_ID, 0, 1, logflags,
227	    "TCP_IOC_ABORT_CONN: local = %s:%d, remote = %s:%d, "
228	    "start = %d, end = %d\n", lbuf, lport, rbuf, rport,
229	    acp->ac_start, acp->ac_end);
230}
231
232/*
233 * Called using SQ_FILL when a message built using
234 * tcp_ioctl_abort_build_msg is put into a queue.
235 * Note that when we get here there is no wildcard in acp any more.
236 */
237/* ARGSUSED2 */
238static void
239tcp_ioctl_abort_handler(void *arg, mblk_t *mp, void *arg2,
240    ip_recv_attr_t *dummy)
241{
242	conn_t			*connp = (conn_t *)arg;
243	tcp_t			*tcp = connp->conn_tcp;
244	tcp_ioc_abort_conn_t	*acp;
245
246	/*
247	 * Don't accept any input on a closed tcp as this TCP logically does
248	 * not exist on the system. Don't proceed further with this TCP.
249	 * For eg. this packet could trigger another close of this tcp
250	 * which would be disastrous for tcp_refcnt. tcp_close_detached /
251	 * tcp_clean_death / tcp_closei_local must be called at most once
252	 * on a TCP.
253	 */
254	if (tcp->tcp_state == TCPS_CLOSED ||
255	    tcp->tcp_state == TCPS_BOUND) {
256		freemsg(mp);
257		return;
258	}
259
260	acp = (tcp_ioc_abort_conn_t *)(mp->b_rptr + sizeof (uint32_t));
261	if (tcp->tcp_state <= acp->ac_end) {
262		/*
263		 * If we get here, we are already on the correct
264		 * squeue. This ioctl follows the following path
265		 * tcp_wput -> tcp_wput_ioctl -> tcp_ioctl_abort_conn
266		 * ->tcp_ioctl_abort->squeue_enter (if on a
267		 * different squeue)
268		 */
269		int errcode;
270
271		TCP_AC_GET_ERRCODE(tcp->tcp_state, errcode);
272		(void) tcp_clean_death(tcp, errcode);
273	}
274	freemsg(mp);
275}
276
277/*
278 * Abort all matching connections on a hash chain.
279 */
280static int
281tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *acp, int index, int *count,
282    boolean_t exact, tcp_stack_t *tcps)
283{
284	int nmatch, err = 0;
285	tcp_t *tcp;
286	MBLKP mp, last, listhead = NULL;
287	conn_t	*tconnp;
288	connf_t	*connfp;
289	ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip;
290
291	connfp = &ipst->ips_ipcl_conn_fanout[index];
292
293startover:
294	nmatch = 0;
295
296	mutex_enter(&connfp->connf_lock);
297	for (tconnp = connfp->connf_head; tconnp != NULL;
298	    tconnp = tconnp->conn_next) {
299		tcp = tconnp->conn_tcp;
300		/*
301		 * We are missing a check on sin6_scope_id for linklocals here,
302		 * but current usage is just for aborting based on zoneid
303		 * for shared-IP zones.
304		 */
305		if (TCP_AC_MATCH(acp, tconnp, tcp)) {
306			CONN_INC_REF(tconnp);
307			mp = tcp_ioctl_abort_build_msg(acp, tcp);
308			if (mp == NULL) {
309				err = ENOMEM;
310				CONN_DEC_REF(tconnp);
311				break;
312			}
313			mp->b_prev = (mblk_t *)tcp;
314
315			if (listhead == NULL) {
316				listhead = mp;
317				last = mp;
318			} else {
319				last->b_next = mp;
320				last = mp;
321			}
322			nmatch++;
323			if (exact)
324				break;
325		}
326
327		/* Avoid holding lock for too long. */
328		if (nmatch >= 500)
329			break;
330	}
331	mutex_exit(&connfp->connf_lock);
332
333	/* Pass mp into the correct tcp */
334	while ((mp = listhead) != NULL) {
335		listhead = listhead->b_next;
336		tcp = (tcp_t *)mp->b_prev;
337		mp->b_next = mp->b_prev = NULL;
338		SQUEUE_ENTER_ONE(tcp->tcp_connp->conn_sqp, mp,
339		    tcp_ioctl_abort_handler, tcp->tcp_connp, NULL,
340		    SQ_FILL, SQTAG_TCP_ABORT_BUCKET);
341	}
342
343	*count += nmatch;
344	if (nmatch >= 500 && err == 0)
345		goto startover;
346	return (err);
347}
348
349/*
350 * Abort all connections that matches the attributes specified in acp.
351 */
352static int
353tcp_ioctl_abort(tcp_ioc_abort_conn_t *acp, tcp_stack_t *tcps)
354{
355	sa_family_t af;
356	uint32_t  ports;
357	uint16_t *pports;
358	int err = 0, count = 0;
359	boolean_t exact = B_FALSE; /* set when there is no wildcard */
360	int index = -1;
361	ushort_t logflags;
362	ip_stack_t	*ipst = tcps->tcps_netstack->netstack_ip;
363
364	af = acp->ac_local.ss_family;
365
366	if (af == AF_INET) {
367		if (TCP_AC_V4REMOTE(acp) != INADDR_ANY &&
368		    TCP_AC_V4LPORT(acp) != 0 && TCP_AC_V4RPORT(acp) != 0) {
369			pports = (uint16_t *)&ports;
370			pports[1] = TCP_AC_V4LPORT(acp);
371			pports[0] = TCP_AC_V4RPORT(acp);
372			exact = (TCP_AC_V4LOCAL(acp) != INADDR_ANY);
373		}
374	} else {
375		if (!IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6REMOTE(acp)) &&
376		    TCP_AC_V6LPORT(acp) != 0 && TCP_AC_V6RPORT(acp) != 0) {
377			pports = (uint16_t *)&ports;
378			pports[1] = TCP_AC_V6LPORT(acp);
379			pports[0] = TCP_AC_V6RPORT(acp);
380			exact = !IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6LOCAL(acp));
381		}
382	}
383
384	/*
385	 * For cases where remote addr, local port, and remote port are non-
386	 * wildcards, tcp_ioctl_abort_bucket will only be called once.
387	 */
388	if (index != -1) {
389		err = tcp_ioctl_abort_bucket(acp, index,
390		    &count, exact, tcps);
391	} else {
392		/*
393		 * loop through all entries for wildcard case
394		 */
395		for (index = 0;
396		    index < ipst->ips_ipcl_conn_fanout_size;
397		    index++) {
398			err = tcp_ioctl_abort_bucket(acp, index,
399			    &count, exact, tcps);
400			if (err != 0)
401				break;
402		}
403	}
404
405	logflags = SL_TRACE | SL_NOTE;
406	/*
407	 * Don't print this message to the console if the operation was done
408	 * to a non-global zone.
409	 */
410	if (acp->ac_zoneid == GLOBAL_ZONEID || acp->ac_zoneid == ALL_ZONES)
411		logflags |= SL_CONSOLE;
412	(void) strlog(TCP_MOD_ID, 0, 1, logflags, "TCP_IOC_ABORT_CONN: "
413	    "aborted %d connection%c\n", count, ((count > 1) ? 's' : ' '));
414	if (err == 0 && count == 0)
415		err = ENOENT;
416	return (err);
417}
418
419/*
420 * Process the TCP_IOC_ABORT_CONN ioctl request.
421 */
422void
423tcp_ioctl_abort_conn(queue_t *q, mblk_t *mp)
424{
425	int	err;
426	IOCP    iocp;
427	MBLKP   mp1;
428	sa_family_t laf, raf;
429	tcp_ioc_abort_conn_t *acp;
430	zone_t		*zptr;
431	conn_t		*connp = Q_TO_CONN(q);
432	zoneid_t	zoneid = connp->conn_zoneid;
433	tcp_t		*tcp = connp->conn_tcp;
434	tcp_stack_t	*tcps = tcp->tcp_tcps;
435
436	iocp = (IOCP)mp->b_rptr;
437
438	if ((mp1 = mp->b_cont) == NULL ||
439	    iocp->ioc_count != sizeof (tcp_ioc_abort_conn_t)) {
440		err = EINVAL;
441		goto out;
442	}
443
444	/* check permissions */
445	if (secpolicy_ip_config(iocp->ioc_cr, B_FALSE) != 0) {
446		err = EPERM;
447		goto out;
448	}
449
450	if (mp1->b_cont != NULL) {
451		freemsg(mp1->b_cont);
452		mp1->b_cont = NULL;
453	}
454
455	acp = (tcp_ioc_abort_conn_t *)mp1->b_rptr;
456	laf = acp->ac_local.ss_family;
457	raf = acp->ac_remote.ss_family;
458
459	/* check that a zone with the supplied zoneid exists */
460	if (acp->ac_zoneid != GLOBAL_ZONEID && acp->ac_zoneid != ALL_ZONES) {
461		zptr = zone_find_by_id(zoneid);
462		if (zptr != NULL) {
463			zone_rele(zptr);
464		} else {
465			err = EINVAL;
466			goto out;
467		}
468	}
469
470	/*
471	 * For exclusive stacks we set the zoneid to zero
472	 * to make TCP operate as if in the global zone.
473	 */
474	if (tcps->tcps_netstack->netstack_stackid != GLOBAL_NETSTACKID)
475		acp->ac_zoneid = GLOBAL_ZONEID;
476
477	if (acp->ac_start < TCPS_SYN_SENT || acp->ac_end > TCPS_TIME_WAIT ||
478	    acp->ac_start > acp->ac_end || laf != raf ||
479	    (laf != AF_INET && laf != AF_INET6)) {
480		err = EINVAL;
481		goto out;
482	}
483
484	tcp_ioctl_abort_dump(acp);
485	err = tcp_ioctl_abort(acp, tcps);
486
487out:
488	if (mp1 != NULL) {
489		freemsg(mp1);
490		mp->b_cont = NULL;
491	}
492
493	if (err != 0)
494		miocnak(q, mp, 0, err);
495	else
496		miocack(q, mp, 0, 0);
497}
498
499/*
500 * Timeout function to reset the TCP stack variable tcps_reclaim to false.
501 */
502void
503tcp_reclaim_timer(void *arg)
504{
505	tcp_stack_t *tcps = (tcp_stack_t *)arg;
506	int64_t tot_conn = 0;
507	int i;
508	extern pgcnt_t lotsfree, needfree;
509
510	for (i = 0; i < tcps->tcps_sc_cnt; i++)
511		tot_conn += tcps->tcps_sc[i]->tcp_sc_conn_cnt;
512
513	/*
514	 * This happens only when a stack is going away.  tcps_reclaim_tid
515	 * should not be reset to 0 when returning in this case.
516	 */
517	mutex_enter(&tcps->tcps_reclaim_lock);
518	if (!tcps->tcps_reclaim) {
519		mutex_exit(&tcps->tcps_reclaim_lock);
520		return;
521	}
522
523	if ((freemem >= lotsfree + needfree) || tot_conn < maxusers) {
524		tcps->tcps_reclaim = B_FALSE;
525		tcps->tcps_reclaim_tid = 0;
526	} else {
527		/* Stay in defensive mode and restart the timer */
528		tcps->tcps_reclaim_tid = timeout(tcp_reclaim_timer,
529		    tcps, MSEC_TO_TICK(tcps->tcps_reclaim_period));
530	}
531	mutex_exit(&tcps->tcps_reclaim_lock);
532}
533
534/*
535 * Kmem reclaim call back function.  When the system is under memory
536 * pressure, we set the TCP stack variable tcps_reclaim to true.  This
537 * variable is reset to false after tcps_reclaim_period msecs.  During this
538 * period, TCP will be more aggressive in aborting connections not making
539 * progress, meaning retransmitting for some time (tcp_early_abort seconds).
540 * TCP will also not accept new connection request for those listeners whose
541 * q or q0 is not empty.
542 */
543/* ARGSUSED */
544void
545tcp_conn_reclaim(void *arg)
546{
547	netstack_handle_t nh;
548	netstack_t *ns;
549	tcp_stack_t *tcps;
550	extern pgcnt_t lotsfree, needfree;
551
552	if (!tcp_do_reclaim)
553		return;
554
555	/*
556	 * The reclaim function may be called even when the system is not
557	 * really under memory pressure.
558	 */
559	if (freemem >= lotsfree + needfree)
560		return;
561
562	netstack_next_init(&nh);
563	while ((ns = netstack_next(&nh)) != NULL) {
564		int i;
565		int64_t tot_conn = 0;
566
567		/*
568		 * During boot time, the first netstack_t is created and
569		 * initialized before TCP has registered with the netstack
570		 * framework.  If this reclaim function is called before TCP
571		 * has finished its initialization, netstack_next() will
572		 * return the first netstack_t (since its netstack_flags is
573		 * not NSF_UNINIT).  And its netstack_tcp will be NULL.  We
574		 * need to catch it.
575		 *
576		 * All subsequent netstack_t creation will not have this
577		 * problem since the initialization is not finished until TCP
578		 * has finished its own tcp_stack_t initialization.  Hence
579		 * netstack_next() will not return one with NULL netstack_tcp.
580		 */
581		if ((tcps = ns->netstack_tcp) == NULL) {
582			netstack_rele(ns);
583			continue;
584		}
585
586		/*
587		 * Even if the system is under memory pressure, the reason may
588		 * not be because of TCP activity.  Check the number of
589		 * connections in each stack.  If the number exceeds the
590		 * threshold (maxusers), turn on defensive mode.
591		 */
592		for (i = 0; i < tcps->tcps_sc_cnt; i++)
593			tot_conn += tcps->tcps_sc[i]->tcp_sc_conn_cnt;
594		if (tot_conn < maxusers) {
595			netstack_rele(ns);
596			continue;
597		}
598
599		mutex_enter(&tcps->tcps_reclaim_lock);
600		if (!tcps->tcps_reclaim) {
601			tcps->tcps_reclaim = B_TRUE;
602			tcps->tcps_reclaim_tid = timeout(tcp_reclaim_timer,
603			    tcps, MSEC_TO_TICK(tcps->tcps_reclaim_period));
604			TCP_STAT(tcps, tcp_reclaim_cnt);
605		}
606		mutex_exit(&tcps->tcps_reclaim_lock);
607		netstack_rele(ns);
608	}
609	netstack_next_fini(&nh);
610}
611
612/*
613 * Given a tcp_stack_t and a port (in host byte order), find a listener
614 * configuration for that port and return the ratio.
615 */
616uint32_t
617tcp_find_listener_conf(tcp_stack_t *tcps, in_port_t port)
618{
619	tcp_listener_t	*tl;
620	uint32_t ratio = 0;
621
622	mutex_enter(&tcps->tcps_listener_conf_lock);
623	for (tl = list_head(&tcps->tcps_listener_conf); tl != NULL;
624	    tl = list_next(&tcps->tcps_listener_conf, tl)) {
625		if (tl->tl_port == port) {
626			ratio = tl->tl_ratio;
627			break;
628		}
629	}
630	mutex_exit(&tcps->tcps_listener_conf_lock);
631	return (ratio);
632}
633
634/*
635 * To remove all listener limit configuration in a tcp_stack_t.
636 */
637void
638tcp_listener_conf_cleanup(tcp_stack_t *tcps)
639{
640	tcp_listener_t	*tl;
641
642	mutex_enter(&tcps->tcps_listener_conf_lock);
643	while ((tl = list_head(&tcps->tcps_listener_conf)) != NULL) {
644		list_remove(&tcps->tcps_listener_conf, tl);
645		kmem_free(tl, sizeof (tcp_listener_t));
646	}
647	mutex_destroy(&tcps->tcps_listener_conf_lock);
648	list_destroy(&tcps->tcps_listener_conf);
649}
650
651/*
652 * Call back function for CPU state change.
653 */
654/* ARGSUSED */
655int
656tcp_cpu_update(cpu_setup_t what, int id, void *arg)
657{
658	cpu_t *cp;
659	netstack_handle_t nh;
660	netstack_t *ns;
661	tcp_stack_t *tcps;
662	int i;
663
664	ASSERT(MUTEX_HELD(&cpu_lock));
665	cp = cpu[id];
666
667	switch (what) {
668	case CPU_CONFIG:
669	case CPU_ON:
670	case CPU_INIT:
671	case CPU_CPUPART_IN:
672		netstack_next_init(&nh);
673		while ((ns = netstack_next(&nh)) != NULL) {
674			tcps = ns->netstack_tcp;
675			if (cp->cpu_seqid >= tcps->tcps_sc_cnt) {
676				for (i = tcps->tcps_sc_cnt; i <= cp->cpu_seqid;
677				    i++) {
678					ASSERT(tcps->tcps_sc[i] == NULL);
679					tcps->tcps_sc[i] = kmem_zalloc(
680					    sizeof (tcp_stats_cpu_t), KM_SLEEP);
681				}
682				membar_producer();
683				tcps->tcps_sc_cnt = cp->cpu_seqid + 1;
684			}
685			netstack_rele(ns);
686		}
687		netstack_next_fini(&nh);
688		break;
689	case CPU_UNCONFIG:
690	case CPU_OFF:
691	case CPU_CPUPART_OUT:
692		/* Nothing to do */
693		break;
694	default:
695		break;
696	}
697	return (0);
698}
699
700/*
701 * Diagnostic routine used to return a string associated with the tcp state.
702 * Note that if the caller does not supply a buffer, it will use an internal
703 * static string.  This means that if multiple threads call this function at
704 * the same time, output can be corrupted...  Note also that this function
705 * does not check the size of the supplied buffer.  The caller has to make
706 * sure that it is big enough.
707 */
708char *
709tcp_display(tcp_t *tcp, char *sup_buf, char format)
710{
711	char		buf1[30];
712	static char	priv_buf[INET6_ADDRSTRLEN * 2 + 80];
713	char		*buf;
714	char		*cp;
715	in6_addr_t	local, remote;
716	char		local_addrbuf[INET6_ADDRSTRLEN];
717	char		remote_addrbuf[INET6_ADDRSTRLEN];
718	conn_t		*connp;
719
720	if (sup_buf != NULL)
721		buf = sup_buf;
722	else
723		buf = priv_buf;
724
725	if (tcp == NULL)
726		return ("NULL_TCP");
727
728	connp = tcp->tcp_connp;
729	switch (tcp->tcp_state) {
730	case TCPS_CLOSED:
731		cp = "TCP_CLOSED";
732		break;
733	case TCPS_IDLE:
734		cp = "TCP_IDLE";
735		break;
736	case TCPS_BOUND:
737		cp = "TCP_BOUND";
738		break;
739	case TCPS_LISTEN:
740		cp = "TCP_LISTEN";
741		break;
742	case TCPS_SYN_SENT:
743		cp = "TCP_SYN_SENT";
744		break;
745	case TCPS_SYN_RCVD:
746		cp = "TCP_SYN_RCVD";
747		break;
748	case TCPS_ESTABLISHED:
749		cp = "TCP_ESTABLISHED";
750		break;
751	case TCPS_CLOSE_WAIT:
752		cp = "TCP_CLOSE_WAIT";
753		break;
754	case TCPS_FIN_WAIT_1:
755		cp = "TCP_FIN_WAIT_1";
756		break;
757	case TCPS_CLOSING:
758		cp = "TCP_CLOSING";
759		break;
760	case TCPS_LAST_ACK:
761		cp = "TCP_LAST_ACK";
762		break;
763	case TCPS_FIN_WAIT_2:
764		cp = "TCP_FIN_WAIT_2";
765		break;
766	case TCPS_TIME_WAIT:
767		cp = "TCP_TIME_WAIT";
768		break;
769	default:
770		(void) mi_sprintf(buf1, "TCPUnkState(%d)", tcp->tcp_state);
771		cp = buf1;
772		break;
773	}
774	switch (format) {
775	case DISP_ADDR_AND_PORT:
776		if (connp->conn_ipversion == IPV4_VERSION) {
777			/*
778			 * Note that we use the remote address in the tcp_b
779			 * structure.  This means that it will print out
780			 * the real destination address, not the next hop's
781			 * address if source routing is used.
782			 */
783			IN6_IPADDR_TO_V4MAPPED(connp->conn_laddr_v4, &local);
784			IN6_IPADDR_TO_V4MAPPED(connp->conn_faddr_v4, &remote);
785
786		} else {
787			local = connp->conn_laddr_v6;
788			remote = connp->conn_faddr_v6;
789		}
790		(void) inet_ntop(AF_INET6, &local, local_addrbuf,
791		    sizeof (local_addrbuf));
792		(void) inet_ntop(AF_INET6, &remote, remote_addrbuf,
793		    sizeof (remote_addrbuf));
794		(void) mi_sprintf(buf, "[%s.%u, %s.%u] %s",
795		    local_addrbuf, ntohs(connp->conn_lport), remote_addrbuf,
796		    ntohs(connp->conn_fport), cp);
797		break;
798	case DISP_PORT_ONLY:
799	default:
800		(void) mi_sprintf(buf, "[%u, %u] %s",
801		    ntohs(connp->conn_lport), ntohs(connp->conn_fport), cp);
802		break;
803	}
804
805	return (buf);
806}
807