1/*-
2 * Copyright (c) 2016-2018 Netflix, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27/*
28 * Author: Lawrence Stewart <lstewart@netflix.com>
29 */
30
31#include <sys/param.h>
32#include <sys/arb.h>
33#include <sys/errno.h>
34#include <sys/malloc.h>
35#include <sys/qmath.h>
36#include <sys/queue.h>
37#include <sys/socket.h>
38#include <sys/socketvar.h>
39#include <sys/sysctl.h>
40#ifdef _KERNEL
41#include <sys/kernel.h>
42#include <sys/lock.h>
43#include <sys/rmlock.h>
44#include <sys/systm.h>
45#endif
46#include <sys/stats.h>
47
48#include <net/vnet.h>
49
50#include <netinet/in.h>
51#include <netinet/in_pcb.h>
52#include <netinet/tcp.h>
53#include <netinet/tcp_var.h>
54
55#include <netinet/cc/cc.h>
56
57VNET_DEFINE(int, tcp_perconn_stats_dflt_tpl) = -1;
58
59#ifndef _KERNEL
60#define	V_tcp_perconn_stats_enable	VNET(tcp_perconn_stats_enable)
61#define	V_tcp_perconn_stats_dflt_tpl	VNET(tcp_perconn_stats_dflt_tpl)
62#else /* _KERNEL */
63
64VNET_DEFINE(int, tcp_perconn_stats_enable) = 2;
65VNET_DEFINE_STATIC(struct stats_tpl_sample_rate *, tcp_perconn_stats_sample_rates);
66VNET_DEFINE_STATIC(int, tcp_stats_nrates) = 0;
67#define	V_tcp_perconn_stats_sample_rates VNET(tcp_perconn_stats_sample_rates)
68#define	V_tcp_stats_nrates		VNET(tcp_stats_nrates)
69
70static struct rmlock tcp_stats_tpl_sampling_lock;
71static int tcp_stats_tpl_sr_cb(enum stats_tpl_sr_cb_action action,
72    struct stats_tpl_sample_rate **rates, int *nrates, void *ctx);
73
74SYSCTL_INT(_net_inet_tcp, OID_AUTO, perconn_stats_enable,
75    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_perconn_stats_enable), 0,
76    "Enable per-connection TCP stats gathering; 1 enables for all connections, "
77    "2 enables random sampling across log id connection groups");
78SYSCTL_PROC(_net_inet_tcp, OID_AUTO, perconn_stats_sample_rates,
79    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_NEEDGIANT, tcp_stats_tpl_sr_cb,
80    sizeof(struct rm_priotracker), stats_tpl_sample_rates, "A",
81    "TCP stats per template random sampling rates, in CSV tpl_spec=percent "
82    "key-value pairs (see stats(9) for template spec details)");
83#endif /* _KERNEL */
84
85#ifdef _KERNEL
86int
87#else
88static int
89/* Ensure all templates are also added to the userland template list. */
90__attribute__ ((constructor))
91#endif
92tcp_stats_init(void)
93{
94	int err, lasterr;
95
96	err = lasterr = 0;
97
98	V_tcp_perconn_stats_dflt_tpl = stats_tpl_alloc("TCP_DEFAULT", 0);
99	if (V_tcp_perconn_stats_dflt_tpl < 0)
100		return (-V_tcp_perconn_stats_dflt_tpl);
101
102	struct voistatspec vss_sum[] = {
103		STATS_VSS_SUM(),
104	};
105	err |= stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl,
106	    VOI_TCP_TXPB, "TCP_TXPB", VSD_DTYPE_INT_U64,
107	    NVSS(vss_sum), vss_sum, 0);
108	lasterr = err ? err : lasterr;
109	err |= stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl,
110	    VOI_TCP_RETXPB, "TCP_RETXPB", VSD_DTYPE_INT_U32,
111	    NVSS(vss_sum), vss_sum, 0);
112	lasterr = err ? err : lasterr;
113
114	struct voistatspec vss_max[] = {
115		STATS_VSS_MAX(),
116	};
117	err |= stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl,
118	    VOI_TCP_FRWIN, "TCP_FRWIN", VSD_DTYPE_INT_ULONG,
119	    NVSS(vss_max), vss_max, 0);
120	lasterr = err ? err : lasterr;
121	err |= stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl,
122	    VOI_TCP_LCWIN, "TCP_LCWIN", VSD_DTYPE_INT_ULONG,
123	    NVSS(vss_max), vss_max, 0);
124	lasterr = err ? err : lasterr;
125
126	struct voistatspec vss_rtt[] = {
127		STATS_VSS_MAX(),
128		STATS_VSS_MIN(),
129		STATS_VSS_TDGSTCLUST32(20, 4),
130	};
131	err |= stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl,
132	    VOI_TCP_RTT, "TCP_RTT", VSD_DTYPE_INT_U32,
133	    NVSS(vss_rtt), vss_rtt, 0);
134	lasterr = err ? err : lasterr;
135
136	struct voistatspec vss_congsig[] = {
137		STATS_VSS_DVHIST32_USR(HBKTS(DVBKT(CC_ECN), DVBKT(CC_RTO),
138		    DVBKT(CC_RTO_ERR), DVBKT(CC_NDUPACK)), 0)
139	};
140	err |= stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl,
141	    VOI_TCP_CSIG, "TCP_CSIG", VSD_DTYPE_INT_U32,
142	    NVSS(vss_congsig), vss_congsig, 0);
143	lasterr = err ? err : lasterr;
144
145	struct voistatspec vss_gput[] = {
146		STATS_VSS_MAX(),
147		STATS_VSS_TDGSTCLUST32(20, 4),
148	};
149	err |= stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl,
150	    VOI_TCP_GPUT, "TCP_GPUT", VSD_DTYPE_INT_U32,
151	    NVSS(vss_gput), vss_gput, 0);
152	lasterr = err ? err : lasterr;
153
154	struct voistatspec vss_gput_nd[] = {
155		STATS_VSS_TDGSTCLUST32(10, 4),
156	};
157	err |= stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl,
158	    VOI_TCP_GPUT_ND, "TCP_GPUT_ND", VSD_DTYPE_INT_S32,
159	    NVSS(vss_gput_nd), vss_gput_nd, 0);
160	lasterr = err ? err : lasterr;
161
162	struct voistatspec vss_windiff[] = {
163		STATS_VSS_CRHIST32_USR(HBKTS(CRBKT(0)), VSD_HIST_LBOUND_INF)
164	};
165	err |= stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl,
166	    VOI_TCP_CALCFRWINDIFF, "TCP_CALCFRWINDIFF", VSD_DTYPE_INT_S32,
167	    NVSS(vss_windiff), vss_windiff, 0);
168	lasterr = err ? err : lasterr;
169
170	struct voistatspec vss_acklen[] = {
171		STATS_VSS_MAX(),
172		STATS_VSS_CRHIST32_LIN(0, 9, 1, VSD_HIST_UBOUND_INF)
173	};
174	err |= stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl,
175	    VOI_TCP_ACKLEN, "TCP_ACKLEN", VSD_DTYPE_INT_U32,
176	    NVSS(vss_acklen), vss_acklen, 0);
177	lasterr = err ? err : lasterr;
178
179	return (lasterr);
180}
181
182#ifdef _KERNEL
183int
184tcp_stats_sample_rollthedice(struct tcpcb *tp, void *seed_bytes,
185    size_t seed_len)
186{
187	struct rm_priotracker tracker;
188	int tpl;
189
190	tpl = -1;
191
192	if (V_tcp_stats_nrates > 0) {
193		rm_rlock(&tcp_stats_tpl_sampling_lock, &tracker);
194		tpl = stats_tpl_sample_rollthedice(V_tcp_perconn_stats_sample_rates,
195		    V_tcp_stats_nrates, seed_bytes, seed_len);
196		rm_runlock(&tcp_stats_tpl_sampling_lock, &tracker);
197
198		if (tpl >= 0) {
199			INP_WLOCK_ASSERT(tptoinpcb(tp));
200			if (tp->t_stats != NULL)
201				stats_blob_destroy(tp->t_stats);
202			tp->t_stats = stats_blob_alloc(tpl, 0);
203			if (tp->t_stats == NULL)
204				tpl = -ENOMEM;
205		}
206	}
207
208	return (tpl);
209}
210
211/*
212 * Callback function for stats_tpl_sample_rates() to interact with the TCP
213 * subsystem's stats template sample rates list.
214 */
215int
216tcp_stats_tpl_sr_cb(enum stats_tpl_sr_cb_action action,
217    struct stats_tpl_sample_rate **rates, int *nrates, void *ctx)
218{
219	struct stats_tpl_sample_rate *old_rates;
220	int old_nrates;
221
222	if (ctx == NULL)
223		return (ENOMEM);
224
225	switch (action) {
226	case TPL_SR_RLOCKED_GET:
227		/*
228		 * Return with rlock held i.e. this call must be paired with a
229		 * "action == TPL_SR_RUNLOCK" call.
230		 */
231		rm_assert(&tcp_stats_tpl_sampling_lock, RA_UNLOCKED);
232		rm_rlock(&tcp_stats_tpl_sampling_lock,
233		    (struct rm_priotracker *)ctx);
234		/* FALLTHROUGH */
235	case TPL_SR_UNLOCKED_GET:
236		if (rates != NULL)
237			*rates = V_tcp_perconn_stats_sample_rates;
238		if (nrates != NULL)
239			*nrates = V_tcp_stats_nrates;
240		break;
241	case TPL_SR_RUNLOCK:
242		rm_assert(&tcp_stats_tpl_sampling_lock, RA_RLOCKED);
243		rm_runlock(&tcp_stats_tpl_sampling_lock,
244		    (struct rm_priotracker *)ctx);
245		break;
246	case TPL_SR_PUT:
247		KASSERT(rates != NULL && nrates != NULL,
248		    ("%s: PUT without new rates", __func__));
249		rm_assert(&tcp_stats_tpl_sampling_lock, RA_UNLOCKED);
250		if (rates == NULL || nrates == NULL)
251			return (EINVAL);
252		rm_wlock(&tcp_stats_tpl_sampling_lock);
253		old_rates = V_tcp_perconn_stats_sample_rates;
254		old_nrates = V_tcp_stats_nrates;
255		V_tcp_perconn_stats_sample_rates = *rates;
256		V_tcp_stats_nrates = *nrates;
257		rm_wunlock(&tcp_stats_tpl_sampling_lock);
258		*rates = old_rates;
259		*nrates = old_nrates;
260		break;
261	default:
262		return (EINVAL);
263		break;
264	}
265
266	return (0);
267}
268
269RM_SYSINIT(tcp_stats_tpl_sampling_lock, &tcp_stats_tpl_sampling_lock,
270    "tcp_stats_tpl_sampling_lock");
271#endif /* _KERNEL */
272