/*-
 * Copyright (c) 2015
 *	Jonathan Looney. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/queue.h>
#include <sys/param.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/eventhandler.h>
#include <machine/atomic.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_pcap.h>

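/*
 * Leading space in an mbuf's data buffer: the number of bytes between the
 * start of the buffer (M_START()) and the current data pointer. Unlike
 * M_LEADINGSPACE(), this does not require the mbuf to be writable.
 */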
#define M_LEADINGSPACE_NOWRITE(m)					\
	((m)->m_data - M_START(m))

int tcp_pcap_aggressive_free = 1;
static int tcp_pcap_clusters_referenced_cur = 0;
static int tcp_pcap_clusters_referenced_max = 0;

SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_aggressive_free,
	CTLFLAG_RW, &tcp_pcap_aggressive_free, 0,
	"Free saved packets when the memory system comes under pressure");
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_clusters_referenced_cur,
	CTLFLAG_RD, &tcp_pcap_clusters_referenced_cur, 0,
	"Number of clusters currently referenced on TCP PCAP queues");
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_clusters_referenced_max,
	CTLFLAG_RW, &tcp_pcap_clusters_referenced_max, 0,
	"Maximum number of clusters allowed to be referenced on TCP PCAP "
	"queues");

static int tcp_pcap_alloc_reuse_ext = 0;
static int tcp_pcap_alloc_reuse_mbuf = 0;
static int tcp_pcap_alloc_new_mbuf = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_alloc_reuse_ext,
	CTLFLAG_RD, &tcp_pcap_alloc_reuse_ext, 0,
	"Number of mbufs with external storage reused for the TCP PCAP "
	"functionality");
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_alloc_reuse_mbuf,
	CTLFLAG_RD, &tcp_pcap_alloc_reuse_mbuf, 0,
	"Number of mbufs with internal storage reused for the TCP PCAP "
	"functionality");
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_alloc_new_mbuf,
	CTLFLAG_RD, &tcp_pcap_alloc_new_mbuf, 0,
	"Number of new mbufs allocated for the TCP PCAP functionality");

VNET_DEFINE(int, tcp_pcap_packets) = 0;
#define V_tcp_pcap_packets	VNET(tcp_pcap_packets)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_packets,
	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_pcap_packets), 0,
	"Default number of packets saved per direction per TCPCB");
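
/*
 * The knobs above live under net.inet.tcp and the writable ones can be
 * adjusted at run time with sysctl(8), e.g.
 * "sysctl net.inet.tcp.tcp_pcap_packets=50" (the value is only an example)
 * changes the default number of packets saved per direction for new
 * connections.
 */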

/* Initialize the values. */
static void
tcp_pcap_max_set(void)
{

	tcp_pcap_clusters_referenced_max = nmbclusters / 4;
}

void
tcp_pcap_init(void)
{

	tcp_pcap_max_set();
	EVENTHANDLER_REGISTER(nmbclusters_change, tcp_pcap_max_set,
		NULL, EVENTHANDLER_PRI_ANY);
}

/*
 * If we are below the maximum allowed cluster references,
 * increment the reference count and return TRUE. Otherwise,
 * leave the reference count alone and return FALSE.
 */
static __inline bool
tcp_pcap_take_cluster_reference(void)
{
	if (atomic_fetchadd_int(&tcp_pcap_clusters_referenced_cur, 1) >=
		tcp_pcap_clusters_referenced_max) {
		atomic_add_int(&tcp_pcap_clusters_referenced_cur, -1);
		return FALSE;
	}
	return TRUE;
}

/*
 * For all the external entries in m, apply the given adjustment.
 * This can be used to adjust the counter when an mbuf chain is
 * copied or freed.
 */
static __inline void
tcp_pcap_adj_cluster_reference(struct mbuf *m, int adj)
{
	while (m) {
		if (m->m_flags & M_EXT)
			atomic_add_int(&tcp_pcap_clusters_referenced_cur, adj);

		m = m->m_next;
	}
}

/*
 * Free all mbufs in a chain, decrementing the reference count as
 * necessary.
 *
 * Functions in this file should use this instead of m_freem() when
 * they are freeing mbuf chains that may contain clusters that were
 * already included in tcp_pcap_clusters_referenced_cur.
 */
static void
tcp_pcap_m_freem(struct mbuf *mb)
{
	while (mb != NULL) {
		if (mb->m_flags & M_EXT)
			atomic_subtract_int(&tcp_pcap_clusters_referenced_cur,
			    1);
		mb = m_free(mb);
	}
}

/*
 * Copy data from m to n, where n cannot fit all the data we might
 * want from m.
 *
 * Prioritize data like this:
 * 1. TCP header
 * 2. IP header
 * 3. Data
 */
static void
tcp_pcap_copy_bestfit(struct tcphdr *th, struct mbuf *m, struct mbuf *n)
{
	struct mbuf *m_cur = m;
	int bytes_to_copy = 0, trailing_data, skip = 0, tcp_off;

	/* Below, we assume these will be non-NULL. */
	KASSERT(th, ("%s: called with th == NULL", __func__));
	KASSERT(m, ("%s: called with m == NULL", __func__));
	KASSERT(n, ("%s: called with n == NULL", __func__));

	/* We assume this initialization occurred elsewhere. */
	KASSERT(n->m_len == 0, ("%s: called with n->m_len=%d (expected 0)",
		__func__, n->m_len));
	KASSERT(n->m_data == M_START(n),
		("%s: called with n->m_data != M_START(n)", __func__));

	/*
	 * Calculate the size of the TCP header. We use this often
	 * enough that it is worth just calculating at the start.
	 */
	tcp_off = th->th_off << 2;

	/* Trim off leading empty mbufs. */
	while (m && m->m_len == 0)
		m = m->m_next;

	if (m) {
		m_cur = m;
	}
	else {
		/*
		 * No data? Highly unusual; we would expect to at
		 * least see a TCP header in the mbuf. Since we do
		 * have a pointer to the TCP header, fall back to
		 * copying just that.
		 */
fallback:
		bytes_to_copy = tcp_off;
		if (bytes_to_copy > M_SIZE(n))
			bytes_to_copy = M_SIZE(n);
		bcopy(th, n->m_data, bytes_to_copy);
		n->m_len = bytes_to_copy;
		return;
	}

	/*
	 * Find the TCP header. Record the total number of bytes up to,
	 * and including, the TCP header.
	 */
	while (m_cur) {
		if ((caddr_t) th >= (caddr_t) m_cur->m_data &&
			(caddr_t) th < (caddr_t) (m_cur->m_data + m_cur->m_len))
			break;
		bytes_to_copy += m_cur->m_len;
		m_cur = m_cur->m_next;
	}
	if (m_cur)
		bytes_to_copy += (caddr_t) th - (caddr_t) m_cur->m_data;
	else
		goto fallback;
	bytes_to_copy += tcp_off;

	/*
	 * If we already want to copy more bytes than we can hold
	 * in the destination mbuf, skip leading bytes and copy
	 * what we can.
	 *
	 * Otherwise, consider trailing data.
	 */
	if (bytes_to_copy > M_SIZE(n)) {
		skip = bytes_to_copy - M_SIZE(n);
		bytes_to_copy = M_SIZE(n);
	}
	else {
		/*
		 * Determine how much trailing data is in the chain.
		 * We start with the length of this mbuf (the one
		 * containing th) and subtract the size of the TCP
		 * header (tcp_off) and the size of the data prior
		 * to th (th - m_cur->m_data).
		 *
		 * This *should not* be negative, as the TCP code
		 * should put the whole TCP header in a single
		 * mbuf. But, it isn't a problem if it is. We will
		 * simply work off our negative balance as we look
		 * at subsequent mbufs.
		 */
		trailing_data = m_cur->m_len - tcp_off;
		trailing_data -= (caddr_t) th - (caddr_t) m_cur->m_data;
		m_cur = m_cur->m_next;
		while (m_cur) {
			trailing_data += m_cur->m_len;
			m_cur = m_cur->m_next;
		}
		if ((bytes_to_copy + trailing_data) > M_SIZE(n))
			bytes_to_copy = M_SIZE(n);
		else
			bytes_to_copy += trailing_data;
	}

	m_copydata(m, skip, bytes_to_copy, n->m_data);
	n->m_len = bytes_to_copy;
}

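/*
 * Copy the relevant information from the packet described by th and m
 * into a single mbuf and append that mbuf to the given capture queue.
 * If the queue is already full, the oldest saved packet is recycled
 * (or freed) to make room for the new one.
 */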
void
tcp_pcap_add(struct tcphdr *th, struct mbuf *m, struct mbufq *queue)
{
	struct mbuf *n = NULL, *mhead;

	KASSERT(th, ("%s: called with th == NULL", __func__));
	KASSERT(m, ("%s: called with m == NULL", __func__));
	KASSERT(queue, ("%s: called with queue == NULL", __func__));

	/* We only care about data packets. */
	while (m && m->m_type != MT_DATA)
		m = m->m_next;

	/* We only need to do something if we still have an mbuf. */
	if (!m)
		return;

	/* If we are not saving mbufs, return now. */
	if (queue->mq_maxlen == 0)
		return;

	/*
	 * Check to see if we will need to recycle mbufs.
	 *
	 * If we need to get rid of mbufs to stay below
	 * our packet count, try to reuse the mbuf. Once
	 * we already have a new mbuf (n), then we can
	 * simply free subsequent mbufs.
	 *
	 * Note that most of the logic in here is to deal
	 * with the reuse. If we are fine with constant
	 * mbuf allocs/deallocs, we could ditch this logic.
	 * But, it only seems to make sense to reuse
	 * mbufs we already have.
	 */
	while (mbufq_full(queue)) {
		mhead = mbufq_dequeue(queue);

		if (n) {
			tcp_pcap_m_freem(mhead);
		}
		else {
			/*
			 * If this held an external cluster, try to
			 * detach the cluster. But, if we held the
			 * last reference, go through the normal
			 * freeing process.
			 */
			if (mhead->m_flags & M_EXTPG) {
				/* Don't mess around with these. */
				tcp_pcap_m_freem(mhead);
				continue;
			} else if (mhead->m_flags & M_EXT) {
				switch (mhead->m_ext.ext_type) {
				case EXT_SFBUF:
					/* Don't mess around with these. */
					tcp_pcap_m_freem(mhead);
					continue;
				default:
					if (atomic_fetchadd_int(
						mhead->m_ext.ext_cnt, -1) == 1)
					{
						/*
						 * We held the last reference
						 * on this cluster. Restore
						 * the reference count and put
						 * it back in the pool.
						 */
						*(mhead->m_ext.ext_cnt) = 1;
						tcp_pcap_m_freem(mhead);
						continue;
					}
					/*
					 * We were able to cleanly free the
					 * reference.
					 */
					atomic_subtract_int(
					    &tcp_pcap_clusters_referenced_cur,
					    1);
					tcp_pcap_alloc_reuse_ext++;
					break;
				}
			} else {
				tcp_pcap_alloc_reuse_mbuf++;
			}

			n = mhead;
			tcp_pcap_m_freem(n->m_next);
			m_init(n, M_NOWAIT, MT_DATA, 0);
		}
	}

	/* Check to see if we need to get a new mbuf. */
	if (!n) {
		if (!(n = m_get(M_NOWAIT, MT_DATA)))
			return;
		tcp_pcap_alloc_new_mbuf++;
	}

	/*
	 * What are we dealing with? If a cluster, attach it. Otherwise,
	 * try to copy the data from the beginning of the mbuf to the
	 * end of data. (There may be data between the start of the data
	 * area and the current data pointer. We want to get this, because
	 * it may contain header information that is useful.)
	 * In cases where that isn't possible, settle for what we can
	 * get.
	 */
	if ((m->m_flags & (M_EXT | M_EXTPG)) &&
	    tcp_pcap_take_cluster_reference()) {
		n->m_data = m->m_data;
		n->m_len = m->m_len;
		mb_dupcl(n, m);
	}
	else if (((m->m_data + m->m_len) - M_START(m)) <= M_SIZE(n)) {
		/*
		 * At this point, n is guaranteed to be a normal mbuf
		 * with no cluster and no packet header. Because the
		 * logic in this code block requires this, the assert
		 * is here to catch any instances where someone
		 * changes the logic to invalidate that assumption.
		 */
		KASSERT((n->m_flags & (M_EXT | M_PKTHDR)) == 0,
			("%s: Unexpected flags (%#x) for mbuf",
			__func__, n->m_flags));
		n->m_data = n->m_dat + M_LEADINGSPACE_NOWRITE(m);
		n->m_len = m->m_len;
		if (m->m_flags & M_EXTPG)
			m_copydata(m, 0, m->m_len, n->m_data);
		else
			bcopy(M_START(m), n->m_dat,
			    m->m_len + M_LEADINGSPACE_NOWRITE(m));
	}
	else {
		/*
		 * This is the case where we need to "settle for what
		 * we can get". The most probable way to reach this code
		 * path is that we've already taken references to the
		 * maximum number of mbuf clusters we can, and the data
		 * is too long to fit in an mbuf's internal storage.
		 * Try for a "best fit".
		 */
		tcp_pcap_copy_bestfit(th, m, n);

		/* Don't try to get additional data. */
		goto add_to_queue;
	}

	if (m->m_next) {
		n->m_next = m_copym(m->m_next, 0, M_COPYALL, M_NOWAIT);
		tcp_pcap_adj_cluster_reference(n->m_next, 1);
	}

add_to_queue:
	/* Add the new mbuf to the list. */
	if (mbufq_enqueue(queue, n)) {
		/* This shouldn't happen. If INVARIANTS is defined, panic. */
		KASSERT(0, ("%s: mbufq was unexpectedly full!", __func__));
		tcp_pcap_m_freem(n);
	}
}

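/*
 * Free every packet currently saved on the given capture queue.
 */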
void
tcp_pcap_drain(struct mbufq *queue)
{
	struct mbuf *m;
	while ((m = mbufq_dequeue(queue)))
		tcp_pcap_m_freem(m);
}

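/*
 * Initialize the send and receive capture queues for a new connection,
 * using the current default limit (V_tcp_pcap_packets) for each.
 */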
void
tcp_pcap_tcpcb_init(struct tcpcb *tp)
{
	mbufq_init(&(tp->t_inpkts), V_tcp_pcap_packets);
	mbufq_init(&(tp->t_outpkts), V_tcp_pcap_packets);
}

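/*
 * Set the maximum number of saved packets for this queue and, if the
 * queue is now over the limit, free the oldest saved packets until it
 * fits again.
 */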
void
tcp_pcap_set_sock_max(struct mbufq *queue, int newval)
{
	queue->mq_maxlen = newval;
	while (queue->mq_len > queue->mq_maxlen)
		tcp_pcap_m_freem(mbufq_dequeue(queue));
}

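/*
 * Return the maximum number of packets that may be saved on this queue.
 */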
int
tcp_pcap_get_sock_max(struct mbufq *queue)
{
	return queue->mq_maxlen;
}