/*-
 * Copyright (c) 2015 Gleb Smirnoff <glebius@FreeBSD.org>
 * Copyright (c) 2015 Adrian Chadd <adrian@FreeBSD.org>
 * Copyright (c) 1982, 1986, 1988, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include "opt_rss.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/hash.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysctl.h>
#include <sys/socket.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/if_private.h>
#include <net/rss_config.h>
#include <net/netisr.h>
#include <net/vnet.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/in_rss.h>
#ifdef MAC
#include <security/mac/mac_framework.h>
#endif

SYSCTL_DECL(_net_inet_ip);

/*
 * Reassembly headers are stored in hash buckets.
 */
#define	IPREASS_NHASH_LOG2	10
#define	IPREASS_NHASH		(1 << IPREASS_NHASH_LOG2)
#define	IPREASS_HMASK		(V_ipq_hashsize - 1)

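/*
 * Each bucket has its own lock, a count of entries and a callout that
 * fires when the oldest queue in the bucket is due to expire.  Queues
 * are inserted at the head of the list, so the tail is always the
 * oldest entry.
 */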
struct ipqbucket {
	TAILQ_HEAD(ipqhead, ipq) head;
	struct mtx		 lock;
	struct callout		 timer;
#ifdef VIMAGE
	struct vnet		 *vnet;
#endif
	int			 count;
};

VNET_DEFINE_STATIC(struct ipqbucket *, ipq);
#define	V_ipq		VNET(ipq)
VNET_DEFINE_STATIC(uint32_t, ipq_hashseed);
#define	V_ipq_hashseed	VNET(ipq_hashseed)
VNET_DEFINE_STATIC(uint32_t, ipq_hashsize);
#define	V_ipq_hashsize	VNET(ipq_hashsize)

#define	IPQ_LOCK(i)	mtx_lock(&V_ipq[i].lock)
#define	IPQ_TRYLOCK(i)	mtx_trylock(&V_ipq[i].lock)
#define	IPQ_UNLOCK(i)	mtx_unlock(&V_ipq[i].lock)
#define	IPQ_LOCK_ASSERT(i)	mtx_assert(&V_ipq[i].lock, MA_OWNED)
#define	IPQ_BUCKET_LOCK_ASSERT(b)	mtx_assert(&(b)->lock, MA_OWNED)

VNET_DEFINE_STATIC(int, ipreass_maxbucketsize);
#define	V_ipreass_maxbucketsize	VNET(ipreass_maxbucketsize)

void		ipreass_init(void);
void		ipreass_vnet_init(void);
#ifdef VIMAGE
void		ipreass_destroy(void);
#endif
static int	sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS);
static int	sysctl_maxfragbucketsize(SYSCTL_HANDLER_ARGS);
static int	sysctl_fragttl(SYSCTL_HANDLER_ARGS);
static void	ipreass_zone_change(void *);
static void	ipreass_drain_tomax(void);
static void	ipq_free(struct ipqbucket *, struct ipq *);
static struct ipq * ipq_reuse(int);
static void	ipreass_callout(void *);
static void	ipreass_reschedule(struct ipqbucket *);

static inline void
ipq_timeout(struct ipqbucket *bucket, struct ipq *fp)
{

	IPSTAT_ADD(ips_fragtimeout, fp->ipq_nfrags);
	ipq_free(bucket, fp);
}

static inline void
ipq_drop(struct ipqbucket *bucket, struct ipq *fp)
{

	IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags);
	ipq_free(bucket, fp);
	ipreass_reschedule(bucket);
}

/*
 * By default, limit the number of IP fragments across all reassembly
 * queues to 1/32 of the total number of mbuf clusters.
 *
 * Limit the total number of reassembly queues per VNET to the
 * IP fragment limit, but ensure the limit will not allow any bucket
 * to grow above 100 items.  (The bucket limit is
 * IP_MAXFRAGPACKETS / (V_ipq_hashsize / 2), so 50 is the correct
 * multiplier to reach a 100-item limit.)
 * The 100-item limit was chosen because brief testing suggests that it
 * produces "reasonable" performance on some subset of systems under
 * DoS attack.
 */
#define	IP_MAXFRAGS		(nmbclusters / 32)
#define	IP_MAXFRAGPACKETS	(imin(IP_MAXFRAGS, V_ipq_hashsize * 50))

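/*
 * Global (all-VNET) fragment accounting.  nfrags counts fragments queued
 * across all reassembly queues; it is updated with atomics and placed on
 * its own cache line to avoid false sharing.
 */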
static int		maxfrags;
static u_int __exclusive_cache_line	nfrags;
SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfrags, CTLFLAG_RW,
    &maxfrags, 0,
    "Maximum number of IPv4 fragments allowed across all reassembly queues");
SYSCTL_UINT(_net_inet_ip, OID_AUTO, curfrags, CTLFLAG_RD,
    &nfrags, 0,
    "Current number of IPv4 fragments across all reassembly queues");

VNET_DEFINE_STATIC(uma_zone_t, ipq_zone);
#define	V_ipq_zone	VNET(ipq_zone)

SYSCTL_UINT(_net_inet_ip, OID_AUTO, reass_hashsize,
    CTLFLAG_VNET | CTLFLAG_RDTUN, &VNET_NAME(ipq_hashsize), 0,
    "Size of IP fragment reassembly hashtable");

SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragpackets,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    NULL, 0, sysctl_maxfragpackets, "I",
    "Maximum number of IPv4 fragment reassembly queue entries");
SYSCTL_UMA_CUR(_net_inet_ip, OID_AUTO, fragpackets, CTLFLAG_VNET,
    &VNET_NAME(ipq_zone),
    "Current number of IPv4 fragment reassembly queue entries");

VNET_DEFINE_STATIC(int, noreass);
#define	V_noreass	VNET(noreass)

VNET_DEFINE_STATIC(int, maxfragsperpacket);
#define	V_maxfragsperpacket	VNET(maxfragsperpacket)
SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfragsperpacket, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(maxfragsperpacket), 0,
    "Maximum number of IPv4 fragments allowed per packet");
SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragbucketsize,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0,
    sysctl_maxfragbucketsize, "I",
    "Maximum number of IPv4 fragment reassembly queue entries per bucket");

VNET_DEFINE_STATIC(u_int, ipfragttl) = 30;
#define	V_ipfragttl	VNET(ipfragttl)
SYSCTL_PROC(_net_inet_ip, OID_AUTO, fragttl, CTLTYPE_INT | CTLFLAG_RW |
    CTLFLAG_MPSAFE | CTLFLAG_VNET, NULL, 0, sysctl_fragttl, "IU",
    "IP fragment life time on reassembly queue (seconds)");

/*
 * Take an incoming datagram fragment and try to reassemble it into a
 * whole datagram.  If the argument is the first fragment or one in the
 * middle, the function returns NULL and stores the mbuf in the fragment
 * chain.  If the argument is the last fragment, the packet is
 * reassembled and a pointer to the new mbuf is returned for further
 * processing.  Only m_tags attached to the first packet/fragment are
 * preserved.
 * The IP header is *NOT* adjusted out of iplen.
 */
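/* Set on a queued fragment mbuf if its header had IP_MF (more fragments). */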
#define	M_IP_FRAG	M_PROTO9
struct mbuf *
ip_reass(struct mbuf *m)
{
	struct ip *ip;
	struct mbuf *p, *q, *nq, *t;
	struct ipq *fp;
	struct ifnet *srcifp;
	struct ipqhead *head;
	int i, hlen, next, tmpmax;
	u_int8_t ecn, ecn0;
	uint32_t hash, hashkey[3];
#ifdef	RSS
	uint32_t rss_hash, rss_type;
#endif

	/*
	 * If reassembly is disabled or maxfragsperpacket is 0, never
	 * accept fragments.  Also, drop the packet if it would exceed
	 * the maximum total number of fragments.
	 */
	tmpmax = maxfrags;
	if (V_noreass == 1 || V_maxfragsperpacket == 0 ||
	    (tmpmax >= 0 && atomic_load_int(&nfrags) >= (u_int)tmpmax)) {
		IPSTAT_INC(ips_fragments);
		IPSTAT_INC(ips_fragdropped);
		m_freem(m);
		return (NULL);
	}

	ip = mtod(m, struct ip *);
	hlen = ip->ip_hl << 2;

	/*
	 * Adjust ip_len to not include the IP header; convert the
	 * fragment offset to bytes.
	 */
	ip->ip_len = htons(ntohs(ip->ip_len) - hlen);
	/*
	 * Make sure that fragments have a data length
	 * that's a non-zero multiple of 8 bytes, unless
	 * this is the last fragment.
	 */
	if (ip->ip_len == htons(0) ||
	    ((ip->ip_off & htons(IP_MF)) && (ntohs(ip->ip_len) & 0x7) != 0)) {
		IPSTAT_INC(ips_toosmall); /* XXX */
		IPSTAT_INC(ips_fragdropped);
		m_freem(m);
		return (NULL);
	}
	if (ip->ip_off & htons(IP_MF))
		m->m_flags |= M_IP_FRAG;
	else
		m->m_flags &= ~M_IP_FRAG;
	ip->ip_off = htons(ntohs(ip->ip_off) << 3);

	/*
	 * Make sure the fragment lies within a packet of valid size.
	 */
	if (ntohs(ip->ip_len) + ntohs(ip->ip_off) > IP_MAXPACKET) {
		IPSTAT_INC(ips_toolong);
		IPSTAT_INC(ips_fragdropped);
		m_freem(m);
		return (NULL);
	}

	/*
	 * Store receive network interface pointer for later.
	 */
	srcifp = m->m_pkthdr.rcvif;

	/*
	 * Count the fragment and remember where its IP header lives,
	 * since the data pointer is advanced past it just below.
	 */
	IPSTAT_INC(ips_fragments);
	m->m_pkthdr.PH_loc.ptr = ip;

	/*
	 * The presence of the IP header in the mbuf data would confuse
	 * the code below, so skip over it.
	 */
	m->m_data += hlen;
	m->m_len -= hlen;

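	/*
	 * Select the bucket: reassembly queues are keyed on the
	 * (source, destination, protocol, IP ID) tuple, hashed with a
	 * per-VNET seeded Jenkins hash.
	 */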
	hashkey[0] = ip->ip_src.s_addr;
	hashkey[1] = ip->ip_dst.s_addr;
	hashkey[2] = (uint32_t)ip->ip_p << 16;
	hashkey[2] += ip->ip_id;
	hash = jenkins_hash32(hashkey, nitems(hashkey), V_ipq_hashseed);
	hash &= IPREASS_HMASK;
	head = &V_ipq[hash].head;
	IPQ_LOCK(hash);

	/*
	 * Look for queue of fragments
	 * of this datagram.
	 */
	TAILQ_FOREACH(fp, head, ipq_list)
		if (ip->ip_id == fp->ipq_id &&
		    ip->ip_src.s_addr == fp->ipq_src.s_addr &&
		    ip->ip_dst.s_addr == fp->ipq_dst.s_addr &&
#ifdef MAC
		    mac_ipq_match(m, fp) &&
#endif
		    ip->ip_p == fp->ipq_p)
			break;
	/*
	 * If first fragment to arrive, create a reassembly queue.
	 */
	if (fp == NULL) {
		if (V_ipq[hash].count < V_ipreass_maxbucketsize)
			fp = uma_zalloc(V_ipq_zone, M_NOWAIT);
		if (fp == NULL)
			fp = ipq_reuse(hash);
		if (fp == NULL)
			goto dropfrag;
#ifdef MAC
		if (mac_ipq_init(fp, M_NOWAIT) != 0) {
			uma_zfree(V_ipq_zone, fp);
			fp = NULL;
			goto dropfrag;
		}
		mac_ipq_create(m, fp);
#endif
		TAILQ_INSERT_HEAD(head, fp, ipq_list);
		V_ipq[hash].count++;
		fp->ipq_nfrags = 1;
		atomic_add_int(&nfrags, 1);
		fp->ipq_expire = time_uptime + V_ipfragttl;
		fp->ipq_p = ip->ip_p;
		fp->ipq_id = ip->ip_id;
		fp->ipq_src = ip->ip_src;
		fp->ipq_dst = ip->ip_dst;
		fp->ipq_frags = m;
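		/*
		 * ipq_maxoff records the end of the datagram (last
		 * fragment's offset plus length) once the last fragment
		 * has been seen; -1 means it has not arrived yet.
		 */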
		if (m->m_flags & M_IP_FRAG)
			fp->ipq_maxoff = -1;
		else
			fp->ipq_maxoff = ntohs(ip->ip_off) + ntohs(ip->ip_len);
		m->m_nextpkt = NULL;
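		/*
		 * New queues are inserted at the head, so this queue is
		 * the tail only if the bucket was empty; in that case the
		 * expiry callout must be started.  Otherwise it is already
		 * running for an older queue.
		 */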
		if (fp == TAILQ_LAST(head, ipqhead))
			callout_reset_sbt(&V_ipq[hash].timer,
			    SBT_1S * V_ipfragttl, SBT_1S, ipreass_callout,
			    &V_ipq[hash], 0);
		else
			MPASS(callout_active(&V_ipq[hash].timer));
		goto done;
	} else {
		/*
		 * If we already saw the last fragment, make sure
		 * this fragment's offset looks sane. Otherwise, if
		 * this is the last fragment, record its endpoint.
		 */
		if (fp->ipq_maxoff > 0) {
			i = ntohs(ip->ip_off) + ntohs(ip->ip_len);
			if (((m->m_flags & M_IP_FRAG) && i >= fp->ipq_maxoff) ||
			    ((m->m_flags & M_IP_FRAG) == 0 &&
			    i != fp->ipq_maxoff)) {
				fp = NULL;
				goto dropfrag;
			}
		} else if ((m->m_flags & M_IP_FRAG) == 0)
			fp->ipq_maxoff = ntohs(ip->ip_off) + ntohs(ip->ip_len);
		fp->ipq_nfrags++;
		atomic_add_int(&nfrags, 1);
#ifdef MAC
		mac_ipq_update(m, fp);
#endif
	}

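/* Each queued fragment caches a pointer to its IP header in PH_loc. */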
#define GETIP(m)	((struct ip*)((m)->m_pkthdr.PH_loc.ptr))

	/*
	 * Handle ECN by comparing this segment with the first one; if
	 * CE is set, do not lose CE.  Drop the fragment if CE and
	 * not-ECT are mixed within the same packet.
	 */
	ecn = ip->ip_tos & IPTOS_ECN_MASK;
	ecn0 = GETIP(fp->ipq_frags)->ip_tos & IPTOS_ECN_MASK;
	if (ecn == IPTOS_ECN_CE) {
		if (ecn0 == IPTOS_ECN_NOTECT)
			goto dropfrag;
		if (ecn0 != IPTOS_ECN_CE)
			GETIP(fp->ipq_frags)->ip_tos |= IPTOS_ECN_CE;
	}
	if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT)
		goto dropfrag;

	/*
	 * Find a segment which begins after this one does.
	 */
	for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt)
		if (ntohs(GETIP(q)->ip_off) > ntohs(ip->ip_off))
			break;

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us, otherwise
	 * stick the new segment in the proper place.
	 *
	 * If some of the data is dropped from the incoming segment,
	 * then its checksum is invalidated.
	 */
	if (p) {
		i = ntohs(GETIP(p)->ip_off) + ntohs(GETIP(p)->ip_len) -
		    ntohs(ip->ip_off);
		if (i > 0) {
			if (i >= ntohs(ip->ip_len))
				goto dropfrag;
			m_adj(m, i);
			m->m_pkthdr.csum_flags = 0;
			ip->ip_off = htons(ntohs(ip->ip_off) + i);
			ip->ip_len = htons(ntohs(ip->ip_len) - i);
		}
		m->m_nextpkt = p->m_nextpkt;
		p->m_nextpkt = m;
	} else {
		m->m_nextpkt = fp->ipq_frags;
		fp->ipq_frags = m;
	}

	/*
	 * While we overlap succeeding segments, trim them or, if they
	 * are completely covered, dequeue them.
	 */
	for (; q != NULL && ntohs(ip->ip_off) + ntohs(ip->ip_len) >
	    ntohs(GETIP(q)->ip_off); q = nq) {
		i = (ntohs(ip->ip_off) + ntohs(ip->ip_len)) -
		    ntohs(GETIP(q)->ip_off);
		if (i < ntohs(GETIP(q)->ip_len)) {
			GETIP(q)->ip_len = htons(ntohs(GETIP(q)->ip_len) - i);
			GETIP(q)->ip_off = htons(ntohs(GETIP(q)->ip_off) + i);
			m_adj(q, i);
			q->m_pkthdr.csum_flags = 0;
			break;
		}
		nq = q->m_nextpkt;
		m->m_nextpkt = nq;
		IPSTAT_INC(ips_fragdropped);
		fp->ipq_nfrags--;
		atomic_subtract_int(&nfrags, 1);
		m_freem(q);
	}

	/*
	 * Check for complete reassembly and perform per-packet fragment
	 * limiting.
	 *
	 * Fragment limiting is performed here so that the nth fragment
	 * has a chance to complete the packet before we drop the packet.
	 * As a result, n+1 fragments are actually allowed per packet,
	 * but only n will ever be stored.  (n = maxfragsperpacket.)
	 */
	next = 0;
	for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) {
		if (ntohs(GETIP(q)->ip_off) != next) {
			if (fp->ipq_nfrags > V_maxfragsperpacket)
				ipq_drop(&V_ipq[hash], fp);
			goto done;
		}
		next += ntohs(GETIP(q)->ip_len);
	}
	/* Make sure the last fragment didn't have the IP_MF flag. */
	if (p->m_flags & M_IP_FRAG) {
		if (fp->ipq_nfrags > V_maxfragsperpacket)
			ipq_drop(&V_ipq[hash], fp);
		goto done;
	}

	/*
	 * Reassembly is complete.  Make sure the packet is a sane size.
	 */
	q = fp->ipq_frags;
	ip = GETIP(q);
	if (next + (ip->ip_hl << 2) > IP_MAXPACKET) {
		IPSTAT_INC(ips_toolong);
		ipq_drop(&V_ipq[hash], fp);
		goto done;
	}

	/*
	 * Concatenate fragments.
	 */
	m = q;
	t = m->m_next;
	m->m_next = NULL;
	m_cat(m, t);
	nq = q->m_nextpkt;
	q->m_nextpkt = NULL;
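	/*
	 * Append the remaining fragments, intersecting the checksum
	 * flags and accumulating the partial checksums.
	 */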
	for (q = nq; q != NULL; q = nq) {
		nq = q->m_nextpkt;
		q->m_nextpkt = NULL;
		m->m_pkthdr.csum_flags &= q->m_pkthdr.csum_flags;
		m->m_pkthdr.csum_data += q->m_pkthdr.csum_data;
		m_demote_pkthdr(q);
		m_cat(m, q);
	}
	/*
	 * In order to do checksumming faster, we do the 'end-around carry'
	 * here (and not in the for() loop), though it implies we are not
	 * going to reassemble more than 64k fragments.
	 */
	while (m->m_pkthdr.csum_data & 0xffff0000)
		m->m_pkthdr.csum_data = (m->m_pkthdr.csum_data & 0xffff) +
		    (m->m_pkthdr.csum_data >> 16);
	atomic_subtract_int(&nfrags, fp->ipq_nfrags);
#ifdef MAC
	mac_ipq_reassemble(fp, m);
	mac_ipq_destroy(fp);
#endif

	/*
	 * Create the header for the new IP packet by modifying the header
	 * of the first fragment; dequeue and discard the fragment
	 * reassembly header.  Make the header visible.
	 */
	ip->ip_len = htons((ip->ip_hl << 2) + next);
	ip->ip_src = fp->ipq_src;
	ip->ip_dst = fp->ipq_dst;
	TAILQ_REMOVE(head, fp, ipq_list);
	V_ipq[hash].count--;
	uma_zfree(V_ipq_zone, fp);
	m->m_len += (ip->ip_hl << 2);
	m->m_data -= (ip->ip_hl << 2);
	/* some debugging cruft by sklower, below, will go away soon */
	if (m->m_flags & M_PKTHDR) {	/* XXX this should be done elsewhere */
		m_fixhdr(m);
		/* set valid receive interface pointer */
		m->m_pkthdr.rcvif = srcifp;
	}
	IPSTAT_INC(ips_reassembled);
	ipreass_reschedule(&V_ipq[hash]);
	IPQ_UNLOCK(hash);

#ifdef	RSS
	/*
	 * Query the RSS layer for the flowid / flowtype for the
	 * mbuf payload.
	 *
	 * For now, just assume we have to calculate a new one.
	 * Later on we should check to see if the assigned flowid matches
	 * what RSS wants for the given IP protocol and if so, just keep it.
	 *
	 * We then queue into the relevant netisr so it can be dispatched
	 * to the correct CPU.
	 *
	 * Note - this may return 1, which means the flowid in the mbuf
	 * is correct for the configured RSS hash types and can be used.
	 */
	if (rss_mbuf_software_hash_v4(m, 0, &rss_hash, &rss_type) == 0) {
		m->m_pkthdr.flowid = rss_hash;
		M_HASHTYPE_SET(m, rss_type);
	}

	/*
	 * Queue/dispatch for reprocessing.
	 *
	 * Note: this is much slower than just handling the frame in the
	 * current receive context.  It's likely worth investigating
	 * why this is.
	 */
	netisr_dispatch(NETISR_IP_DIRECT, m);
	return (NULL);
#endif

	/* Handle in-line */
	return (m);

dropfrag:
	IPSTAT_INC(ips_fragdropped);
	if (fp != NULL) {
		fp->ipq_nfrags--;
		atomic_subtract_int(&nfrags, 1);
	}
	m_freem(m);
done:
	IPQ_UNLOCK(hash);
	return (NULL);

#undef GETIP
}

/*
 * Timer expired on a bucket.
 * There should be at least one ipq to be timed out.
 */
static void
ipreass_callout(void *arg)
{
	struct ipqbucket *bucket = arg;
	struct ipq *fp;

	IPQ_BUCKET_LOCK_ASSERT(bucket);
	MPASS(atomic_load_int(&nfrags) > 0);

	CURVNET_SET(bucket->vnet);
	fp = TAILQ_LAST(&bucket->head, ipqhead);
	KASSERT(fp != NULL && fp->ipq_expire <= time_uptime,
	    ("%s: stray callout on bucket %p, %ju < %ju", __func__, bucket,
	    fp ? (uintmax_t)fp->ipq_expire : 0, (uintmax_t)time_uptime));

	while (fp != NULL && fp->ipq_expire <= time_uptime) {
		ipq_timeout(bucket, fp);
		fp = TAILQ_LAST(&bucket->head, ipqhead);
	}
	ipreass_reschedule(bucket);
	CURVNET_RESTORE();
}

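/*
 * Re-arm the bucket's expiry timer for the oldest remaining queue, or
 * stop it if the bucket is empty.
 */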
static void
ipreass_reschedule(struct ipqbucket *bucket)
{
	struct ipq *fp;

	IPQ_BUCKET_LOCK_ASSERT(bucket);

	if ((fp = TAILQ_LAST(&bucket->head, ipqhead)) != NULL) {
		time_t t;

		/* Protect against time_uptime tick. */
		t = fp->ipq_expire - time_uptime;
		t = (t > 0) ? t : 1;
		callout_reset_sbt(&bucket->timer, SBT_1S * t, SBT_1S,
		    ipreass_callout, bucket, 0);
	} else
		callout_stop(&bucket->timer);
}

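/*
 * Unconditionally drop every reassembly queue in the current VNET.
 */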
static void
ipreass_drain_vnet(void)
{
	u_int dropped = 0;

	for (int i = 0; i < V_ipq_hashsize; i++) {
		bool resched;

		IPQ_LOCK(i);
		resched = !TAILQ_EMPTY(&V_ipq[i].head);
		while (!TAILQ_EMPTY(&V_ipq[i].head)) {
			struct ipq *fp = TAILQ_FIRST(&V_ipq[i].head);

			dropped += fp->ipq_nfrags;
			ipq_free(&V_ipq[i], fp);
		}
		if (resched)
			ipreass_reschedule(&V_ipq[i]);
		KASSERT(V_ipq[i].count == 0,
		    ("%s: V_ipq[%d] count %d (V_ipq=%p)", __func__, i,
		    V_ipq[i].count, V_ipq));
		IPQ_UNLOCK(i);
	}
	IPSTAT_ADD(ips_fragdropped, dropped);
}

/*
 * Drain off all datagram fragments.
 */
static void
ipreass_drain(void)
{
	VNET_ITERATOR_DECL(vnet_iter);

	VNET_LIST_RLOCK();
	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		ipreass_drain_vnet();
		CURVNET_RESTORE();
	}
	VNET_LIST_RUNLOCK();
}
/*
 * Initialize IP reassembly structures.
 */
MALLOC_DEFINE(M_IPREASS_HASH, "IP reass", "IP packet reassembly hash headers");
void
ipreass_vnet_init(void)
{
	int max;

	V_ipq_hashsize = IPREASS_NHASH;
	TUNABLE_INT_FETCH("net.inet.ip.reass_hashsize", &V_ipq_hashsize);
	V_ipq = malloc(sizeof(struct ipqbucket) * V_ipq_hashsize,
	    M_IPREASS_HASH, M_WAITOK);

	for (int i = 0; i < V_ipq_hashsize; i++) {
		TAILQ_INIT(&V_ipq[i].head);
		mtx_init(&V_ipq[i].lock, "IP reassembly", NULL,
		    MTX_DEF | MTX_DUPOK | MTX_NEW);
		callout_init_mtx(&V_ipq[i].timer, &V_ipq[i].lock, 0);
		V_ipq[i].count = 0;
#ifdef VIMAGE
		V_ipq[i].vnet = curvnet;
#endif
	}
	V_ipq_hashseed = arc4random();
	V_maxfragsperpacket = 16;
	V_ipq_zone = uma_zcreate("ipq", sizeof(struct ipq), NULL, NULL, NULL,
	    NULL, UMA_ALIGN_PTR, 0);
	max = IP_MAXFRAGPACKETS;
	max = uma_zone_set_max(V_ipq_zone, max);
	V_ipreass_maxbucketsize = imax(max / (V_ipq_hashsize / 2), 1);
}

void
ipreass_init(void)
{

	maxfrags = IP_MAXFRAGS;
	EVENTHANDLER_REGISTER(nmbclusters_change, ipreass_zone_change,
	    NULL, EVENTHANDLER_PRI_ANY);
	EVENTHANDLER_REGISTER(vm_lowmem, ipreass_drain, NULL,
	    LOWMEM_PRI_DEFAULT);
	EVENTHANDLER_REGISTER(mbuf_lowmem, ipreass_drain, NULL,
	    LOWMEM_PRI_DEFAULT);
}

/*
 * Clear the receive interface pointer from all datagram fragments that
 * were received on the departing network interface, since the pointer
 * is about to become invalid.
 */
static void
ipreass_cleanup(void *arg __unused, struct ifnet *ifp)
{
	struct ipq *fp, *temp;
	struct mbuf *m;
	int i;

	KASSERT(ifp != NULL, ("%s: ifp is NULL", __func__));

	CURVNET_SET_QUIET(ifp->if_vnet);

	/*
	 * Skip processing if IPv4 reassembly has not been initialized
	 * for this VNET or has already been torn down by
	 * ipreass_destroy().
	 */
	if (V_ipq_zone == NULL) {
		CURVNET_RESTORE();
		return;
	}

	for (i = 0; i < V_ipq_hashsize; i++) {
		IPQ_LOCK(i);
		/* Scan fragment list. */
		TAILQ_FOREACH_SAFE(fp, &V_ipq[i].head, ipq_list, temp) {
			for (m = fp->ipq_frags; m != NULL; m = m->m_nextpkt) {
				/* clear no longer valid rcvif pointer */
				if (m->m_pkthdr.rcvif == ifp)
					m->m_pkthdr.rcvif = NULL;
			}
		}
		IPQ_UNLOCK(i);
	}
	CURVNET_RESTORE();
}
EVENTHANDLER_DEFINE(ifnet_departure_event, ipreass_cleanup, NULL, 0);

#ifdef VIMAGE
/*
 * Destroy IP reassembly structures.
 */
void
ipreass_destroy(void)
{

	ipreass_drain_vnet();
	uma_zdestroy(V_ipq_zone);
	V_ipq_zone = NULL;
	for (int i = 0; i < V_ipq_hashsize; i++)
		mtx_destroy(&V_ipq[i].lock);
	free(V_ipq, M_IPREASS_HASH);
}
#endif

/*
 * After the reassembly limits have been reduced, drop queue entries
 * until every bucket and the UMA zone as a whole are back under their
 * respective limits.
 */
static void
ipreass_drain_tomax(void)
{
	struct ipq *fp;
	int target;

	/*
	 * Make sure each bucket is under the new limit. If
	 * necessary, drop enough of the oldest elements from
	 * each bucket to get under the new limit.
	 */
	for (int i = 0; i < V_ipq_hashsize; i++) {
		IPQ_LOCK(i);
		while (V_ipq[i].count > V_ipreass_maxbucketsize &&
		    (fp = TAILQ_LAST(&V_ipq[i].head, ipqhead)) != NULL)
			ipq_timeout(&V_ipq[i], fp);
		ipreass_reschedule(&V_ipq[i]);
		IPQ_UNLOCK(i);
	}

	/*
	 * If we are over the maximum number of reassembly queue entries,
	 * drain off enough to get down to the new limit.  On every pass
	 * we drop the oldest element from each bucket.
	 */
	target = uma_zone_get_max(V_ipq_zone);
	while (uma_zone_get_cur(V_ipq_zone) > target) {
		for (int i = 0; i < V_ipq_hashsize; i++) {
			IPQ_LOCK(i);
			fp = TAILQ_LAST(&V_ipq[i].head, ipqhead);
			if (fp != NULL) {
				ipq_timeout(&V_ipq[i], fp);
				ipreass_reschedule(&V_ipq[i]);
			}
			IPQ_UNLOCK(i);
		}
	}
}

static void
ipreass_zone_change(void *tag)
{
	VNET_ITERATOR_DECL(vnet_iter);
	int max;

	maxfrags = IP_MAXFRAGS;
	max = IP_MAXFRAGPACKETS;
	VNET_LIST_RLOCK_NOSLEEP();
	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		max = uma_zone_set_max(V_ipq_zone, max);
		V_ipreass_maxbucketsize = imax(max / (V_ipq_hashsize / 2), 1);
		ipreass_drain_tomax();
		CURVNET_RESTORE();
	}
	VNET_LIST_RUNLOCK_NOSLEEP();
}

/*
 * Change the limit on the UMA zone, or disable fragment reassembly
 * entirely.  Since 0 and -1 are special values here, we need our own
 * handler instead of sysctl_handle_uma_zone_max().
 */
static int
sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS)
{
	int error, max;

	if (V_noreass == 0) {
		max = uma_zone_get_max(V_ipq_zone);
		if (max == 0)
			max = -1;
	} else
		max = 0;
	error = sysctl_handle_int(oidp, &max, 0, req);
	if (error || !req->newptr)
		return (error);
	if (max > 0) {
		/*
		 * XXXRW: Might be a good idea to sanity check the argument
		 * and place an extreme upper bound.
		 */
		max = uma_zone_set_max(V_ipq_zone, max);
		V_ipreass_maxbucketsize = imax(max / (V_ipq_hashsize / 2), 1);
		ipreass_drain_tomax();
		V_noreass = 0;
	} else if (max == 0) {
		V_noreass = 1;
		ipreass_drain();
	} else if (max == -1) {
		V_noreass = 0;
		uma_zone_set_max(V_ipq_zone, 0);
		V_ipreass_maxbucketsize = INT_MAX;
	} else
		return (EINVAL);
	return (0);
}

/*
 * Look for an old fragment queue header that can be reused.  Try the
 * currently locked hash bucket first, then the remaining buckets.
 */
static struct ipq *
ipq_reuse(int start)
{
	struct ipq *fp;
	int bucket, i;

	IPQ_LOCK_ASSERT(start);

	for (i = 0; i < V_ipq_hashsize; i++) {
		bucket = (start + i) % V_ipq_hashsize;
		if (bucket != start && IPQ_TRYLOCK(bucket) == 0)
			continue;
		fp = TAILQ_LAST(&V_ipq[bucket].head, ipqhead);
		if (fp) {
			struct mbuf *m;

			IPSTAT_ADD(ips_fragtimeout, fp->ipq_nfrags);
			atomic_subtract_int(&nfrags, fp->ipq_nfrags);
			while (fp->ipq_frags) {
				m = fp->ipq_frags;
				fp->ipq_frags = m->m_nextpkt;
				m_freem(m);
			}
			TAILQ_REMOVE(&V_ipq[bucket].head, fp, ipq_list);
			V_ipq[bucket].count--;
			ipreass_reschedule(&V_ipq[bucket]);
			if (bucket != start)
				IPQ_UNLOCK(bucket);
			break;
		}
		if (bucket != start)
			IPQ_UNLOCK(bucket);
	}
	IPQ_LOCK_ASSERT(start);
	return (fp);
}

/*
 * Free a fragment reassembly header and all associated datagrams.
 */
static void
ipq_free(struct ipqbucket *bucket, struct ipq *fp)
{
	struct mbuf *q;

	atomic_subtract_int(&nfrags, fp->ipq_nfrags);
	while (fp->ipq_frags) {
		q = fp->ipq_frags;
		fp->ipq_frags = q->m_nextpkt;
		m_freem(q);
	}
	TAILQ_REMOVE(&bucket->head, fp, ipq_list);
	bucket->count--;
	uma_zfree(V_ipq_zone, fp);
}

/*
 * Get or set the maximum number of reassembly queues per bucket.
 */
static int
sysctl_maxfragbucketsize(SYSCTL_HANDLER_ARGS)
{
	int error, max;

	max = V_ipreass_maxbucketsize;
	error = sysctl_handle_int(oidp, &max, 0, req);
	if (error || !req->newptr)
		return (error);
	if (max <= 0)
		return (EINVAL);
	V_ipreass_maxbucketsize = max;
	ipreass_drain_tomax();
	return (0);
}

/*
 * Get or set the IP fragment time to live.
 */
static int
sysctl_fragttl(SYSCTL_HANDLER_ARGS)
{
	u_int ttl;
	int error;

	ttl = V_ipfragttl;
	error = sysctl_handle_int(oidp, &ttl, 0, req);
	if (error || !req->newptr)
		return (error);

	if (ttl < 1 || ttl > MAXTTL)
		return (EINVAL);

	atomic_store_int(&V_ipfragttl, ttl);
	return (0);
}