1/*-
2 * Copyright (c) 2007, Myricom Inc.
3 * Copyright (c) 2008, Intel Corporation.
4 * Copyright (c) 2012 The FreeBSD Foundation
5 * All rights reserved.
6 *
7 * Portions of this software were developed by Bjoern Zeeb
8 * under sponsorship from the FreeBSD Foundation.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 */
31
32#include <sys/cdefs.h>
33__FBSDID("$FreeBSD$");
34
35#include "opt_inet.h"
36#include "opt_inet6.h"
37
38#include <sys/param.h>
39#include <sys/systm.h>
40#include <sys/mbuf.h>
41#include <sys/kernel.h>
42#include <sys/socket.h>
43#include <sys/sysctl.h>
44
45#include <net/if.h>
46#include <net/if_var.h>
47#include <net/ethernet.h>
48#include <net/vnet.h>
49
50#include <netinet/in_systm.h>
51#include <netinet/in.h>
52#include <netinet/ip6.h>
53#include <netinet/ip.h>
54#include <netinet/ip_var.h>
55#include <netinet/tcp.h>
56#include <netinet/tcp_lro.h>
57#include <netinet/tcp_var.h>
58
59#include <netinet6/ip6_var.h>
60
61#include <machine/in_cksum.h>
62
/*
 * Compile-time default for the number of LRO entries; may be overridden by
 * defining LRO_ENTRIES before this point.  It seeds tcp_lro_entries below.
 */
#ifndef LRO_ENTRIES
#define	LRO_ENTRIES	8	/* # of LRO entries per RX queue. */
#endif

/*
 * With TCP_LRO_UPDATE_CSUM defined, the IPv4 header checksum and the TCP
 * checksum of an aggregated packet are recomputed when it is flushed.
 * Without it, tcp_lro_flush() stores TCP_LRO_INVALID_CSUM in those fields
 * instead.
 */
#define	TCP_LRO_UPDATE_CSUM	1
#ifndef	TCP_LRO_UPDATE_CSUM
#define	TCP_LRO_INVALID_CSUM	0x0000
#endif
71
/* Parent node for the LRO sysctls, created below the _net_inet_tcp node. */
SYSCTL_NODE(_net_inet_tcp, OID_AUTO, lro,  CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "TCP LRO");

/*
 * Number of entries tcp_lro_init() tries to allocate for each lro_ctrl.
 * Defaults to LRO_ENTRIES and is exported as the "entries" leaf of the node
 * above with CTLFLAG_RDTUN (see sysctl(9) for the exact flag semantics).
 */
static unsigned	tcp_lro_entries = LRO_ENTRIES;
SYSCTL_UINT(_net_inet_tcp_lro, OID_AUTO, entries,
    CTLFLAG_RDTUN | CTLFLAG_MPSAFE, &tcp_lro_entries, 0,
    "default number of LRO entries");
79
80int
81tcp_lro_init(struct lro_ctrl *lc)
82{
83	struct lro_entry *le;
84	int error, i;
85
86	lc->lro_bad_csum = 0;
87	lc->lro_queued = 0;
88	lc->lro_flushed = 0;
89	lc->lro_cnt = 0;
90	SLIST_INIT(&lc->lro_free);
91	SLIST_INIT(&lc->lro_active);
92
93	error = 0;
94	for (i = 0; i < tcp_lro_entries; i++) {
95		le = (struct lro_entry *)malloc(sizeof(*le), M_DEVBUF,
96		    M_NOWAIT | M_ZERO);
97                if (le == NULL) {
98			if (i == 0)
99				error = ENOMEM;
100                        break;
101                }
102		lc->lro_cnt = i + 1;
103		SLIST_INSERT_HEAD(&lc->lro_free, le, next);
104        }
105
106	return (error);
107}
108
109void
110tcp_lro_free(struct lro_ctrl *lc)
111{
112	struct lro_entry *le;
113
114	while (!SLIST_EMPTY(&lc->lro_free)) {
115		le = SLIST_FIRST(&lc->lro_free);
116		SLIST_REMOVE_HEAD(&lc->lro_free, next);
117		free(le, M_DEVBUF);
118	}
119}
120
121#ifdef TCP_LRO_UPDATE_CSUM
122static uint16_t
123tcp_lro_csum_th(struct tcphdr *th)
124{
125	uint32_t ch;
126	uint16_t *p, l;
127
128	ch = th->th_sum = 0x0000;
129	l = th->th_off;
130	p = (uint16_t *)th;
131	while (l > 0) {
132		ch += *p;
133		p++;
134		ch += *p;
135		p++;
136		l--;
137	}
138	while (ch > 0xffff)
139		ch = (ch >> 16) + (ch & 0xffff);
140
141	return (ch & 0xffff);
142}
143
144static uint16_t
145tcp_lro_rx_csum_fixup(struct lro_entry *le, void *l3hdr, struct tcphdr *th,
146    uint16_t tcp_data_len, uint16_t csum)
147{
148	uint32_t c;
149	uint16_t cs;
150
151	c = csum;
152
153	/* Remove length from checksum. */
154	switch (le->eh_type) {
155#ifdef INET6
156	case ETHERTYPE_IPV6:
157	{
158		struct ip6_hdr *ip6;
159
160		ip6 = (struct ip6_hdr *)l3hdr;
161		if (le->append_cnt == 0)
162			cs = ip6->ip6_plen;
163		else {
164			uint32_t cx;
165
166			cx = ntohs(ip6->ip6_plen);
167			cs = in6_cksum_pseudo(ip6, cx, ip6->ip6_nxt, 0);
168		}
169		break;
170	}
171#endif
172#ifdef INET
173	case ETHERTYPE_IP:
174	{
175		struct ip *ip4;
176
177		ip4 = (struct ip *)l3hdr;
178		if (le->append_cnt == 0)
179			cs = ip4->ip_len;
180		else {
181			cs = in_addword(ntohs(ip4->ip_len) - sizeof(*ip4),
182			    IPPROTO_TCP);
183			cs = in_pseudo(ip4->ip_src.s_addr, ip4->ip_dst.s_addr,
184			    htons(cs));
185		}
186		break;
187	}
188#endif
189	default:
190		cs = 0;		/* Keep compiler happy. */
191	}
192
193	cs = ~cs;
194	c += cs;
195
196	/* Remove TCP header csum. */
197	cs = ~tcp_lro_csum_th(th);
198	c += cs;
199	while (c > 0xffff)
200		c = (c >> 16) + (c & 0xffff);
201
202	return (c & 0xffff);
203}
204#endif
205
206void
207tcp_lro_flush_inactive(struct lro_ctrl *lc, const struct timeval *timeout)
208{
209	struct lro_entry *le, *le_tmp;
210	struct timeval tv;
211
212	if (SLIST_EMPTY(&lc->lro_active))
213		return;
214
215	getmicrotime(&tv);
216	timevalsub(&tv, timeout);
217	SLIST_FOREACH_SAFE(le, &lc->lro_active, next, le_tmp) {
218		if (timevalcmp(&tv, &le->mtime, >=)) {
219			SLIST_REMOVE(&lc->lro_active, le, lro_entry, next);
220			tcp_lro_flush(lc, le);
221		}
222	}
223}
224
/*
 * Hand the packet held by an LRO entry to the interface input routine and
 * recycle the entry.
 *
 * If further segments were appended (append_cnt > 0) the headers of the
 * first segment are rewritten in place so that they describe the whole
 * aggregate: IP length, IPv4 header checksum, latest ACK, window and
 * timestamp values, and the TCP checksum.  A packet that was never appended
 * to is passed on unmodified.
 *
 * This function does not unlink 'le' from lc->lro_active; the callers in
 * this file remove it before calling.  On return the entry has been zeroed
 * and put on lc->lro_free.
 */
void
tcp_lro_flush(struct lro_ctrl *lc, struct lro_entry *le)
{

	if (le->append_cnt > 0) {
		struct tcphdr *th;
		uint16_t p_len;

		/* Aggregate length in network byte order for the IP header. */
		p_len = htons(le->p_len);
		switch (le->eh_type) {
#ifdef INET6
		case ETHERTYPE_IPV6:
		{
			struct ip6_hdr *ip6;

			ip6 = le->le_ip6;
			ip6->ip6_plen = p_len;
			th = (struct tcphdr *)(ip6 + 1);
			le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID |
			    CSUM_PSEUDO_HDR;
			/*
			 * For IPv6 p_len counts the payload only; add the
			 * Ethernet and IPv6 headers to get the frame length.
			 */
			le->p_len += ETHER_HDR_LEN + sizeof(*ip6);
			break;
		}
#endif
#ifdef INET
		case ETHERTYPE_IP:
		{
			struct ip *ip4;
#ifdef TCP_LRO_UPDATE_CSUM
			uint32_t cl;
			uint16_t c;
#endif

			ip4 = le->le_ip4;
#ifdef TCP_LRO_UPDATE_CSUM
			/* Fix IP header checksum for new length. */
			/*
			 * Incremental update: un-complement the old checksum,
			 * subtract the old length (add its complement), add
			 * the new length, fold and complement again.
			 */
			c = ~ip4->ip_sum;
			cl = c;
			c = ~ip4->ip_len;
			cl += c + p_len;
			while (cl > 0xffff)
				cl = (cl >> 16) + (cl & 0xffff);
			c = cl;
			ip4->ip_sum = ~c;
#else
			ip4->ip_sum = TCP_LRO_INVALID_CSUM;
#endif
			ip4->ip_len = p_len;
			th = (struct tcphdr *)(ip4 + 1);
			le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID |
			    CSUM_PSEUDO_HDR | CSUM_IP_CHECKED | CSUM_IP_VALID;
			/*
			 * For IPv4 p_len already includes the IP header, so
			 * only the Ethernet header is added.
			 */
			le->p_len += ETHER_HDR_LEN;
			break;
		}
#endif
		default:
			th = NULL;	/* Keep compiler happy. */
		}
		le->m_head->m_pkthdr.csum_data = 0xffff;
		le->m_head->m_pkthdr.len = le->p_len;

		/* Incorporate the latest ACK into the TCP header. */
		th->th_ack = le->ack_seq;
		th->th_win = le->window;
		/* Incorporate latest timestamp into the TCP header. */
		if (le->timestamp != 0) {
			uint32_t *ts_ptr;

			/*
			 * ts_ptr[0] is the NOP/NOP/TIMESTAMP/length word;
			 * tsval is kept in host order, tsecr as received.
			 */
			ts_ptr = (uint32_t *)(th + 1);
			ts_ptr[1] = htonl(le->tsval);
			ts_ptr[2] = le->tsecr;
		}
#ifdef TCP_LRO_UPDATE_CSUM
		/* Update the TCP header checksum. */
		/*
		 * ulp_csum holds the accumulated sum without length and TCP
		 * header; re-add both with their final values.
		 */
		le->ulp_csum += p_len;
		le->ulp_csum += tcp_lro_csum_th(th);
		while (le->ulp_csum > 0xffff)
			le->ulp_csum = (le->ulp_csum >> 16) +
			    (le->ulp_csum & 0xffff);
		th->th_sum = (le->ulp_csum & 0xffff);
		th->th_sum = ~th->th_sum;
#else
		th->th_sum = TCP_LRO_INVALID_CSUM;
#endif
	}

	/* Pass the packet up and account for every segment it contains. */
	(*lc->ifp->if_input)(lc->ifp, le->m_head);
	lc->lro_queued += le->append_cnt + 1;
	lc->lro_flushed++;
	/* Zero the entry so that it starts clean when reused. */
	bzero(le, sizeof(*le));
	SLIST_INSERT_HEAD(&lc->lro_free, le, next);
}
317
318#ifdef INET6
319static int
320tcp_lro_rx_ipv6(struct lro_ctrl *lc, struct mbuf *m, struct ip6_hdr *ip6,
321    struct tcphdr **th)
322{
323
324	/* XXX-BZ we should check the flow-label. */
325
326	/* XXX-BZ We do not yet support ext. hdrs. */
327	if (ip6->ip6_nxt != IPPROTO_TCP)
328		return (TCP_LRO_NOT_SUPPORTED);
329
330	/* Find the TCP header. */
331	*th = (struct tcphdr *)(ip6 + 1);
332
333	return (0);
334}
335#endif
336
337#ifdef INET
338static int
339tcp_lro_rx_ipv4(struct lro_ctrl *lc, struct mbuf *m, struct ip *ip4,
340    struct tcphdr **th)
341{
342	int csum_flags;
343	uint16_t csum;
344
345	if (ip4->ip_p != IPPROTO_TCP)
346		return (TCP_LRO_NOT_SUPPORTED);
347
348	/* Ensure there are no options. */
349	if ((ip4->ip_hl << 2) != sizeof (*ip4))
350		return (TCP_LRO_CANNOT);
351
352	/* .. and the packet is not fragmented. */
353	if (ip4->ip_off & htons(IP_MF|IP_OFFMASK))
354		return (TCP_LRO_CANNOT);
355
356	/* Legacy IP has a header checksum that needs to be correct. */
357	csum_flags = m->m_pkthdr.csum_flags;
358	if (csum_flags & CSUM_IP_CHECKED) {
359		if (__predict_false((csum_flags & CSUM_IP_VALID) == 0)) {
360			lc->lro_bad_csum++;
361			return (TCP_LRO_CANNOT);
362		}
363	} else {
364		csum = in_cksum_hdr(ip4);
365		if (__predict_false((csum) != 0)) {
366			lc->lro_bad_csum++;
367			return (TCP_LRO_CANNOT);
368		}
369	}
370
371	/* Find the TCP header (we assured there are no IP options). */
372	*th = (struct tcphdr *)(ip4 + 1);
373
374	return (0);
375}
376#endif
377
378int
379tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
380{
381	struct lro_entry *le;
382	struct ether_header *eh;
383#ifdef INET6
384	struct ip6_hdr *ip6 = NULL;	/* Keep compiler happy. */
385#endif
386#ifdef INET
387	struct ip *ip4 = NULL;		/* Keep compiler happy. */
388#endif
389	struct tcphdr *th;
390	void *l3hdr = NULL;		/* Keep compiler happy. */
391	uint32_t *ts_ptr;
392	tcp_seq seq;
393	int error, ip_len, l;
394	uint16_t eh_type, tcp_data_len;
395	int force_flush = 0;
396
397	/* We expect a contiguous header [eh, ip, tcp]. */
398
399	eh = mtod(m, struct ether_header *);
400	eh_type = ntohs(eh->ether_type);
401	switch (eh_type) {
402#ifdef INET6
403	case ETHERTYPE_IPV6:
404	{
405		CURVNET_SET(lc->ifp->if_vnet);
406		if (V_ip6_forwarding != 0) {
407			/* XXX-BZ stats but changing lro_ctrl is a problem. */
408			CURVNET_RESTORE();
409			return (TCP_LRO_CANNOT);
410		}
411		CURVNET_RESTORE();
412		l3hdr = ip6 = (struct ip6_hdr *)(eh + 1);
413		error = tcp_lro_rx_ipv6(lc, m, ip6, &th);
414		if (error != 0)
415			return (error);
416		tcp_data_len = ntohs(ip6->ip6_plen);
417		ip_len = sizeof(*ip6) + tcp_data_len;
418		break;
419	}
420#endif
421#ifdef INET
422	case ETHERTYPE_IP:
423	{
424		CURVNET_SET(lc->ifp->if_vnet);
425		if (V_ipforwarding != 0) {
426			/* XXX-BZ stats but changing lro_ctrl is a problem. */
427			CURVNET_RESTORE();
428			return (TCP_LRO_CANNOT);
429		}
430		CURVNET_RESTORE();
431		l3hdr = ip4 = (struct ip *)(eh + 1);
432		error = tcp_lro_rx_ipv4(lc, m, ip4, &th);
433		if (error != 0)
434			return (error);
435		ip_len = ntohs(ip4->ip_len);
436		tcp_data_len = ip_len - sizeof(*ip4);
437		break;
438	}
439#endif
440	/* XXX-BZ what happens in case of VLAN(s)? */
441	default:
442		return (TCP_LRO_NOT_SUPPORTED);
443	}
444
445	/*
446	 * If the frame is padded beyond the end of the IP packet, then we must
447	 * trim the extra bytes off.
448	 */
449	l = m->m_pkthdr.len - (ETHER_HDR_LEN + ip_len);
450	if (l != 0) {
451		if (l < 0)
452			/* Truncated packet. */
453			return (TCP_LRO_CANNOT);
454
455		m_adj(m, -l);
456	}
457
458	/*
459	 * Check TCP header constraints.
460	 */
461	/* Ensure no bits set besides ACK or PSH. */
462	if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0) {
463		if (th->th_flags & TH_SYN)
464			return (TCP_LRO_CANNOT);
465		/*
466		 * Make sure that previously seen segements/ACKs are delivered
467		 * before this segement, e.g. FIN.
468		 */
469		force_flush = 1;
470	}
471
472	/* XXX-BZ We lose a ACK|PUSH flag concatenating multiple segments. */
473	/* XXX-BZ Ideally we'd flush on PUSH? */
474
475	/*
476	 * Check for timestamps.
477	 * Since the only option we handle are timestamps, we only have to
478	 * handle the simple case of aligned timestamps.
479	 */
480	l = (th->th_off << 2);
481	tcp_data_len -= l;
482	l -= sizeof(*th);
483	ts_ptr = (uint32_t *)(th + 1);
484	if (l != 0 && (__predict_false(l != TCPOLEN_TSTAMP_APPA) ||
485	    (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|
486	    TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))) {
487		/*
488		 * Make sure that previously seen segements/ACKs are delivered
489		 * before this segement.
490		 */
491		force_flush = 1;
492	}
493
494	/* If the driver did not pass in the checksum, set it now. */
495	if (csum == 0x0000)
496		csum = th->th_sum;
497
498	seq = ntohl(th->th_seq);
499
500	/* Try to find a matching previous segment. */
501	SLIST_FOREACH(le, &lc->lro_active, next) {
502		if (le->eh_type != eh_type)
503			continue;
504		if (le->source_port != th->th_sport ||
505		    le->dest_port != th->th_dport)
506			continue;
507		switch (eh_type) {
508#ifdef INET6
509		case ETHERTYPE_IPV6:
510			if (bcmp(&le->source_ip6, &ip6->ip6_src,
511			    sizeof(struct in6_addr)) != 0 ||
512			    bcmp(&le->dest_ip6, &ip6->ip6_dst,
513			    sizeof(struct in6_addr)) != 0)
514				continue;
515			break;
516#endif
517#ifdef INET
518		case ETHERTYPE_IP:
519			if (le->source_ip4 != ip4->ip_src.s_addr ||
520			    le->dest_ip4 != ip4->ip_dst.s_addr)
521				continue;
522			break;
523#endif
524		}
525
526		if (force_flush) {
527			/* Timestamps mismatch; this is a FIN, etc */
528			SLIST_REMOVE(&lc->lro_active, le, lro_entry, next);
529			tcp_lro_flush(lc, le);
530			return (TCP_LRO_CANNOT);
531		}
532
533		/* Flush now if appending will result in overflow. */
534		if (le->p_len > (65535 - tcp_data_len)) {
535			SLIST_REMOVE(&lc->lro_active, le, lro_entry, next);
536			tcp_lro_flush(lc, le);
537			break;
538		}
539
540		/* Try to append the new segment. */
541		if (__predict_false(seq != le->next_seq ||
542		    (tcp_data_len == 0 && le->ack_seq == th->th_ack))) {
543			/* Out of order packet or duplicate ACK. */
544			SLIST_REMOVE(&lc->lro_active, le, lro_entry, next);
545			tcp_lro_flush(lc, le);
546			return (TCP_LRO_CANNOT);
547		}
548
549		if (l != 0) {
550			uint32_t tsval = ntohl(*(ts_ptr + 1));
551			/* Make sure timestamp values are increasing. */
552			/* XXX-BZ flip and use TSTMP_GEQ macro for this? */
553			if (__predict_false(le->tsval > tsval ||
554			    *(ts_ptr + 2) == 0))
555				return (TCP_LRO_CANNOT);
556			le->tsval = tsval;
557			le->tsecr = *(ts_ptr + 2);
558		}
559
560		le->next_seq += tcp_data_len;
561		le->ack_seq = th->th_ack;
562		le->window = th->th_win;
563		le->append_cnt++;
564
565#ifdef TCP_LRO_UPDATE_CSUM
566		le->ulp_csum += tcp_lro_rx_csum_fixup(le, l3hdr, th,
567		    tcp_data_len, ~csum);
568#endif
569
570		if (tcp_data_len == 0) {
571			m_freem(m);
572			return (0);
573		}
574
575		le->p_len += tcp_data_len;
576
577		/*
578		 * Adjust the mbuf so that m_data points to the first byte of
579		 * the ULP payload.  Adjust the mbuf to avoid complications and
580		 * append new segment to existing mbuf chain.
581		 */
582		m_adj(m, m->m_pkthdr.len - tcp_data_len);
583		m->m_flags &= ~M_PKTHDR;
584
585		le->m_tail->m_next = m;
586		le->m_tail = m_last(m);
587
588		/*
589		 * If a possible next full length packet would cause an
590		 * overflow, pro-actively flush now.
591		 */
592		if (le->p_len > (65535 - lc->ifp->if_mtu)) {
593			SLIST_REMOVE(&lc->lro_active, le, lro_entry, next);
594			tcp_lro_flush(lc, le);
595		} else
596			getmicrotime(&le->mtime);
597
598		return (0);
599	}
600
601	if (force_flush) {
602		/*
603		 * Nothing to flush, but this segment can not be further
604		 * aggregated/delayed.
605		 */
606		return (TCP_LRO_CANNOT);
607	}
608
609	/* Try to find an empty slot. */
610	if (SLIST_EMPTY(&lc->lro_free))
611		return (TCP_LRO_NO_ENTRIES);
612
613	/* Start a new segment chain. */
614	le = SLIST_FIRST(&lc->lro_free);
615	SLIST_REMOVE_HEAD(&lc->lro_free, next);
616	SLIST_INSERT_HEAD(&lc->lro_active, le, next);
617	getmicrotime(&le->mtime);
618
619	/* Start filling in details. */
620	switch (eh_type) {
621#ifdef INET6
622	case ETHERTYPE_IPV6:
623		le->le_ip6 = ip6;
624		le->source_ip6 = ip6->ip6_src;
625		le->dest_ip6 = ip6->ip6_dst;
626		le->eh_type = eh_type;
627		le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN - sizeof(*ip6);
628		break;
629#endif
630#ifdef INET
631	case ETHERTYPE_IP:
632		le->le_ip4 = ip4;
633		le->source_ip4 = ip4->ip_src.s_addr;
634		le->dest_ip4 = ip4->ip_dst.s_addr;
635		le->eh_type = eh_type;
636		le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN;
637		break;
638#endif
639	}
640	le->source_port = th->th_sport;
641	le->dest_port = th->th_dport;
642
643	le->next_seq = seq + tcp_data_len;
644	le->ack_seq = th->th_ack;
645	le->window = th->th_win;
646	if (l != 0) {
647		le->timestamp = 1;
648		le->tsval = ntohl(*(ts_ptr + 1));
649		le->tsecr = *(ts_ptr + 2);
650	}
651
652#ifdef TCP_LRO_UPDATE_CSUM
653	/*
654	 * Do not touch the csum of the first packet.  However save the
655	 * "adjusted" checksum of just the source and destination addresses,
656	 * the next header and the TCP payload.  The length and TCP header
657	 * parts may change, so we remove those from the saved checksum and
658	 * re-add with final values on tcp_lro_flush() if needed.
659	 */
660	KASSERT(le->ulp_csum == 0, ("%s: le=%p le->ulp_csum=0x%04x\n",
661	    __func__, le, le->ulp_csum));
662
663	le->ulp_csum = tcp_lro_rx_csum_fixup(le, l3hdr, th, tcp_data_len,
664	    ~csum);
665	th->th_sum = csum;	/* Restore checksum on first packet. */
666#endif
667
668	le->m_head = m;
669	le->m_tail = m_last(m);
670
671	return (0);
672}
673
674/* end */
675