1/*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
5 *      The Regents of the University of California.  All rights reserved.
6 * Copyright (c) 2007-2008,2010
7 *      Swinburne University of Technology, Melbourne, Australia.
8 * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
9 * Copyright (c) 2010 The FreeBSD Foundation
10 * Copyright (c) 2010-2011 Juniper Networks, Inc.
11 * Copyright (c) 2019 Richard Scheffenegger <srichard@netapp.com>
12 * All rights reserved.
13 *
14 * Portions of this software were developed at the Centre for Advanced Internet
15 * Architectures, Swinburne University of Technology, by Lawrence Stewart,
16 * James Healy and David Hayes, made possible in part by a grant from the Cisco
17 * University Research Program Fund at Community Foundation Silicon Valley.
18 *
19 * Portions of this software were developed at the Centre for Advanced
20 * Internet Architectures, Swinburne University of Technology, Melbourne,
21 * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
22 *
23 * Portions of this software were developed by Robert N. M. Watson under
24 * contract to Juniper Networks, Inc.
25 *
26 * Redistribution and use in source and binary forms, with or without
27 * modification, are permitted provided that the following conditions
28 * are met:
29 * 1. Redistributions of source code must retain the above copyright
30 *    notice, this list of conditions and the following disclaimer.
31 * 2. Redistributions in binary form must reproduce the above copyright
32 *    notice, this list of conditions and the following disclaimer in the
33 *    documentation and/or other materials provided with the distribution.
34 * 3. Neither the name of the University nor the names of its contributors
35 *    may be used to endorse or promote products derived from this software
36 *    without specific prior written permission.
37 *
38 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
39 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
40 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
41 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
42 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
43 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
44 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
45 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
46 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
47 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
48 * SUCH DAMAGE.
49 */
50
/*
 * Utility functions for handling Explicit Congestion Notification (ECN)
 * in TCP, implementing the essential parts of the Accurate ECN extension:
 * https://tools.ietf.org/html/draft-ietf-tcpm-accurate-ecn-09
 */
56
57#include <sys/cdefs.h>
58#include "opt_inet.h"
59#include "opt_inet6.h"
60
61#include <sys/param.h>
62#include <sys/systm.h>
63#include <sys/kernel.h>
64#include <sys/sysctl.h>
65#include <sys/malloc.h>
66#include <sys/mbuf.h>
67#include <sys/socket.h>
68#include <sys/socketvar.h>
69
70#include <machine/cpu.h>
71
72#include <vm/uma.h>
73
74#include <net/if.h>
75#include <net/if_var.h>
76#include <net/route.h>
77#include <net/vnet.h>
78
79#include <netinet/in.h>
80#include <netinet/in_systm.h>
81#include <netinet/ip.h>
82#include <netinet/in_var.h>
83#include <netinet/in_pcb.h>
84#include <netinet/ip_var.h>
85#include <netinet/ip6.h>
86#include <netinet/icmp6.h>
87#include <netinet6/nd6.h>
88#include <netinet6/ip6_var.h>
89#include <netinet6/in6_pcb.h>
90#include <netinet/tcp.h>
91#include <netinet/tcp_fsm.h>
92#include <netinet/tcp_seq.h>
93#include <netinet/tcp_var.h>
94#include <netinet/tcp_syncache.h>
95#include <netinet/tcp_timer.h>
96#include <netinet/tcpip.h>
97#include <netinet/tcp_ecn.h>
98
99static inline int  tcp_ecn_get_ace(uint16_t);
100static inline void tcp_ecn_set_ace(uint16_t *, uint32_t);
101
102static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, ecn,
103    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
104    "TCP ECN");
105
106VNET_DEFINE(int, tcp_do_ecn) = 2;
107SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, enable,
108    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_ecn), 0,
109    "TCP ECN support");
110
111VNET_DEFINE(int, tcp_ecn_maxretries) = 1;
112SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, maxretries,
113    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_ecn_maxretries), 0,
114    "Max retries before giving up on ECN");
115
116/*
117 * Process incoming SYN,ACK packet
118 */
119void
120tcp_ecn_input_syn_sent(struct tcpcb *tp, uint16_t thflags, int iptos)
121{
122	switch (V_tcp_do_ecn) {
123	case 0:
124		return;
125	case 1:
126		/* FALLTHROUGH */
127	case 2:
128		/* RFC3168 ECN handling */
129		if ((thflags & (TH_CWR | TH_ECE)) == (0 | TH_ECE)) {
130			tp->t_flags2 |= TF2_ECN_PERMIT;
131			tp->t_flags2 &= ~TF2_ACE_PERMIT;
132			TCPSTAT_INC(tcps_ecn_shs);
133		}
134		break;
135	case 3:
136		/* FALLTHROUGH */
137	case 4:
138		/*
139		 * Decoding Accurate ECN according to
140		 * table in section 3.1.1
141		 *
142		 * On the SYN,ACK, process the AccECN
143		 * flags indicating the state the SYN
144		 * was delivered.
145		 * Reactions to Path ECN mangling can
146		 * come here.
147		 */
148		switch (thflags & (TH_AE | TH_CWR | TH_ECE)) {
149		/* RFC3168 SYN */
150		case (0|0|TH_ECE):
151			tp->t_flags2 |= TF2_ECN_PERMIT;
152			tp->t_flags2 &= ~TF2_ACE_PERMIT;
153			TCPSTAT_INC(tcps_ecn_shs);
154			break;
155		/* non-ECT SYN */
156		case (0|TH_CWR|0):
157			tp->t_flags2 |= TF2_ACE_PERMIT;
158			tp->t_flags2 &= ~TF2_ECN_PERMIT;
159			tp->t_scep = 5;
160			TCPSTAT_INC(tcps_ecn_shs);
161			TCPSTAT_INC(tcps_ace_nect);
162			break;
163		/* ECT0 SYN */
164		case (TH_AE|0|0):
165			tp->t_flags2 |= TF2_ACE_PERMIT;
166			tp->t_flags2 &= ~TF2_ECN_PERMIT;
167			tp->t_scep = 5;
168			TCPSTAT_INC(tcps_ecn_shs);
169			TCPSTAT_INC(tcps_ace_ect0);
170			break;
171		/* ECT1 SYN */
172		case (0|TH_CWR|TH_ECE):
173			tp->t_flags2 |= TF2_ACE_PERMIT;
174			tp->t_flags2 &= ~TF2_ECN_PERMIT;
175			tp->t_scep = 5;
176			TCPSTAT_INC(tcps_ecn_shs);
177			TCPSTAT_INC(tcps_ace_ect1);
178			break;
179		/* CE SYN */
180		case (TH_AE|TH_CWR|0):
181			tp->t_flags2 |= TF2_ACE_PERMIT;
182			tp->t_flags2 &= ~TF2_ECN_PERMIT;
183			tp->t_scep = 6;
184			/*
185			 * reduce the IW to 2 MSS (to
186			 * account for delayed acks) if
187			 * the SYN,ACK was CE marked
188			 */
189			tp->snd_cwnd = 2 * tcp_maxseg(tp);
190			TCPSTAT_INC(tcps_ecn_shs);
191			TCPSTAT_INC(tcps_ace_nect);
192			break;
193		default:
194			tp->t_flags2 &= ~(TF2_ECN_PERMIT | TF2_ACE_PERMIT);
195			break;
196		}
197		/*
198		 * Set the AccECN Codepoints on
199		 * the outgoing <ACK> to the ECN
200		 * state of the <SYN,ACK>
201		 * according to table 3 in the
202		 * AccECN draft
203		 */
204		switch (iptos & IPTOS_ECN_MASK) {
205		case (IPTOS_ECN_NOTECT):
206			tp->t_rcep = 0b010;
207			break;
208		case (IPTOS_ECN_ECT0):
209			tp->t_rcep = 0b100;
210			break;
211		case (IPTOS_ECN_ECT1):
212			tp->t_rcep = 0b011;
213			break;
214		case (IPTOS_ECN_CE):
215			tp->t_rcep = 0b110;
216			break;
217		}
218		break;
219	}
220}
221
222/*
223 * Handle parallel SYN for ECN
224 */
225void
226tcp_ecn_input_parallel_syn(struct tcpcb *tp, uint16_t thflags, int iptos)
227{
228	if (thflags & TH_ACK)
229		return;
230	switch (V_tcp_do_ecn) {
231	case 0:
232		return;
233	case 1:
234		/* FALLTHROUGH */
235	case 2:
236		/* RFC3168 ECN handling */
237		if ((thflags & (TH_CWR | TH_ECE)) == (TH_CWR | TH_ECE)) {
238			tp->t_flags2 |= TF2_ECN_PERMIT;
239			tp->t_flags2 &= ~TF2_ACE_PERMIT;
240			tp->t_flags2 |= TF2_ECN_SND_ECE;
241			TCPSTAT_INC(tcps_ecn_shs);
242		}
243		break;
244	case 3:
245		/* FALLTHROUGH */
246	case 4:
247		/* AccECN handling */
248		switch (thflags & (TH_AE | TH_CWR | TH_ECE)) {
249		default:
250		case (0|0|0):
251			tp->t_flags2 &= ~(TF2_ECN_PERMIT | TF2_ACE_PERMIT);
252			break;
253		case (0|TH_CWR|TH_ECE):
254			tp->t_flags2 |= TF2_ECN_PERMIT;
255			tp->t_flags2 &= ~TF2_ACE_PERMIT;
256			tp->t_flags2 |= TF2_ECN_SND_ECE;
257			TCPSTAT_INC(tcps_ecn_shs);
258			break;
259		case (TH_AE|TH_CWR|TH_ECE):
260			tp->t_flags2 |= TF2_ACE_PERMIT;
261			tp->t_flags2 &= ~TF2_ECN_PERMIT;
262			TCPSTAT_INC(tcps_ecn_shs);
263			/*
264			 * Set the AccECN Codepoints on
265			 * the outgoing <ACK> to the ECN
266			 * state of the <SYN,ACK>
267			 * according to table 3 in the
268			 * AccECN draft
269			 */
270			switch (iptos & IPTOS_ECN_MASK) {
271			case (IPTOS_ECN_NOTECT):
272				tp->t_rcep = 0b010;
273				break;
274			case (IPTOS_ECN_ECT0):
275				tp->t_rcep = 0b100;
276				break;
277			case (IPTOS_ECN_ECT1):
278				tp->t_rcep = 0b011;
279				break;
280			case (IPTOS_ECN_CE):
281				tp->t_rcep = 0b110;
282				break;
283			}
284			break;
285		}
286		break;
287	}
288}
289
290/*
291 * TCP ECN processing.
292 */
293int
294tcp_ecn_input_segment(struct tcpcb *tp, uint16_t thflags, int tlen, int pkts, int iptos)
295{
296	int delta_cep = 0;
297
298	switch (iptos & IPTOS_ECN_MASK) {
299	case IPTOS_ECN_CE:
300		TCPSTAT_INC(tcps_ecn_rcvce);
301		break;
302	case IPTOS_ECN_ECT0:
303		TCPSTAT_INC(tcps_ecn_rcvect0);
304		break;
305	case IPTOS_ECN_ECT1:
306		TCPSTAT_INC(tcps_ecn_rcvect1);
307		break;
308	}
309
310	if (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) {
311		if (tp->t_flags2 & TF2_ACE_PERMIT) {
312			if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
313				tp->t_rcep += 1;
314			if (tp->t_flags2 & TF2_ECN_PERMIT) {
315				delta_cep = (tcp_ecn_get_ace(thflags) + 8 -
316					    (tp->t_scep & 7)) & 7;
317				if (delta_cep < pkts)
318					delta_cep = pkts -
319					    ((pkts - delta_cep) & 7);
320				tp->t_scep += delta_cep;
321			} else {
322				/*
323				 * process the final ACK of the 3WHS
324				 * see table 3 in draft-ietf-tcpm-accurate-ecn
325				 */
326				switch (tcp_ecn_get_ace(thflags)) {
327				case 0b010:
328					/* nonECT SYN or SYN,ACK */
329					/* FALLTHROUGH */
330				case 0b011:
331					/* ECT1 SYN or SYN,ACK */
332					/* FALLTHROUGH */
333				case 0b100:
334					/* ECT0 SYN or SYN,ACK */
335					tp->t_scep = 5;
336					break;
337				case 0b110:
338					/* CE SYN or SYN,ACK */
339					tp->t_scep = 6;
340					tp->snd_cwnd = 2 * tcp_maxseg(tp);
341					break;
342				default:
343					/* mangled AccECN handshake */
344					tp->t_scep = 5;
345					break;
346				}
347				tp->t_flags2 |= TF2_ECN_PERMIT;
348			}
349		} else {
350			/* RFC3168 ECN handling */
351			if ((thflags & (TH_SYN | TH_ECE)) == TH_ECE) {
352				delta_cep = 1;
353				tp->t_scep++;
354			}
355			if (thflags & TH_CWR) {
356				tp->t_flags2 &= ~TF2_ECN_SND_ECE;
357				tp->t_flags |= TF_ACKNOW;
358			}
359			if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
360				tp->t_flags2 |= TF2_ECN_SND_ECE;
361		}
362
363		/* Process a packet differently from RFC3168. */
364		cc_ecnpkt_handler_flags(tp, thflags, iptos);
365	}
366
367	return delta_cep;
368}
369
370/*
371 * Send ECN setup <SYN> packet header flags
372 */
373uint16_t
374tcp_ecn_output_syn_sent(struct tcpcb *tp)
375{
376	uint16_t thflags = 0;
377
378	if (V_tcp_do_ecn == 0)
379		return thflags;
380	if (V_tcp_do_ecn == 1) {
381		/* Send a RFC3168 ECN setup <SYN> packet */
382		if (tp->t_rxtshift >= 1) {
383			if (tp->t_rxtshift <= V_tcp_ecn_maxretries)
384				thflags = TH_ECE|TH_CWR;
385		} else
386			thflags = TH_ECE|TH_CWR;
387	} else if (V_tcp_do_ecn == 3) {
388		/* Send an Accurate ECN setup <SYN> packet */
389		if (tp->t_rxtshift >= 1) {
390			if (tp->t_rxtshift <= V_tcp_ecn_maxretries)
391				thflags = TH_ECE|TH_CWR|TH_AE;
392		} else
393			thflags = TH_ECE|TH_CWR|TH_AE;
394	}
395
396	return thflags;
397}
398
399/*
400 * output processing of ECN feature
401 * returning IP ECN header codepoint
402 */
403int
404tcp_ecn_output_established(struct tcpcb *tp, uint16_t *thflags, int len, bool rxmit)
405{
406	int ipecn = IPTOS_ECN_NOTECT;
407	bool newdata;
408
409	/*
410	 * If the peer has ECN, mark data packets with
411	 * ECN capable transmission (ECT).
412	 * Ignore pure control packets, retransmissions
413	 * and window probes.
414	 */
415	newdata = (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
416		    !rxmit &&
417		    !((tp->t_flags & TF_FORCEDATA) && len == 1));
418	/* RFC3168 ECN marking, only new data segments */
419	if (newdata) {
420		if (tp->t_flags2 & TF2_ECN_USE_ECT1) {
421			ipecn = IPTOS_ECN_ECT1;
422			TCPSTAT_INC(tcps_ecn_sndect1);
423		} else {
424			ipecn = IPTOS_ECN_ECT0;
425			TCPSTAT_INC(tcps_ecn_sndect0);
426		}
427	}
428	/*
429	 * Reply with proper ECN notifications.
430	 */
431	if (tp->t_flags2 & TF2_ACE_PERMIT) {
432		tcp_ecn_set_ace(thflags, tp->t_rcep);
433		if (!(tp->t_flags2 & TF2_ECN_PERMIT)) {
434			/*
435			 * here we process the final
436			 * ACK of the 3WHS
437			 */
438			if (tp->t_rcep == 0b110) {
439				tp->t_rcep = 6;
440			} else {
441				tp->t_rcep = 5;
442			}
443			tp->t_flags2 |= TF2_ECN_PERMIT;
444		}
445	} else {
446		if (newdata &&
447		    (tp->t_flags2 & TF2_ECN_SND_CWR)) {
448			*thflags |= TH_CWR;
449			tp->t_flags2 &= ~TF2_ECN_SND_CWR;
450		}
451		if (tp->t_flags2 & TF2_ECN_SND_ECE)
452			*thflags |= TH_ECE;
453	}
454
455	return ipecn;
456}
457
458/*
459 * Set up the ECN related tcpcb fields from
460 * a syncache entry
461 */
462void
463tcp_ecn_syncache_socket(struct tcpcb *tp, struct syncache *sc)
464{
465	if (sc->sc_flags & SCF_ECN_MASK) {
466		switch (sc->sc_flags & SCF_ECN_MASK) {
467		case SCF_ECN:
468			tp->t_flags2 |= TF2_ECN_PERMIT;
469			break;
470		case SCF_ACE_N:
471			/* FALLTHROUGH */
472		case SCF_ACE_0:
473			/* FALLTHROUGH */
474		case SCF_ACE_1:
475			tp->t_flags2 |= TF2_ACE_PERMIT;
476			tp->t_scep = 5;
477			tp->t_rcep = 5;
478			break;
479		case SCF_ACE_CE:
480			tp->t_flags2 |= TF2_ACE_PERMIT;
481			tp->t_scep = 6;
482			tp->t_rcep = 6;
483			break;
484		}
485	}
486}
487
488/*
489 * Process a <SYN> packets ECN information, and provide the
490 * syncache with the relevant information.
491 */
492int
493tcp_ecn_syncache_add(uint16_t thflags, int iptos)
494{
495	int scflags = 0;
496
497	switch (iptos & IPTOS_ECN_MASK) {
498	case IPTOS_ECN_CE:
499		TCPSTAT_INC(tcps_ecn_rcvce);
500		break;
501	case IPTOS_ECN_ECT0:
502		TCPSTAT_INC(tcps_ecn_rcvect0);
503		break;
504	case IPTOS_ECN_ECT1:
505		TCPSTAT_INC(tcps_ecn_rcvect1);
506		break;
507	}
508
509	switch (thflags & (TH_AE|TH_CWR|TH_ECE)) {
510	/* no ECN */
511	case (0|0|0):
512		break;
513	/* legacy ECN */
514	case (0|TH_CWR|TH_ECE):
515		scflags = SCF_ECN;
516		break;
517	/* Accurate ECN */
518	case (TH_AE|TH_CWR|TH_ECE):
519		if ((V_tcp_do_ecn == 3) ||
520		    (V_tcp_do_ecn == 4)) {
521			switch (iptos & IPTOS_ECN_MASK) {
522			case IPTOS_ECN_CE:
523				scflags = SCF_ACE_CE;
524				break;
525			case IPTOS_ECN_ECT0:
526				scflags = SCF_ACE_0;
527				break;
528			case IPTOS_ECN_ECT1:
529				scflags = SCF_ACE_1;
530				break;
531			case IPTOS_ECN_NOTECT:
532				scflags = SCF_ACE_N;
533				break;
534			}
535		} else
536			scflags = SCF_ECN;
537		break;
538	/* Default Case (section 3.1.2) */
539	default:
540		if ((V_tcp_do_ecn == 3) ||
541		    (V_tcp_do_ecn == 4)) {
542			switch (iptos & IPTOS_ECN_MASK) {
543			case IPTOS_ECN_CE:
544				scflags = SCF_ACE_CE;
545				break;
546			case IPTOS_ECN_ECT0:
547				scflags = SCF_ACE_0;
548				break;
549			case IPTOS_ECN_ECT1:
550				scflags = SCF_ACE_1;
551				break;
552			case IPTOS_ECN_NOTECT:
553				scflags = SCF_ACE_N;
554				break;
555			}
556		}
557		break;
558	}
559	return scflags;
560}
561
562/*
563 * Set up the ECN information for the <SYN,ACK> from
564 * syncache information.
565 */
566uint16_t
567tcp_ecn_syncache_respond(uint16_t thflags, struct syncache *sc)
568{
569	if ((thflags & TH_SYN) &&
570	    (sc->sc_flags & SCF_ECN_MASK)) {
571		switch (sc->sc_flags & SCF_ECN_MASK) {
572		case SCF_ECN:
573			thflags |= (0 | 0 | TH_ECE);
574			TCPSTAT_INC(tcps_ecn_shs);
575			break;
576		case SCF_ACE_N:
577			thflags |= (0 | TH_CWR | 0);
578			TCPSTAT_INC(tcps_ecn_shs);
579			TCPSTAT_INC(tcps_ace_nect);
580			break;
581		case SCF_ACE_0:
582			thflags |= (TH_AE | 0 | 0);
583			TCPSTAT_INC(tcps_ecn_shs);
584			TCPSTAT_INC(tcps_ace_ect0);
585			break;
586		case SCF_ACE_1:
587			thflags |= (0 | TH_ECE | TH_CWR);
588			TCPSTAT_INC(tcps_ecn_shs);
589			TCPSTAT_INC(tcps_ace_ect1);
590			break;
591		case SCF_ACE_CE:
592			thflags |= (TH_AE | TH_CWR | 0);
593			TCPSTAT_INC(tcps_ecn_shs);
594			TCPSTAT_INC(tcps_ace_ce);
595			break;
596		}
597	}
598	return thflags;
599}
600
601static inline int
602tcp_ecn_get_ace(uint16_t thflags)
603{
604	return ((thflags & (TH_AE|TH_CWR|TH_ECE)) >> TH_ACE_SHIFT);
605}
606
607static inline void
608tcp_ecn_set_ace(uint16_t *thflags, uint32_t t_rcep)
609{
610	*thflags &= ~(TH_AE|TH_CWR|TH_ECE);
611	*thflags |= ((t_rcep << TH_ACE_SHIFT) & (TH_AE|TH_CWR|TH_ECE));
612}
613