t4_listen.c revision 331722
1/*-
2 * Copyright (c) 2012 Chelsio Communications, Inc.
3 * All rights reserved.
4 * Written by: Navdeep Parhar <np@FreeBSD.org>
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 * SUCH DAMAGE.
26 */
27
28#include <sys/cdefs.h>
29__FBSDID("$FreeBSD: stable/11/sys/dev/cxgbe/tom/t4_listen.c 331722 2018-03-29 02:50:57Z eadler $");
30
31#include "opt_inet.h"
32#include "opt_inet6.h"
33
34#ifdef TCP_OFFLOAD
35#include <sys/param.h>
36#include <sys/types.h>
37#include <sys/kernel.h>
38#include <sys/ktr.h>
39#include <sys/module.h>
40#include <sys/protosw.h>
41#include <sys/refcount.h>
42#include <sys/domain.h>
43#include <sys/fnv_hash.h>
44#include <sys/socket.h>
45#include <sys/socketvar.h>
46#include <net/ethernet.h>
47#include <net/if.h>
48#include <net/if_types.h>
49#include <net/if_vlan_var.h>
50#include <net/route.h>
51#include <netinet/in.h>
52#include <netinet/in_fib.h>
53#include <netinet/in_pcb.h>
54#include <netinet/ip.h>
55#include <netinet/ip6.h>
56#include <netinet6/in6_fib.h>
57#include <netinet6/scope6_var.h>
58#include <netinet/tcp_timer.h>
59#define TCPSTATES
60#include <netinet/tcp_fsm.h>
61#include <netinet/tcp_var.h>
62#include <netinet/toecore.h>
63
64#include "common/common.h"
65#include "common/t4_msg.h"
66#include "common/t4_regs.h"
67#include "tom/t4_tom_l2t.h"
68#include "tom/t4_tom.h"
69
70/* stid services */
71static int alloc_stid(struct adapter *, struct listen_ctx *, int);
72static struct listen_ctx *lookup_stid(struct adapter *, int);
73static void free_stid(struct adapter *, struct listen_ctx *);
74
75/* lctx services */
76static struct listen_ctx *alloc_lctx(struct adapter *, struct inpcb *,
77    struct vi_info *);
78static int free_lctx(struct adapter *, struct listen_ctx *);
79static void hold_lctx(struct listen_ctx *);
80static void listen_hash_add(struct adapter *, struct listen_ctx *);
81static struct listen_ctx *listen_hash_find(struct adapter *, struct inpcb *);
82static struct listen_ctx *listen_hash_del(struct adapter *, struct inpcb *);
83static struct inpcb *release_lctx(struct adapter *, struct listen_ctx *);
84
85static inline void save_qids_in_mbuf(struct mbuf *, struct vi_info *);
86static inline void get_qids_from_mbuf(struct mbuf *m, int *, int *);
87static void send_reset_synqe(struct toedev *, struct synq_entry *);
88
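/*
 * stid bookkeeping: t->nstids_free_head counts the free stids at the start of
 * the stid range, and t->stids is a list of allocated stid_regions in
 * ascending stid order.  Each region records the stids it uses plus any free
 * stids immediately after it (including alignment padding), so freeing a
 * region just folds its space into the predecessor's free count (or back into
 * nstids_free_head if it was the first region).  An IPv4 listener uses 1 stid
 * and an IPv6 listener uses 2 naturally aligned stids; e.g. if the usable end
 * of a free gap falls on an odd stid, one stid is burned as padding (f = 1)
 * and accounted for in the new region's free count.
 */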
89static int
90alloc_stid(struct adapter *sc, struct listen_ctx *lctx, int isipv6)
91{
92	struct tid_info *t = &sc->tids;
93	u_int stid, n, f, mask;
94	struct stid_region *sr = &lctx->stid_region;
95
96	/*
97	 * An IPv6 server needs 2 naturally aligned stids (1 stid = 4 cells) in
98	 * the TCAM.  The start of the stid region is properly aligned (the chip
99	 * requires each region to be 128-cell aligned).
100	 */
101	n = isipv6 ? 2 : 1;
102	mask = n - 1;
103	KASSERT((t->stid_base & mask) == 0 && (t->nstids & mask) == 0,
104	    ("%s: stid region (%u, %u) not properly aligned.  n = %u",
105	    __func__, t->stid_base, t->nstids, n));
106
107	mtx_lock(&t->stid_lock);
108	if (n > t->nstids - t->stids_in_use) {
109		mtx_unlock(&t->stid_lock);
110		return (-1);
111	}
112
113	if (t->nstids_free_head >= n) {
114		/*
115		 * This allocation will definitely succeed because the region
116		 * starts at a good alignment and we just checked we have enough
117		 * stids free.
118		 */
119		f = t->nstids_free_head & mask;
120		t->nstids_free_head -= n + f;
121		stid = t->nstids_free_head;
122		TAILQ_INSERT_HEAD(&t->stids, sr, link);
123	} else {
124		struct stid_region *s;
125
126		stid = t->nstids_free_head;
127		TAILQ_FOREACH(s, &t->stids, link) {
128			stid += s->used + s->free;
129			f = stid & mask;
130			if (s->free >= n + f) {
131				stid -= n + f;
132				s->free -= n + f;
133				TAILQ_INSERT_AFTER(&t->stids, s, sr, link);
134				goto allocated;
135			}
136		}
137
138		if (__predict_false(stid != t->nstids)) {
139			panic("%s: stids TAILQ (%p) corrupt."
140			    "  At %d instead of %d at the end of the queue.",
141			    __func__, &t->stids, stid, t->nstids);
142		}
143
144		mtx_unlock(&t->stid_lock);
145		return (-1);
146	}
147
148allocated:
149	sr->used = n;
150	sr->free = f;
151	t->stids_in_use += n;
152	t->stid_tab[stid] = lctx;
153	mtx_unlock(&t->stid_lock);
154
155	KASSERT(((stid + t->stid_base) & mask) == 0,
156	    ("%s: EDOOFUS.", __func__));
157	return (stid + t->stid_base);
158}
159
160static struct listen_ctx *
161lookup_stid(struct adapter *sc, int stid)
162{
163	struct tid_info *t = &sc->tids;
164
165	return (t->stid_tab[stid - t->stid_base]);
166}
167
168static void
169free_stid(struct adapter *sc, struct listen_ctx *lctx)
170{
171	struct tid_info *t = &sc->tids;
172	struct stid_region *sr = &lctx->stid_region;
173	struct stid_region *s;
174
175	KASSERT(sr->used > 0, ("%s: nonsense free (%d)", __func__, sr->used));
176
177	mtx_lock(&t->stid_lock);
178	s = TAILQ_PREV(sr, stid_head, link);
179	if (s != NULL)
180		s->free += sr->used + sr->free;
181	else
182		t->nstids_free_head += sr->used + sr->free;
183	KASSERT(t->stids_in_use >= sr->used,
184	    ("%s: stids_in_use (%u) < stids being freed (%u)", __func__,
185	    t->stids_in_use, sr->used));
186	t->stids_in_use -= sr->used;
187	TAILQ_REMOVE(&t->stids, sr, link);
188	mtx_unlock(&t->stid_lock);
189}
190
191static struct listen_ctx *
192alloc_lctx(struct adapter *sc, struct inpcb *inp, struct vi_info *vi)
193{
194	struct listen_ctx *lctx;
195
196	INP_WLOCK_ASSERT(inp);
197
198	lctx = malloc(sizeof(struct listen_ctx), M_CXGBE, M_NOWAIT | M_ZERO);
199	if (lctx == NULL)
200		return (NULL);
201
202	lctx->stid = alloc_stid(sc, lctx, inp->inp_vflag & INP_IPV6);
203	if (lctx->stid < 0) {
204		free(lctx, M_CXGBE);
205		return (NULL);
206	}
207
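	/*
	 * A listener bound to a specific (non-wildcard) IPv6 address needs
	 * that address installed in the hardware's local IP (CLIP) table.
	 * hold_lip takes a reference on the entry; release_lip in free_lctx
	 * drops it.
	 */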
208	if (inp->inp_vflag & INP_IPV6 &&
209	    !IN6_ARE_ADDR_EQUAL(&in6addr_any, &inp->in6p_laddr)) {
210		struct tom_data *td = sc->tom_softc;
211
212		lctx->ce = hold_lip(td, &inp->in6p_laddr, NULL);
213		if (lctx->ce == NULL) {
214			free(lctx, M_CXGBE);
215			return (NULL);
216		}
217	}
218
219	lctx->ctrlq = &sc->sge.ctrlq[vi->pi->port_id];
220	lctx->ofld_rxq = &sc->sge.ofld_rxq[vi->first_ofld_rxq];
221	refcount_init(&lctx->refcount, 1);
222	TAILQ_INIT(&lctx->synq);
223
224	lctx->inp = inp;
225	lctx->vnet = inp->inp_socket->so_vnet;
226	in_pcbref(inp);
227
228	return (lctx);
229}
230
231/* Don't call this directly, use release_lctx instead */
232static int
233free_lctx(struct adapter *sc, struct listen_ctx *lctx)
234{
235	struct inpcb *inp = lctx->inp;
236	struct tom_data *td = sc->tom_softc;
237
238	INP_WLOCK_ASSERT(inp);
239	KASSERT(lctx->refcount == 0,
240	    ("%s: refcount %d", __func__, lctx->refcount));
241	KASSERT(TAILQ_EMPTY(&lctx->synq),
242	    ("%s: synq not empty.", __func__));
243	KASSERT(lctx->stid >= 0, ("%s: bad stid %d.", __func__, lctx->stid));
244
245	CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, inp %p",
246	    __func__, lctx->stid, lctx, lctx->inp);
247
248	if (lctx->ce)
249		release_lip(td, lctx->ce);
250	free_stid(sc, lctx);
251	free(lctx, M_CXGBE);
252
253	return (in_pcbrele_wlocked(inp));
254}
255
256static void
257hold_lctx(struct listen_ctx *lctx)
258{
259
260	refcount_acquire(&lctx->refcount);
261}
262
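/*
 * The listen hash is keyed on the inpcb pointer itself: fnv_32_buf runs over
 * the pointer value (&key), not over the structure it points to.
 */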
263static inline uint32_t
264listen_hashfn(void *key, u_long mask)
265{
266
267	return (fnv_32_buf(&key, sizeof(key), FNV1_32_INIT) & mask);
268}
269
270/*
271 * Add a listen_ctx entry to the listen hash table.
272 */
273static void
274listen_hash_add(struct adapter *sc, struct listen_ctx *lctx)
275{
276	struct tom_data *td = sc->tom_softc;
277	int bucket = listen_hashfn(lctx->inp, td->listen_mask);
278
279	mtx_lock(&td->lctx_hash_lock);
280	LIST_INSERT_HEAD(&td->listen_hash[bucket], lctx, link);
281	td->lctx_count++;
282	mtx_unlock(&td->lctx_hash_lock);
283}
284
285/*
286 * Look for the listening socket's context entry in the hash and return it.
287 */
288static struct listen_ctx *
289listen_hash_find(struct adapter *sc, struct inpcb *inp)
290{
291	struct tom_data *td = sc->tom_softc;
292	int bucket = listen_hashfn(inp, td->listen_mask);
293	struct listen_ctx *lctx;
294
295	mtx_lock(&td->lctx_hash_lock);
296	LIST_FOREACH(lctx, &td->listen_hash[bucket], link) {
297		if (lctx->inp == inp)
298			break;
299	}
300	mtx_unlock(&td->lctx_hash_lock);
301
302	return (lctx);
303}
304
305/*
306 * Removes the listen_ctx structure for inp from the hash and returns it.
307 */
308static struct listen_ctx *
309listen_hash_del(struct adapter *sc, struct inpcb *inp)
310{
311	struct tom_data *td = sc->tom_softc;
312	int bucket = listen_hashfn(inp, td->listen_mask);
313	struct listen_ctx *lctx, *l;
314
315	mtx_lock(&td->lctx_hash_lock);
316	LIST_FOREACH_SAFE(lctx, &td->listen_hash[bucket], link, l) {
317		if (lctx->inp == inp) {
318			LIST_REMOVE(lctx, link);
319			td->lctx_count--;
320			break;
321		}
322	}
323	mtx_unlock(&td->lctx_hash_lock);
324
325	return (lctx);
326}
327
328/*
329 * Releases a hold on the lctx.  Must be called with the listening socket's inp
330 * locked.  The inp may be freed by this function, in which case NULL is
331 * returned to indicate this.
332 */
333static struct inpcb *
334release_lctx(struct adapter *sc, struct listen_ctx *lctx)
335{
336	struct inpcb *inp = lctx->inp;
337	int inp_freed = 0;
338
339	INP_WLOCK_ASSERT(inp);
340	if (refcount_release(&lctx->refcount))
341		inp_freed = free_lctx(sc, lctx);
342
343	return (inp_freed ? NULL : inp);
344}
345
346static void
347send_reset_synqe(struct toedev *tod, struct synq_entry *synqe)
348{
349	struct adapter *sc = tod->tod_softc;
350	struct mbuf *m = synqe->syn;
351	struct ifnet *ifp = m->m_pkthdr.rcvif;
352	struct vi_info *vi = ifp->if_softc;
353	struct port_info *pi = vi->pi;
354	struct l2t_entry *e = &sc->l2t->l2tab[synqe->l2e_idx];
355	struct wrqe *wr;
356	struct fw_flowc_wr *flowc;
357	struct cpl_abort_req *req;
358	int txqid, rxqid, flowclen;
359	struct sge_wrq *ofld_txq;
360	struct sge_ofld_rxq *ofld_rxq;
361	const int nparams = 6;
362	unsigned int pfvf = G_FW_VIID_PFN(vi->viid) << S_FW_VIID_PFN;
363
364	INP_WLOCK_ASSERT(synqe->lctx->inp);
365
366	CTR5(KTR_CXGBE, "%s: synqe %p (0x%x), tid %d%s",
367	    __func__, synqe, synqe->flags, synqe->tid,
368	    synqe->flags & TPF_ABORT_SHUTDOWN ?
369	    " (abort already in progress)" : "");
370	if (synqe->flags & TPF_ABORT_SHUTDOWN)
371		return;	/* abort already in progress */
372	synqe->flags |= TPF_ABORT_SHUTDOWN;
373
374	get_qids_from_mbuf(m, &txqid, &rxqid);
375	ofld_txq = &sc->sge.ofld_txq[txqid];
376	ofld_rxq = &sc->sge.ofld_rxq[rxqid];
377
378	/* The wrqe will have two WRs - a flowc followed by an abort_req */
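	/*
	 * The flowc goes out first so that the firmware has per-flow state for
	 * this tid before it sees the abort; TPF_FLOWC_WR_SENT is set below to
	 * record that it has been sent.
	 */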
379	flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);
380
381	wr = alloc_wrqe(roundup2(flowclen, EQ_ESIZE) + sizeof(*req), ofld_txq);
382	if (wr == NULL) {
383		/* XXX */
384		panic("%s: allocation failure.", __func__);
385	}
386	flowc = wrtod(wr);
387	req = (void *)((caddr_t)flowc + roundup2(flowclen, EQ_ESIZE));
388
389	/* First the flowc ... */
390	memset(flowc, 0, wr->wr_len);
391	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
392	    V_FW_FLOWC_WR_NPARAMS(nparams));
393	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
394	    V_FW_WR_FLOWID(synqe->tid));
395	flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
396	flowc->mnemval[0].val = htobe32(pfvf);
397	flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
398	flowc->mnemval[1].val = htobe32(pi->tx_chan);
399	flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT;
400	flowc->mnemval[2].val = htobe32(pi->tx_chan);
401	flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID;
402	flowc->mnemval[3].val = htobe32(ofld_rxq->iq.abs_id);
403	flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDBUF;
404	flowc->mnemval[4].val = htobe32(512);
405	flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_MSS;
406	flowc->mnemval[5].val = htobe32(512);
407	synqe->flags |= TPF_FLOWC_WR_SENT;
408
409	/* ... then ABORT request */
410	INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, synqe->tid);
411	req->rsvd0 = 0;	/* don't have a snd_nxt */
412	req->rsvd1 = 1;	/* no data sent yet */
413	req->cmd = CPL_ABORT_SEND_RST;
414
415	t4_l2t_send(sc, wr, e);
416}
417
418static int
419create_server(struct adapter *sc, struct listen_ctx *lctx)
420{
421	struct wrqe *wr;
422	struct cpl_pass_open_req *req;
423	struct inpcb *inp = lctx->inp;
424
425	wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
426	if (wr == NULL) {
427		log(LOG_ERR, "%s: allocation failure", __func__);
428		return (ENOMEM);
429	}
430	req = wrtod(wr);
431
432	INIT_TP_WR(req, 0);
433	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, lctx->stid));
434	req->local_port = inp->inp_lport;
435	req->peer_port = 0;
436	req->local_ip = inp->inp_laddr.s_addr;
437	req->peer_ip = 0;
438	req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan));
439	req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) |
440	    F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id));
441
442	t4_wrq_tx(sc, wr);
443	return (0);
444}
445
446static int
447create_server6(struct adapter *sc, struct listen_ctx *lctx)
448{
449	struct wrqe *wr;
450	struct cpl_pass_open_req6 *req;
451	struct inpcb *inp = lctx->inp;
452
453	wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
454	if (wr == NULL) {
455		log(LOG_ERR, "%s: allocation failure", __func__);
456		return (ENOMEM);
457	}
458	req = wrtod(wr);
459
460	INIT_TP_WR(req, 0);
461	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ6, lctx->stid));
462	req->local_port = inp->inp_lport;
463	req->peer_port = 0;
464	req->local_ip_hi = *(uint64_t *)&inp->in6p_laddr.s6_addr[0];
465	req->local_ip_lo = *(uint64_t *)&inp->in6p_laddr.s6_addr[8];
466	req->peer_ip_hi = 0;
467	req->peer_ip_lo = 0;
468	req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan));
469	req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) |
470	    F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id));
471
472	t4_wrq_tx(sc, wr);
473	return (0);
474}
475
476static int
477destroy_server(struct adapter *sc, struct listen_ctx *lctx)
478{
479	struct wrqe *wr;
480	struct cpl_close_listsvr_req *req;
481
482	wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
483	if (wr == NULL) {
484		/* XXX */
485		panic("%s: allocation failure.", __func__);
486	}
487	req = wrtod(wr);
488
489	INIT_TP_WR(req, 0);
490	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ,
491	    lctx->stid));
492	req->reply_ctrl = htobe16(lctx->ofld_rxq->iq.abs_id);
493	req->rsvd = htobe16(0);
494
495	t4_wrq_tx(sc, wr);
496	return (0);
497}
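/*
 * The chip answers with CPL_CLOSE_LISTSRV_RPL, handled by do_close_server_rpl
 * below; that is where the lctx (and with it the stid and the reference on
 * the listening inp) is finally released.
 */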
498
499/*
500 * Start a listening server by sending a passive open request to HW.
501 *
502 * We can't take the adapter lock here, so accesses to sc->flags,
503 * sc->offload_map, and if_capenable are all race prone.
504 */
505int
506t4_listen_start(struct toedev *tod, struct tcpcb *tp)
507{
508	struct adapter *sc = tod->tod_softc;
509	struct vi_info *vi;
510	struct port_info *pi;
511	struct inpcb *inp = tp->t_inpcb;
512	struct listen_ctx *lctx;
513	int i, rc, v;
514
515	INP_WLOCK_ASSERT(inp);
516
517	/* Don't start a hardware listener for any loopback address. */
518	if (inp->inp_vflag & INP_IPV6 && IN6_IS_ADDR_LOOPBACK(&inp->in6p_laddr))
519		return (0);
520	if (!(inp->inp_vflag & INP_IPV6) &&
521	    IN_LOOPBACK(ntohl(inp->inp_laddr.s_addr)))
522		return (0);
523#if 0
524	ADAPTER_LOCK(sc);
525	if (IS_BUSY(sc)) {
526		log(LOG_ERR, "%s: listen request ignored, %s is busy",
527		    __func__, device_get_nameunit(sc->dev));
528		goto done;
529	}
530
531	KASSERT(uld_active(sc, ULD_TOM),
532	    ("%s: TOM not initialized", __func__));
533#endif
534
535	/*
536	 * Find an initialized VI with IFCAP_TOE (4 or 6).  We'll use the first
537	 * such VI's queues to send the passive open and receive the reply to
538	 * it.
539	 *
540	 * XXX: need a way to mark a port in use by offload.  if_cxgbe should
541	 * then reject any attempt to bring down such a port (and maybe reject
542	 * attempts to disable IFCAP_TOE on that port too?).
543	 */
544	for_each_port(sc, i) {
545		pi = sc->port[i];
546		for_each_vi(pi, v, vi) {
547			if (vi->flags & VI_INIT_DONE &&
548			    vi->ifp->if_capenable & IFCAP_TOE)
549				goto found;
550		}
551	}
552	goto done;	/* no port that's UP with IFCAP_TOE enabled */
553found:
554
555	if (listen_hash_find(sc, inp) != NULL)
556		goto done;	/* already setup */
557
558	lctx = alloc_lctx(sc, inp, vi);
559	if (lctx == NULL) {
560		log(LOG_ERR,
561		    "%s: listen request ignored, %s couldn't allocate lctx\n",
562		    __func__, device_get_nameunit(sc->dev));
563		goto done;
564	}
565	listen_hash_add(sc, lctx);
566
567	CTR6(KTR_CXGBE, "%s: stid %u (%s), lctx %p, inp %p vflag 0x%x",
568	    __func__, lctx->stid, tcpstates[tp->t_state], lctx, inp,
569	    inp->inp_vflag);
570
571	if (inp->inp_vflag & INP_IPV6)
572		rc = create_server6(sc, lctx);
573	else
574		rc = create_server(sc, lctx);
575	if (rc != 0) {
576		log(LOG_ERR, "%s: %s failed to create hw listener: %d.\n",
577		    __func__, device_get_nameunit(sc->dev), rc);
578		(void) listen_hash_del(sc, inp);
579		inp = release_lctx(sc, lctx);
580		/* can't be freed, host stack has a reference */
581		KASSERT(inp != NULL, ("%s: inp freed", __func__));
582		goto done;
583	}
584	lctx->flags |= LCTX_RPL_PENDING;
585done:
586#if 0
587	ADAPTER_UNLOCK(sc);
588#endif
589	return (0);
590}
591
592int
593t4_listen_stop(struct toedev *tod, struct tcpcb *tp)
594{
595	struct listen_ctx *lctx;
596	struct adapter *sc = tod->tod_softc;
597	struct inpcb *inp = tp->t_inpcb;
598	struct synq_entry *synqe;
599
600	INP_WLOCK_ASSERT(inp);
601
602	lctx = listen_hash_del(sc, inp);
603	if (lctx == NULL)
604		return (ENOENT);	/* no hardware listener for this inp */
605
606	CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, flags %x", __func__, lctx->stid,
607	    lctx, lctx->flags);
608
609	/*
610	 * If the reply to the PASS_OPEN is still pending we'll wait for it to
611	 * arrive and clean up when it does.
612	 */
613	if (lctx->flags & LCTX_RPL_PENDING) {
614		KASSERT(TAILQ_EMPTY(&lctx->synq),
615		    ("%s: synq not empty.", __func__));
616		return (EINPROGRESS);
617	}
618
619	/*
620	 * The host stack will abort all the connections on the listening
621	 * socket's so_comp.  It doesn't know about the connections on the synq
622	 * so we need to take care of those.
623	 */
624	TAILQ_FOREACH(synqe, &lctx->synq, link) {
625		if (synqe->flags & TPF_SYNQE_HAS_L2TE)
626			send_reset_synqe(tod, synqe);
627	}
628
629	destroy_server(sc, lctx);
630	return (0);
631}
632
633static inline void
634hold_synqe(struct synq_entry *synqe)
635{
636
637	refcount_acquire(&synqe->refcnt);
638}
639
640static inline void
641release_synqe(struct synq_entry *synqe)
642{
643
644	if (refcount_release(&synqe->refcnt)) {
645		int needfree = synqe->flags & TPF_SYNQE_NEEDFREE;
646
647		m_freem(synqe->syn);
648		if (needfree)
649			free(synqe, M_CXGBE);
650	}
651}
652
653void
654t4_syncache_added(struct toedev *tod __unused, void *arg)
655{
656	struct synq_entry *synqe = arg;
657
658	hold_synqe(synqe);
659}
660
661void
662t4_syncache_removed(struct toedev *tod __unused, void *arg)
663{
664	struct synq_entry *synqe = arg;
665
666	release_synqe(synqe);
667}
668
669int
670t4_syncache_respond(struct toedev *tod, void *arg, struct mbuf *m)
671{
672	struct adapter *sc = tod->tod_softc;
673	struct synq_entry *synqe = arg;
674	struct wrqe *wr;
675	struct l2t_entry *e;
676	struct tcpopt to;
677	struct ip *ip = mtod(m, struct ip *);
678	struct tcphdr *th;
679
680	wr = (struct wrqe *)atomic_readandclear_ptr(&synqe->wr);
681	if (wr == NULL) {
682		m_freem(m);
683		return (EALREADY);
684	}
685
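	/*
	 * 'm' is the SYN|ACK that the kernel's syncache built for this
	 * connection.  Parse it so that the ISS and timestamp the stack chose
	 * can be saved in the synqe (synqe_to_protohdrs uses them later to
	 * reconstruct the peer's ACK) and, on T5+, passed to the chip in
	 * rpl5->iss.
	 */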
686	if (ip->ip_v == IPVERSION)
687		th = (void *)(ip + 1);
688	else
689		th = (void *)((struct ip6_hdr *)ip + 1);
690	bzero(&to, sizeof(to));
691	tcp_dooptions(&to, (void *)(th + 1), (th->th_off << 2) - sizeof(*th),
692	    TO_SYN);
693
694	/* save these for later */
695	synqe->iss = be32toh(th->th_seq);
696	synqe->ts = to.to_tsval;
697
698	if (chip_id(sc) >= CHELSIO_T5) {
699		struct cpl_t5_pass_accept_rpl *rpl5 = wrtod(wr);
700
701		rpl5->iss = th->th_seq;
702	}
703
704	e = &sc->l2t->l2tab[synqe->l2e_idx];
705	t4_l2t_send(sc, wr, e);
706
707	m_freem(m);	/* don't need this any more */
708	return (0);
709}
710
711static int
712do_pass_open_rpl(struct sge_iq *iq, const struct rss_header *rss,
713    struct mbuf *m)
714{
715	struct adapter *sc = iq->adapter;
716	const struct cpl_pass_open_rpl *cpl = (const void *)(rss + 1);
717	int stid = GET_TID(cpl);
718	unsigned int status = cpl->status;
719	struct listen_ctx *lctx = lookup_stid(sc, stid);
720	struct inpcb *inp = lctx->inp;
721#ifdef INVARIANTS
722	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
723#endif
724
725	KASSERT(opcode == CPL_PASS_OPEN_RPL,
726	    ("%s: unexpected opcode 0x%x", __func__, opcode));
727	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
728	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
729
730	INP_WLOCK(inp);
731
732	CTR4(KTR_CXGBE, "%s: stid %d, status %u, flags 0x%x",
733	    __func__, stid, status, lctx->flags);
734
735	lctx->flags &= ~LCTX_RPL_PENDING;
736
737	if (status != CPL_ERR_NONE)
738		log(LOG_ERR, "listener (stid %u) failed: %d\n", stid, status);
739
740#ifdef INVARIANTS
741	/*
742	 * If the inp has been dropped (listening socket closed) then
743	 * listen_stop must have run and taken the inp out of the hash.
744	 */
745	if (inp->inp_flags & INP_DROPPED) {
746		KASSERT(listen_hash_del(sc, inp) == NULL,
747		    ("%s: inp %p still in listen hash", __func__, inp));
748	}
749#endif
750
751	if (inp->inp_flags & INP_DROPPED && status != CPL_ERR_NONE) {
752		if (release_lctx(sc, lctx) != NULL)
753			INP_WUNLOCK(inp);
754		return (status);
755	}
756
757	/*
758	 * Listening socket stopped listening earlier and now the chip tells us
759	 * it has started the hardware listener.  Stop it; the lctx will be
760	 * released in do_close_server_rpl.
761	 */
762	if (inp->inp_flags & INP_DROPPED) {
763		destroy_server(sc, lctx);
764		INP_WUNLOCK(inp);
765		return (status);
766	}
767
768	/*
769	 * Failed to start hardware listener.  Take inp out of the hash and
770	 * release our reference on it.  An error message has been logged
771	 * already.
772	 */
773	if (status != CPL_ERR_NONE) {
774		listen_hash_del(sc, inp);
775		if (release_lctx(sc, lctx) != NULL)
776			INP_WUNLOCK(inp);
777		return (status);
778	}
779
780	/* hardware listener open for business */
781
782	INP_WUNLOCK(inp);
783	return (status);
784}
785
786static int
787do_close_server_rpl(struct sge_iq *iq, const struct rss_header *rss,
788    struct mbuf *m)
789{
790	struct adapter *sc = iq->adapter;
791	const struct cpl_close_listsvr_rpl *cpl = (const void *)(rss + 1);
792	int stid = GET_TID(cpl);
793	unsigned int status = cpl->status;
794	struct listen_ctx *lctx = lookup_stid(sc, stid);
795	struct inpcb *inp = lctx->inp;
796#ifdef INVARIANTS
797	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
798#endif
799
800	KASSERT(opcode == CPL_CLOSE_LISTSRV_RPL,
801	    ("%s: unexpected opcode 0x%x", __func__, opcode));
802	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
803	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
804
805	CTR3(KTR_CXGBE, "%s: stid %u, status %u", __func__, stid, status);
806
807	if (status != CPL_ERR_NONE) {
808		log(LOG_ERR, "%s: failed (%u) to close listener for stid %u\n",
809		    __func__, status, stid);
810		return (status);
811	}
812
813	INP_WLOCK(inp);
814	inp = release_lctx(sc, lctx);
815	if (inp != NULL)
816		INP_WUNLOCK(inp);
817
818	return (status);
819}
820
821static void
822done_with_synqe(struct adapter *sc, struct synq_entry *synqe)
823{
824	struct listen_ctx *lctx = synqe->lctx;
825	struct inpcb *inp = lctx->inp;
826	struct vi_info *vi = synqe->syn->m_pkthdr.rcvif->if_softc;
827	struct l2t_entry *e = &sc->l2t->l2tab[synqe->l2e_idx];
828	int ntids;
829
830	INP_WLOCK_ASSERT(inp);
831	ntids = inp->inp_vflag & INP_IPV6 ? 2 : 1;
832
833	TAILQ_REMOVE(&lctx->synq, synqe, link);
834	inp = release_lctx(sc, lctx);
835	if (inp)
836		INP_WUNLOCK(inp);
837	remove_tid(sc, synqe->tid, ntids);
838	release_tid(sc, synqe->tid, &sc->sge.ctrlq[vi->pi->port_id]);
839	t4_l2t_release(e);
840	release_synqe(synqe);	/* removed from synq list */
841}
842
843int
844do_abort_req_synqe(struct sge_iq *iq, const struct rss_header *rss,
845    struct mbuf *m)
846{
847	struct adapter *sc = iq->adapter;
848	const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1);
849	unsigned int tid = GET_TID(cpl);
850	struct synq_entry *synqe = lookup_tid(sc, tid);
851	struct listen_ctx *lctx = synqe->lctx;
852	struct inpcb *inp = lctx->inp;
853	int txqid;
854	struct sge_wrq *ofld_txq;
855#ifdef INVARIANTS
856	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
857#endif
858
859	KASSERT(opcode == CPL_ABORT_REQ_RSS,
860	    ("%s: unexpected opcode 0x%x", __func__, opcode));
861	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
862	KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__));
863
864	CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d",
865	    __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status);
866
867	if (negative_advice(cpl->status))
868		return (0);	/* Ignore negative advice */
869
870	INP_WLOCK(inp);
871
872	get_qids_from_mbuf(synqe->syn, &txqid, NULL);
873	ofld_txq = &sc->sge.ofld_txq[txqid];
874
875	/*
876	 * If we'd initiated an abort earlier, the reply to it is responsible
877	 * for cleaning up resources.  Otherwise we tear everything down right
878	 * here and now.  We owe the T4 a CPL_ABORT_RPL no matter what.
879	 */
880	if (synqe->flags & TPF_ABORT_SHUTDOWN) {
881		INP_WUNLOCK(inp);
882		goto done;
883	}
884
885	done_with_synqe(sc, synqe);
886	/* inp lock released by done_with_synqe */
887done:
888	send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST);
889	return (0);
890}
891
892int
893do_abort_rpl_synqe(struct sge_iq *iq, const struct rss_header *rss,
894    struct mbuf *m)
895{
896	struct adapter *sc = iq->adapter;
897	const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1);
898	unsigned int tid = GET_TID(cpl);
899	struct synq_entry *synqe = lookup_tid(sc, tid);
900	struct listen_ctx *lctx = synqe->lctx;
901	struct inpcb *inp = lctx->inp;
902#ifdef INVARIANTS
903	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
904#endif
905
906	KASSERT(opcode == CPL_ABORT_RPL_RSS,
907	    ("%s: unexpected opcode 0x%x", __func__, opcode));
908	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
909	KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__));
910
911	CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d",
912	    __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status);
913
914	INP_WLOCK(inp);
915	KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN,
916	    ("%s: wasn't expecting abort reply for synqe %p (0x%x)",
917	    __func__, synqe, synqe->flags));
918
919	done_with_synqe(sc, synqe);
920	/* inp lock released by done_with_synqe */
921
922	return (0);
923}
924
925void
926t4_offload_socket(struct toedev *tod, void *arg, struct socket *so)
927{
928	struct adapter *sc = tod->tod_softc;
929	struct synq_entry *synqe = arg;
930#ifdef INVARIANTS
931	struct inpcb *inp = sotoinpcb(so);
932#endif
933	struct cpl_pass_establish *cpl = mtod(synqe->syn, void *);
934	struct toepcb *toep = *(struct toepcb **)(cpl + 1);
935
936	INP_INFO_RLOCK_ASSERT(&V_tcbinfo); /* prevents bad race with accept() */
937	INP_WLOCK_ASSERT(inp);
938	KASSERT(synqe->flags & TPF_SYNQE,
939	    ("%s: %p not a synq_entry?", __func__, arg));
940
941	offload_socket(so, toep);
942	make_established(toep, cpl->snd_isn, cpl->rcv_isn, cpl->tcp_opt);
943	toep->flags |= TPF_CPL_PENDING;
944	update_tid(sc, synqe->tid, toep);
945	synqe->flags |= TPF_SYNQE_EXPANDED;
946}
947
948static inline void
949save_qids_in_mbuf(struct mbuf *m, struct vi_info *vi)
950{
951	uint32_t txqid, rxqid;
952
953	txqid = (arc4random() % vi->nofldtxq) + vi->first_ofld_txq;
954	rxqid = (arc4random() % vi->nofldrxq) + vi->first_ofld_rxq;
955
956	m->m_pkthdr.flowid = (txqid << 16) | (rxqid & 0xffff);
957}
958
959static inline void
960get_qids_from_mbuf(struct mbuf *m, int *txqid, int *rxqid)
961{
962
963	if (txqid)
964		*txqid = m->m_pkthdr.flowid >> 16;
965	if (rxqid)
966		*rxqid = m->m_pkthdr.flowid & 0xffff;
967}
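/*
 * Example: txqid 3 and rxqid 5 are stored as flowid 0x00030005 and recovered
 * from the high and low 16 bits respectively.  The queue ids are chosen at
 * random from the VI's offload queues and ride along in the SYN mbuf until a
 * toepcb is allocated for the connection.
 */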
968
969/*
970 * Use the trailing space in the mbuf in which the PASS_ACCEPT_REQ arrived to
971 * store some state temporarily.
972 */
973static struct synq_entry *
974mbuf_to_synqe(struct mbuf *m)
975{
976	int len = roundup2(sizeof (struct synq_entry), 8);
977	int tspace = M_TRAILINGSPACE(m);
978	struct synq_entry *synqe = NULL;
979
980	if (tspace < len) {
981		synqe = malloc(sizeof(*synqe), M_CXGBE, M_NOWAIT);
982		if (synqe == NULL)
983			return (NULL);
984		synqe->flags = TPF_SYNQE | TPF_SYNQE_NEEDFREE;
985	} else {
986		synqe = (void *)(m->m_data + m->m_len + tspace - len);
987		synqe->flags = TPF_SYNQE;
988	}
989
990	return (synqe);
991}
992
993static void
994t4opt_to_tcpopt(const struct tcp_options *t4opt, struct tcpopt *to)
995{
996	bzero(to, sizeof(*to));
997
998	if (t4opt->mss) {
999		to->to_flags |= TOF_MSS;
1000		to->to_mss = be16toh(t4opt->mss);
1001	}
1002
1003	if (t4opt->wsf) {
1004		to->to_flags |= TOF_SCALE;
1005		to->to_wscale = t4opt->wsf;
1006	}
1007
1008	if (t4opt->tstamp)
1009		to->to_flags |= TOF_TS;
1010
1011	if (t4opt->sack)
1012		to->to_flags |= TOF_SACKPERM;
1013}
1014
1015/*
1016 * Options2 for passive open.
1017 */
1018static uint32_t
1019calc_opt2p(struct adapter *sc, struct port_info *pi, int rxqid,
1020    const struct tcp_options *tcpopt, struct tcphdr *th, int ulp_mode)
1021{
1022	struct sge_ofld_rxq *ofld_rxq = &sc->sge.ofld_rxq[rxqid];
1023	uint32_t opt2;
1024
1025	opt2 = V_TX_QUEUE(sc->params.tp.tx_modq[pi->tx_chan]) |
1026	    F_RSS_QUEUE_VALID | V_RSS_QUEUE(ofld_rxq->iq.abs_id);
1027
1028	if (V_tcp_do_rfc1323) {
1029		if (tcpopt->tstamp)
1030			opt2 |= F_TSTAMPS_EN;
1031		if (tcpopt->sack)
1032			opt2 |= F_SACK_EN;
1033		if (tcpopt->wsf <= 14)
1034			opt2 |= F_WND_SCALE_EN;
1035	}
1036
1037	if (V_tcp_do_ecn && th->th_flags & (TH_ECE | TH_CWR))
1038		opt2 |= F_CCTRL_ECN;
1039
1040	/* RX_COALESCE is always a valid value (0 or M_RX_COALESCE). */
1041	if (is_t4(sc))
1042		opt2 |= F_RX_COALESCE_VALID;
1043	else {
1044		opt2 |= F_T5_OPT_2_VALID;
1045		opt2 |= F_T5_ISS;
1046	}
1047	if (sc->tt.rx_coalesce)
1048		opt2 |= V_RX_COALESCE(M_RX_COALESCE);
1049
1050	if (sc->tt.cong_algorithm != -1)
1051		opt2 |= V_CONG_CNTRL(sc->tt.cong_algorithm & M_CONG_CNTRL);
1052
1053#ifdef USE_DDP_RX_FLOW_CONTROL
1054	if (ulp_mode == ULP_MODE_TCPDDP)
1055		opt2 |= F_RX_FC_VALID | F_RX_FC_DDP;
1056#endif
1057
1058	return htobe32(opt2);
1059}
1060
1061static void
1062pass_accept_req_to_protohdrs(struct adapter *sc, const struct mbuf *m,
1063    struct in_conninfo *inc, struct tcphdr *th)
1064{
1065	const struct cpl_pass_accept_req *cpl = mtod(m, const void *);
1066	const struct ether_header *eh;
1067	unsigned int hlen = be32toh(cpl->hdr_len);
1068	uintptr_t l3hdr;
1069	const struct tcphdr *tcp;
1070
1071	eh = (const void *)(cpl + 1);
1072	if (chip_id(sc) >= CHELSIO_T6) {
1073		l3hdr = ((uintptr_t)eh + G_T6_ETH_HDR_LEN(hlen));
1074		tcp = (const void *)(l3hdr + G_T6_IP_HDR_LEN(hlen));
1075	} else {
1076		l3hdr = ((uintptr_t)eh + G_ETH_HDR_LEN(hlen));
1077		tcp = (const void *)(l3hdr + G_IP_HDR_LEN(hlen));
1078	}
1079
1080	if (inc) {
1081		bzero(inc, sizeof(*inc));
1082		inc->inc_fport = tcp->th_sport;
1083		inc->inc_lport = tcp->th_dport;
1084		if (((struct ip *)l3hdr)->ip_v == IPVERSION) {
1085			const struct ip *ip = (const void *)l3hdr;
1086
1087			inc->inc_faddr = ip->ip_src;
1088			inc->inc_laddr = ip->ip_dst;
1089		} else {
1090			const struct ip6_hdr *ip6 = (const void *)l3hdr;
1091
1092			inc->inc_flags |= INC_ISIPV6;
1093			inc->inc6_faddr = ip6->ip6_src;
1094			inc->inc6_laddr = ip6->ip6_dst;
1095		}
1096	}
1097
1098	if (th) {
1099		bcopy(tcp, th, sizeof(*th));
1100		tcp_fields_to_host(th);		/* just like tcp_input */
1101	}
1102}
1103
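/*
 * Find an L2 table entry for the next hop to use when replying to this SYN.
 * The return route is looked up in the kernel FIB and NULL is returned (no
 * offload) if the next hop doesn't point back out the ifnet the SYN arrived
 * on.
 */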
1104static struct l2t_entry *
1105get_l2te_for_nexthop(struct port_info *pi, struct ifnet *ifp,
1106    struct in_conninfo *inc)
1107{
1108	struct l2t_entry *e;
1109	struct sockaddr_in6 sin6;
1110	struct sockaddr *dst = (void *)&sin6;
1111
1112	if (inc->inc_flags & INC_ISIPV6) {
1113		struct nhop6_basic nh6;
1114
1115		bzero(dst, sizeof(struct sockaddr_in6));
1116		dst->sa_len = sizeof(struct sockaddr_in6);
1117		dst->sa_family = AF_INET6;
1118
1119		if (IN6_IS_ADDR_LINKLOCAL(&inc->inc6_laddr)) {
1120			/* no need for route lookup */
1121			e = t4_l2t_get(pi, ifp, dst);
1122			return (e);
1123		}
1124
1125		if (fib6_lookup_nh_basic(RT_DEFAULT_FIB, &inc->inc6_faddr,
1126		    0, 0, 0, &nh6) != 0)
1127			return (NULL);
1128		if (nh6.nh_ifp != ifp)
1129			return (NULL);
1130		((struct sockaddr_in6 *)dst)->sin6_addr = nh6.nh_addr;
1131	} else {
1132		struct nhop4_basic nh4;
1133
1134		dst->sa_len = sizeof(struct sockaddr_in);
1135		dst->sa_family = AF_INET;
1136
1137		if (fib4_lookup_nh_basic(RT_DEFAULT_FIB, inc->inc_faddr, 0, 0,
1138		    &nh4) != 0)
1139			return (NULL);
1140		if (nh4.nh_ifp != ifp)
1141			return (NULL);
1142		((struct sockaddr_in *)dst)->sin_addr = nh4.nh_addr;
1143	}
1144
1145	e = t4_l2t_get(pi, ifp, dst);
1146	return (e);
1147}
1148
1149#define REJECT_PASS_ACCEPT()	do { \
1150	reject_reason = __LINE__; \
1151	goto reject; \
1152} while (0)
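/*
 * REJECT_PASS_ACCEPT records the line number of the check that failed as the
 * reject reason; it shows up in the KTR trace at the 'reject' label and is
 * also the handler's return value.
 */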
1153
1154/*
1155 * The context associated with a tid entry via insert_tid could be a synq_entry
1156 * or a toepcb.  The only way CPL handlers can tell is via a bit in these flags.
1157 */
1158CTASSERT(offsetof(struct toepcb, flags) == offsetof(struct synq_entry, flags));
1159
1160/*
1161 * Incoming SYN on a listening socket.
1162 *
1163 * XXX: Every use of ifp in this routine has a bad race with up/down, toe/-toe,
1164 * etc.
1165 */
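/*
 * Where possible a rejected SYN is not dropped: the reject path strips the
 * CPL header off the mbuf, marks its checksums as already verified, and hands
 * the packet to the ifnet's if_input so the host stack can still complete the
 * handshake in software.
 */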
1166static int
1167do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss,
1168    struct mbuf *m)
1169{
1170	struct adapter *sc = iq->adapter;
1171	struct toedev *tod;
1172	const struct cpl_pass_accept_req *cpl = mtod(m, const void *);
1173	struct cpl_pass_accept_rpl *rpl;
1174	struct wrqe *wr;
1175	unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
1176	unsigned int tid = GET_TID(cpl);
1177	struct listen_ctx *lctx = lookup_stid(sc, stid);
1178	struct inpcb *inp;
1179	struct socket *so;
1180	struct in_conninfo inc;
1181	struct tcphdr th;
1182	struct tcpopt to;
1183	struct port_info *pi;
1184	struct vi_info *vi;
1185	struct ifnet *hw_ifp, *ifp;
1186	struct l2t_entry *e = NULL;
1187	int rscale, mtu_idx, rx_credits, rxqid, ulp_mode;
1188	struct synq_entry *synqe = NULL;
1189	int reject_reason, v, ntids;
1190	uint16_t vid;
1191#ifdef INVARIANTS
1192	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
1193#endif
1194
1195	KASSERT(opcode == CPL_PASS_ACCEPT_REQ,
1196	    ("%s: unexpected opcode 0x%x", __func__, opcode));
1197	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
1198
1199	CTR4(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p", __func__, stid, tid,
1200	    lctx);
1201
1202	pass_accept_req_to_protohdrs(sc, m, &inc, &th);
1203	t4opt_to_tcpopt(&cpl->tcpopt, &to);
1204
1205	pi = sc->port[G_SYN_INTF(be16toh(cpl->l2info))];
1206
1207	CURVNET_SET(lctx->vnet);
1208
1209	/*
1210	 * Use the MAC index to lookup the associated VI.  If this SYN
1211	 * didn't match a perfect MAC filter, punt.
1212	 */
1213	if (!(be16toh(cpl->l2info) & F_SYN_XACT_MATCH)) {
1214		m_freem(m);
1215		m = NULL;
1216		REJECT_PASS_ACCEPT();
1217	}
1218	for_each_vi(pi, v, vi) {
1219		if (vi->xact_addr_filt == G_SYN_MAC_IDX(be16toh(cpl->l2info)))
1220			goto found;
1221	}
1222	m_freem(m);
1223	m = NULL;
1224	REJECT_PASS_ACCEPT();
1225
1226found:
1227	hw_ifp = vi->ifp;	/* the (v)cxgbeX ifnet */
1228	m->m_pkthdr.rcvif = hw_ifp;
1229	tod = TOEDEV(hw_ifp);
1230
1231	/*
1232	 * Figure out if there is a pseudo interface (vlan, lagg, etc.)
1233	 * involved.  Don't offload if the SYN had a VLAN tag and the vid
1234	 * doesn't match anything on this interface.
1235	 *
1236	 * XXX: lagg support, lagg + vlan support.
1237	 */
1238	vid = EVL_VLANOFTAG(be16toh(cpl->vlan));
1239	if (vid != 0xfff) {
1240		ifp = VLAN_DEVAT(hw_ifp, vid);
1241		if (ifp == NULL)
1242			REJECT_PASS_ACCEPT();
1243	} else
1244		ifp = hw_ifp;
1245
1246	/*
1247	 * Don't offload if the peer requested a TCP option that's not known to
1248	 * the silicon.
1249	 */
1250	if (cpl->tcpopt.unknown)
1251		REJECT_PASS_ACCEPT();
1252
1253	if (inc.inc_flags & INC_ISIPV6) {
1254
1255		/* Don't offload if the ifcap isn't enabled */
1256		if ((ifp->if_capenable & IFCAP_TOE6) == 0)
1257			REJECT_PASS_ACCEPT();
1258
1259		/*
1260		 * SYN must be directed to an IP6 address on this ifnet.  This
1261		 * is more restrictive than in6_localip.
1262		 */
1263		if (!in6_ifhasaddr(ifp, &inc.inc6_laddr))
1264			REJECT_PASS_ACCEPT();
1265
1266		ntids = 2;
1267	} else {
1268
1269		/* Don't offload if the ifcap isn't enabled */
1270		if ((ifp->if_capenable & IFCAP_TOE4) == 0)
1271			REJECT_PASS_ACCEPT();
1272
1273		/*
1274		 * SYN must be directed to an IP address on this ifnet.  This
1275		 * is more restrictive than in_localip.
1276		 */
1277		if (!in_ifhasaddr(ifp, inc.inc_laddr))
1278			REJECT_PASS_ACCEPT();
1279
1280		ntids = 1;
1281	}
1282
1283	/*
1284	 * Don't offload if the ifnet that the SYN came in on is not in the same
1285	 * vnet as the listening socket.
1286	 */
1287	if (lctx->vnet != ifp->if_vnet)
1288		REJECT_PASS_ACCEPT();
1289
1290	e = get_l2te_for_nexthop(pi, ifp, &inc);
1291	if (e == NULL)
1292		REJECT_PASS_ACCEPT();
1293
1294	synqe = mbuf_to_synqe(m);
1295	if (synqe == NULL)
1296		REJECT_PASS_ACCEPT();
1297
1298	wr = alloc_wrqe(is_t4(sc) ? sizeof(struct cpl_pass_accept_rpl) :
1299	    sizeof(struct cpl_t5_pass_accept_rpl), &sc->sge.ctrlq[pi->port_id]);
1300	if (wr == NULL)
1301		REJECT_PASS_ACCEPT();
1302	rpl = wrtod(wr);
1303
1304	INP_INFO_RLOCK(&V_tcbinfo);	/* for 4-tuple check */
1305
1306	/* Don't offload if the 4-tuple is already in use */
1307	if (toe_4tuple_check(&inc, &th, ifp) != 0) {
1308		INP_INFO_RUNLOCK(&V_tcbinfo);
1309		free(wr, M_CXGBE);
1310		REJECT_PASS_ACCEPT();
1311	}
1312	INP_INFO_RUNLOCK(&V_tcbinfo);
1313
1314	inp = lctx->inp;		/* listening socket, not owned by TOE */
1315	INP_WLOCK(inp);
1316
1317	/* Don't offload if the listening socket has closed */
1318	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
1319		/*
1320		 * The listening socket has closed.  The reply from the TOE to
1321		 * our CPL_CLOSE_LISTSRV_REQ will ultimately release all
1322		 * resources tied to this listen context.
1323		 */
1324		INP_WUNLOCK(inp);
1325		free(wr, M_CXGBE);
1326		REJECT_PASS_ACCEPT();
1327	}
1328	so = inp->inp_socket;
1329
1330	mtu_idx = find_best_mtu_idx(sc, &inc, be16toh(cpl->tcpopt.mss));
1331	rscale = cpl->tcpopt.wsf && V_tcp_do_rfc1323 ? select_rcv_wscale() : 0;
1332	SOCKBUF_LOCK(&so->so_rcv);
1333	/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
1334	rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ);
1335	SOCKBUF_UNLOCK(&so->so_rcv);
1336
1337	save_qids_in_mbuf(m, vi);
1338	get_qids_from_mbuf(m, NULL, &rxqid);
1339
1340	if (is_t4(sc))
1341		INIT_TP_WR_MIT_CPL(rpl, CPL_PASS_ACCEPT_RPL, tid);
1342	else {
1343		struct cpl_t5_pass_accept_rpl *rpl5 = (void *)rpl;
1344
1345		INIT_TP_WR_MIT_CPL(rpl5, CPL_PASS_ACCEPT_RPL, tid);
1346	}
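	/*
	 * Use the TCP DDP (direct data placement) ULP mode for this connection
	 * if DDP is enabled (tt.ddp) and the socket hasn't opted out with
	 * SO_NO_DDP.
	 */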
1347	if (sc->tt.ddp && (so->so_options & SO_NO_DDP) == 0) {
1348		ulp_mode = ULP_MODE_TCPDDP;
1349		synqe->flags |= TPF_SYNQE_TCPDDP;
1350	} else
1351		ulp_mode = ULP_MODE_NONE;
1352	rpl->opt0 = calc_opt0(so, vi, e, mtu_idx, rscale, rx_credits, ulp_mode);
1353	rpl->opt2 = calc_opt2p(sc, pi, rxqid, &cpl->tcpopt, &th, ulp_mode);
1354
1355	synqe->tid = tid;
1356	synqe->lctx = lctx;
1357	synqe->syn = m;
1358	m = NULL;
1359	refcount_init(&synqe->refcnt, 1);	/* 1 means extra hold */
1360	synqe->l2e_idx = e->idx;
1361	synqe->rcv_bufsize = rx_credits;
1362	atomic_store_rel_ptr(&synqe->wr, (uintptr_t)wr);
1363
1364	insert_tid(sc, tid, synqe, ntids);
1365	TAILQ_INSERT_TAIL(&lctx->synq, synqe, link);
1366	hold_synqe(synqe);	/* hold for the duration it's in the synq */
1367	hold_lctx(lctx);	/* A synqe on the list has a ref on its lctx */
1368
1369	/*
1370	 * If all goes well t4_syncache_respond will get called during
1371	 * syncache_add.  Note that syncache_add releases the pcb lock.
1372	 */
1373	toe_syncache_add(&inc, &to, &th, inp, tod, synqe);
1374	INP_UNLOCK_ASSERT(inp);	/* ok to assert, we have a ref on the inp */
1375
1376	/*
1377	 * If we replied during syncache_add (synqe->wr has been consumed),
1378	 * good.  Otherwise, set it to 0 so that further syncache_respond
1379	 * attempts by the kernel will be ignored.
1380	 */
1381	if (atomic_cmpset_ptr(&synqe->wr, (uintptr_t)wr, 0)) {
1382
1383		/*
1384		 * syncache may or may not have a hold on the synqe, which may
1385		 * or may not be stashed in the original SYN mbuf passed to us.
1386		 * Just copy it over instead of dealing with all possibilities.
1387		 */
1388		m = m_dup(synqe->syn, M_NOWAIT);
1389		if (m)
1390			m->m_pkthdr.rcvif = hw_ifp;
1391
1392		remove_tid(sc, synqe->tid, ntids);
1393		free(wr, M_CXGBE);
1394
1395		/* Yank the synqe out of the lctx synq. */
1396		INP_WLOCK(inp);
1397		TAILQ_REMOVE(&lctx->synq, synqe, link);
1398		release_synqe(synqe);	/* removed from synq list */
1399		inp = release_lctx(sc, lctx);
1400		if (inp)
1401			INP_WUNLOCK(inp);
1402
1403		release_synqe(synqe);	/* extra hold */
1404		REJECT_PASS_ACCEPT();
1405	}
1406
1407	CTR5(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p, synqe %p, SYNACK",
1408	    __func__, stid, tid, lctx, synqe);
1409
1410	INP_WLOCK(inp);
1411	synqe->flags |= TPF_SYNQE_HAS_L2TE;
1412	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
1413		/*
1414		 * Listening socket closed but tod_listen_stop did not abort
1415		 * this tid because there was no L2T entry for the tid at that
1416		 * time.  Abort it now.  The reply to the abort will clean up.
1417		 */
1418		CTR6(KTR_CXGBE,
1419		    "%s: stid %u, tid %u, lctx %p, synqe %p (0x%x), ABORT",
1420		    __func__, stid, tid, lctx, synqe, synqe->flags);
1421		if (!(synqe->flags & TPF_SYNQE_EXPANDED))
1422			send_reset_synqe(tod, synqe);
1423		INP_WUNLOCK(inp);
1424		CURVNET_RESTORE();
1425
1426		release_synqe(synqe);	/* extra hold */
1427		return (__LINE__);
1428	}
1429	INP_WUNLOCK(inp);
1430	CURVNET_RESTORE();
1431
1432	release_synqe(synqe);	/* extra hold */
1433	return (0);
1434reject:
1435	CURVNET_RESTORE();
1436	CTR4(KTR_CXGBE, "%s: stid %u, tid %u, REJECT (%d)", __func__, stid, tid,
1437	    reject_reason);
1438
1439	if (e)
1440		t4_l2t_release(e);
1441	release_tid(sc, tid, lctx->ctrlq);
1442
1443	if (__predict_true(m != NULL)) {
1444		m_adj(m, sizeof(*cpl));
1445		m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID |
1446		    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
1447		m->m_pkthdr.csum_data = 0xffff;
1448		hw_ifp->if_input(hw_ifp, m);
1449	}
1450
1451	return (reject_reason);
1452}
1453
1454static void
1455synqe_to_protohdrs(struct adapter *sc, struct synq_entry *synqe,
1456    const struct cpl_pass_establish *cpl, struct in_conninfo *inc,
1457    struct tcphdr *th, struct tcpopt *to)
1458{
1459	uint16_t tcp_opt = be16toh(cpl->tcp_opt);
1460
1461	/* start off with the original SYN */
1462	pass_accept_req_to_protohdrs(sc, synqe->syn, inc, th);
1463
1464	/* modify parts to make it look like the ACK to our SYN|ACK */
1465	th->th_flags = TH_ACK;
1466	th->th_ack = synqe->iss + 1;
1467	th->th_seq = be32toh(cpl->rcv_isn);
1468	bzero(to, sizeof(*to));
1469	if (G_TCPOPT_TSTAMP(tcp_opt)) {
1470		to->to_flags |= TOF_TS;
1471		to->to_tsecr = synqe->ts;
1472	}
1473}
1474
1475static int
1476do_pass_establish(struct sge_iq *iq, const struct rss_header *rss,
1477    struct mbuf *m)
1478{
1479	struct adapter *sc = iq->adapter;
1480	struct vi_info *vi;
1481	struct ifnet *ifp;
1482	const struct cpl_pass_establish *cpl = (const void *)(rss + 1);
1483#if defined(KTR) || defined(INVARIANTS)
1484	unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
1485#endif
1486	unsigned int tid = GET_TID(cpl);
1487	struct synq_entry *synqe = lookup_tid(sc, tid);
1488	struct listen_ctx *lctx = synqe->lctx;
1489	struct inpcb *inp = lctx->inp, *new_inp;
1490	struct socket *so;
1491	struct tcphdr th;
1492	struct tcpopt to;
1493	struct in_conninfo inc;
1494	struct toepcb *toep;
1495	u_int txqid, rxqid;
1496#ifdef INVARIANTS
1497	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
1498#endif
1499
1500	KASSERT(opcode == CPL_PASS_ESTABLISH,
1501	    ("%s: unexpected opcode 0x%x", __func__, opcode));
1502	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
1503	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
1504	KASSERT(synqe->flags & TPF_SYNQE,
1505	    ("%s: tid %u (ctx %p) not a synqe", __func__, tid, synqe));
1506
1507	CURVNET_SET(lctx->vnet);
1508	INP_INFO_RLOCK(&V_tcbinfo);	/* for syncache_expand */
1509	INP_WLOCK(inp);
1510
1511	CTR6(KTR_CXGBE,
1512	    "%s: stid %u, tid %u, synqe %p (0x%x), inp_flags 0x%x",
1513	    __func__, stid, tid, synqe, synqe->flags, inp->inp_flags);
1514
1515	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
1516
1517		if (synqe->flags & TPF_SYNQE_HAS_L2TE) {
1518			KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN,
1519			    ("%s: listen socket closed but tid %u not aborted.",
1520			    __func__, tid));
1521		}
1522
1523		INP_WUNLOCK(inp);
1524		INP_INFO_RUNLOCK(&V_tcbinfo);
1525		CURVNET_RESTORE();
1526		return (0);
1527	}
1528
1529	ifp = synqe->syn->m_pkthdr.rcvif;
1530	vi = ifp->if_softc;
1531	KASSERT(vi->pi->adapter == sc,
1532	    ("%s: vi %p, sc %p mismatch", __func__, vi, sc));
1533
1534	get_qids_from_mbuf(synqe->syn, &txqid, &rxqid);
1535	KASSERT(rxqid == iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0],
1536	    ("%s: CPL arrived on unexpected rxq.  %d %d", __func__, rxqid,
1537	    (int)(iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0])));
1538
1539	toep = alloc_toepcb(vi, txqid, rxqid, M_NOWAIT);
1540	if (toep == NULL) {
1541reset:
1542		/*
1543		 * The reply to this abort will perform final cleanup.  There is
1544		 * no need to check for HAS_L2TE here.  We can be here only if
1545		 * we responded to the PASS_ACCEPT_REQ, and our response had the
1546		 * L2T idx.
1547		 */
1548		send_reset_synqe(TOEDEV(ifp), synqe);
1549		INP_WUNLOCK(inp);
1550		INP_INFO_RUNLOCK(&V_tcbinfo);
1551		CURVNET_RESTORE();
1552		return (0);
1553	}
1554	toep->tid = tid;
1555	toep->l2te = &sc->l2t->l2tab[synqe->l2e_idx];
1556	if (synqe->flags & TPF_SYNQE_TCPDDP)
1557		set_tcpddp_ulp_mode(toep);
1558	else
1559		toep->ulp_mode = ULP_MODE_NONE;
1560	/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
1561	toep->rx_credits = synqe->rcv_bufsize;
1562
1563	so = inp->inp_socket;
1564	KASSERT(so != NULL, ("%s: socket is NULL", __func__));
1565
1566	/* Come up with something that syncache_expand should be ok with. */
1567	synqe_to_protohdrs(sc, synqe, cpl, &inc, &th, &to);
1568
1569	/*
1570	 * No more need for anything in the mbuf that carried the
1571	 * CPL_PASS_ACCEPT_REQ.  Drop the CPL_PASS_ESTABLISH and toep pointer
1572	 * there.  XXX: bad form but I don't want to increase the size of synqe.
1573	 */
1574	m = synqe->syn;
1575	KASSERT(sizeof(*cpl) + sizeof(toep) <= m->m_len,
1576	    ("%s: no room in mbuf %p (m_len %d)", __func__, m, m->m_len));
1577	bcopy(cpl, mtod(m, void *), sizeof(*cpl));
1578	*(struct toepcb **)(mtod(m, struct cpl_pass_establish *) + 1) = toep;
1579
1580	if (!toe_syncache_expand(&inc, &to, &th, &so) || so == NULL) {
1581		free_toepcb(toep);
1582		goto reset;
1583	}
1584
1585	/* New connection inpcb is already locked by syncache_expand(). */
1586	new_inp = sotoinpcb(so);
1587	INP_WLOCK_ASSERT(new_inp);
1588	MPASS(so->so_vnet == lctx->vnet);
1589	toep->vnet = lctx->vnet;
1590	if (inc.inc_flags & INC_ISIPV6)
1591		toep->ce = hold_lip(sc->tom_softc, &inc.inc6_laddr, lctx->ce);
1592
1593	/*
1594	 * This is for the unlikely case where the syncache entry that we added
1595	 * has been evicted from the syncache, but the syncache_expand above
1596	 * works because of syncookies.
1597	 *
1598	 * XXX: we've held the tcbinfo lock throughout so there's no risk of
1599	 * anyone accept'ing a connection before we've installed our hooks, but
1600	 * this somewhat defeats the purpose of having a tod_offload_socket :-(
1601	 */
1602	if (__predict_false(!(synqe->flags & TPF_SYNQE_EXPANDED))) {
1603		tcp_timer_activate(intotcpcb(new_inp), TT_KEEP, 0);
1604		t4_offload_socket(TOEDEV(ifp), synqe, so);
1605	}
1606
1607	INP_WUNLOCK(new_inp);
1608
1609	/* Done with the synqe */
1610	TAILQ_REMOVE(&lctx->synq, synqe, link);
1611	inp = release_lctx(sc, lctx);
1612	if (inp != NULL)
1613		INP_WUNLOCK(inp);
1614	INP_INFO_RUNLOCK(&V_tcbinfo);
1615	CURVNET_RESTORE();
1616	release_synqe(synqe);
1617
1618	return (0);
1619}
1620
1621void
1622t4_init_listen_cpl_handlers(void)
1623{
1624
1625	t4_register_cpl_handler(CPL_PASS_OPEN_RPL, do_pass_open_rpl);
1626	t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl);
1627	t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
1628	t4_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
1629}
1630
1631void
1632t4_uninit_listen_cpl_handlers(void)
1633{
1634
1635	t4_register_cpl_handler(CPL_PASS_OPEN_RPL, NULL);
1636	t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, NULL);
1637	t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, NULL);
1638	t4_register_cpl_handler(CPL_PASS_ESTABLISH, NULL);
1639}
1640#endif
1641