t4_listen.c revision 331722
1/*- 2 * Copyright (c) 2012 Chelsio Communications, Inc. 3 * All rights reserved. 4 * Written by: Navdeep Parhar <np@FreeBSD.org> 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 * SUCH DAMAGE. 
26 */ 27 28#include <sys/cdefs.h> 29__FBSDID("$FreeBSD: stable/11/sys/dev/cxgbe/tom/t4_listen.c 331722 2018-03-29 02:50:57Z eadler $"); 30 31#include "opt_inet.h" 32#include "opt_inet6.h" 33 34#ifdef TCP_OFFLOAD 35#include <sys/param.h> 36#include <sys/types.h> 37#include <sys/kernel.h> 38#include <sys/ktr.h> 39#include <sys/module.h> 40#include <sys/protosw.h> 41#include <sys/refcount.h> 42#include <sys/domain.h> 43#include <sys/fnv_hash.h> 44#include <sys/socket.h> 45#include <sys/socketvar.h> 46#include <net/ethernet.h> 47#include <net/if.h> 48#include <net/if_types.h> 49#include <net/if_vlan_var.h> 50#include <net/route.h> 51#include <netinet/in.h> 52#include <netinet/in_fib.h> 53#include <netinet/in_pcb.h> 54#include <netinet/ip.h> 55#include <netinet/ip6.h> 56#include <netinet6/in6_fib.h> 57#include <netinet6/scope6_var.h> 58#include <netinet/tcp_timer.h> 59#define TCPSTATES 60#include <netinet/tcp_fsm.h> 61#include <netinet/tcp_var.h> 62#include <netinet/toecore.h> 63 64#include "common/common.h" 65#include "common/t4_msg.h" 66#include "common/t4_regs.h" 67#include "tom/t4_tom_l2t.h" 68#include "tom/t4_tom.h" 69 70/* stid services */ 71static int alloc_stid(struct adapter *, struct listen_ctx *, int); 72static struct listen_ctx *lookup_stid(struct adapter *, int); 73static void free_stid(struct adapter *, struct listen_ctx *); 74 75/* lctx services */ 76static struct listen_ctx *alloc_lctx(struct adapter *, struct inpcb *, 77 struct vi_info *); 78static int free_lctx(struct adapter *, struct listen_ctx *); 79static void hold_lctx(struct listen_ctx *); 80static void listen_hash_add(struct adapter *, struct listen_ctx *); 81static struct listen_ctx *listen_hash_find(struct adapter *, struct inpcb *); 82static struct listen_ctx *listen_hash_del(struct adapter *, struct inpcb *); 83static struct inpcb *release_lctx(struct adapter *, struct listen_ctx *); 84 85static inline void save_qids_in_mbuf(struct mbuf *, struct vi_info *); 86static inline void 
get_qids_from_mbuf(struct mbuf *m, int *, int *); 87static void send_reset_synqe(struct toedev *, struct synq_entry *); 88 89static int 90alloc_stid(struct adapter *sc, struct listen_ctx *lctx, int isipv6) 91{ 92 struct tid_info *t = &sc->tids; 93 u_int stid, n, f, mask; 94 struct stid_region *sr = &lctx->stid_region; 95 96 /* 97 * An IPv6 server needs 2 naturally aligned stids (1 stid = 4 cells) in 98 * the TCAM. The start of the stid region is properly aligned (the chip 99 * requires each region to be 128-cell aligned). 100 */ 101 n = isipv6 ? 2 : 1; 102 mask = n - 1; 103 KASSERT((t->stid_base & mask) == 0 && (t->nstids & mask) == 0, 104 ("%s: stid region (%u, %u) not properly aligned. n = %u", 105 __func__, t->stid_base, t->nstids, n)); 106 107 mtx_lock(&t->stid_lock); 108 if (n > t->nstids - t->stids_in_use) { 109 mtx_unlock(&t->stid_lock); 110 return (-1); 111 } 112 113 if (t->nstids_free_head >= n) { 114 /* 115 * This allocation will definitely succeed because the region 116 * starts at a good alignment and we just checked we have enough 117 * stids free. 118 */ 119 f = t->nstids_free_head & mask; 120 t->nstids_free_head -= n + f; 121 stid = t->nstids_free_head; 122 TAILQ_INSERT_HEAD(&t->stids, sr, link); 123 } else { 124 struct stid_region *s; 125 126 stid = t->nstids_free_head; 127 TAILQ_FOREACH(s, &t->stids, link) { 128 stid += s->used + s->free; 129 f = stid & mask; 130 if (s->free >= n + f) { 131 stid -= n + f; 132 s->free -= n + f; 133 TAILQ_INSERT_AFTER(&t->stids, s, sr, link); 134 goto allocated; 135 } 136 } 137 138 if (__predict_false(stid != t->nstids)) { 139 panic("%s: stids TAILQ (%p) corrupt." 
140 " At %d instead of %d at the end of the queue.", 141 __func__, &t->stids, stid, t->nstids); 142 } 143 144 mtx_unlock(&t->stid_lock); 145 return (-1); 146 } 147 148allocated: 149 sr->used = n; 150 sr->free = f; 151 t->stids_in_use += n; 152 t->stid_tab[stid] = lctx; 153 mtx_unlock(&t->stid_lock); 154 155 KASSERT(((stid + t->stid_base) & mask) == 0, 156 ("%s: EDOOFUS.", __func__)); 157 return (stid + t->stid_base); 158} 159 160static struct listen_ctx * 161lookup_stid(struct adapter *sc, int stid) 162{ 163 struct tid_info *t = &sc->tids; 164 165 return (t->stid_tab[stid - t->stid_base]); 166} 167 168static void 169free_stid(struct adapter *sc, struct listen_ctx *lctx) 170{ 171 struct tid_info *t = &sc->tids; 172 struct stid_region *sr = &lctx->stid_region; 173 struct stid_region *s; 174 175 KASSERT(sr->used > 0, ("%s: nonsense free (%d)", __func__, sr->used)); 176 177 mtx_lock(&t->stid_lock); 178 s = TAILQ_PREV(sr, stid_head, link); 179 if (s != NULL) 180 s->free += sr->used + sr->free; 181 else 182 t->nstids_free_head += sr->used + sr->free; 183 KASSERT(t->stids_in_use >= sr->used, 184 ("%s: stids_in_use (%u) < stids being freed (%u)", __func__, 185 t->stids_in_use, sr->used)); 186 t->stids_in_use -= sr->used; 187 TAILQ_REMOVE(&t->stids, sr, link); 188 mtx_unlock(&t->stid_lock); 189} 190 191static struct listen_ctx * 192alloc_lctx(struct adapter *sc, struct inpcb *inp, struct vi_info *vi) 193{ 194 struct listen_ctx *lctx; 195 196 INP_WLOCK_ASSERT(inp); 197 198 lctx = malloc(sizeof(struct listen_ctx), M_CXGBE, M_NOWAIT | M_ZERO); 199 if (lctx == NULL) 200 return (NULL); 201 202 lctx->stid = alloc_stid(sc, lctx, inp->inp_vflag & INP_IPV6); 203 if (lctx->stid < 0) { 204 free(lctx, M_CXGBE); 205 return (NULL); 206 } 207 208 if (inp->inp_vflag & INP_IPV6 && 209 !IN6_ARE_ADDR_EQUAL(&in6addr_any, &inp->in6p_laddr)) { 210 struct tom_data *td = sc->tom_softc; 211 212 lctx->ce = hold_lip(td, &inp->in6p_laddr, NULL); 213 if (lctx->ce == NULL) { 214 free(lctx, M_CXGBE); 215 
return (NULL); 216 } 217 } 218 219 lctx->ctrlq = &sc->sge.ctrlq[vi->pi->port_id]; 220 lctx->ofld_rxq = &sc->sge.ofld_rxq[vi->first_ofld_rxq]; 221 refcount_init(&lctx->refcount, 1); 222 TAILQ_INIT(&lctx->synq); 223 224 lctx->inp = inp; 225 lctx->vnet = inp->inp_socket->so_vnet; 226 in_pcbref(inp); 227 228 return (lctx); 229} 230 231/* Don't call this directly, use release_lctx instead */ 232static int 233free_lctx(struct adapter *sc, struct listen_ctx *lctx) 234{ 235 struct inpcb *inp = lctx->inp; 236 struct tom_data *td = sc->tom_softc; 237 238 INP_WLOCK_ASSERT(inp); 239 KASSERT(lctx->refcount == 0, 240 ("%s: refcount %d", __func__, lctx->refcount)); 241 KASSERT(TAILQ_EMPTY(&lctx->synq), 242 ("%s: synq not empty.", __func__)); 243 KASSERT(lctx->stid >= 0, ("%s: bad stid %d.", __func__, lctx->stid)); 244 245 CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, inp %p", 246 __func__, lctx->stid, lctx, lctx->inp); 247 248 if (lctx->ce) 249 release_lip(td, lctx->ce); 250 free_stid(sc, lctx); 251 free(lctx, M_CXGBE); 252 253 return (in_pcbrele_wlocked(inp)); 254} 255 256static void 257hold_lctx(struct listen_ctx *lctx) 258{ 259 260 refcount_acquire(&lctx->refcount); 261} 262 263static inline uint32_t 264listen_hashfn(void *key, u_long mask) 265{ 266 267 return (fnv_32_buf(&key, sizeof(key), FNV1_32_INIT) & mask); 268} 269 270/* 271 * Add a listen_ctx entry to the listen hash table. 272 */ 273static void 274listen_hash_add(struct adapter *sc, struct listen_ctx *lctx) 275{ 276 struct tom_data *td = sc->tom_softc; 277 int bucket = listen_hashfn(lctx->inp, td->listen_mask); 278 279 mtx_lock(&td->lctx_hash_lock); 280 LIST_INSERT_HEAD(&td->listen_hash[bucket], lctx, link); 281 td->lctx_count++; 282 mtx_unlock(&td->lctx_hash_lock); 283} 284 285/* 286 * Look for the listening socket's context entry in the hash and return it. 
 */
static struct listen_ctx *
listen_hash_find(struct adapter *sc, struct inpcb *inp)
{
        struct tom_data *td = sc->tom_softc;
        int bucket = listen_hashfn(inp, td->listen_mask);
        struct listen_ctx *lctx;

        mtx_lock(&td->lctx_hash_lock);
        LIST_FOREACH(lctx, &td->listen_hash[bucket], link) {
                if (lctx->inp == inp)
                        break;
        }
        mtx_unlock(&td->lctx_hash_lock);

        /* NULL if the loop ran off the end of the bucket without a match. */
        return (lctx);
}

/*
 * Removes the listen_ctx structure for inp from the hash and returns it.
 */
static struct listen_ctx *
listen_hash_del(struct adapter *sc, struct inpcb *inp)
{
        struct tom_data *td = sc->tom_softc;
        int bucket = listen_hashfn(inp, td->listen_mask);
        struct listen_ctx *lctx, *l;

        mtx_lock(&td->lctx_hash_lock);
        LIST_FOREACH_SAFE(lctx, &td->listen_hash[bucket], link, l) {
                if (lctx->inp == inp) {
                        LIST_REMOVE(lctx, link);
                        td->lctx_count--;
                        break;
                }
        }
        mtx_unlock(&td->lctx_hash_lock);

        return (lctx);
}

/*
 * Releases a hold on the lctx.  Must be called with the listening socket's inp
 * locked.  The inp may be freed by this function and it returns NULL to
 * indicate this.
 */
static struct inpcb *
release_lctx(struct adapter *sc, struct listen_ctx *lctx)
{
        struct inpcb *inp = lctx->inp;
        int inp_freed = 0;

        INP_WLOCK_ASSERT(inp);
        if (refcount_release(&lctx->refcount))
                inp_freed = free_lctx(sc, lctx);

        return (inp_freed ? NULL : inp);
}

/*
 * Abort an embryonic (synq) connection: send a single work request carrying
 * a FW flowc (required before any other WR on the tid) immediately followed
 * by a CPL_ABORT_REQ asking the chip to send an RST.  No-op if an abort is
 * already in flight for this synqe.
 */
static void
send_reset_synqe(struct toedev *tod, struct synq_entry *synqe)
{
        struct adapter *sc = tod->tod_softc;
        struct mbuf *m = synqe->syn;
        struct ifnet *ifp = m->m_pkthdr.rcvif;
        struct vi_info *vi = ifp->if_softc;
        struct port_info *pi = vi->pi;
        struct l2t_entry *e = &sc->l2t->l2tab[synqe->l2e_idx];
        struct wrqe *wr;
        struct fw_flowc_wr *flowc;
        struct cpl_abort_req *req;
        int txqid, rxqid, flowclen;
        struct sge_wrq *ofld_txq;
        struct sge_ofld_rxq *ofld_rxq;
        const int nparams = 6;
        unsigned int pfvf = G_FW_VIID_PFN(vi->viid) << S_FW_VIID_PFN;

        INP_WLOCK_ASSERT(synqe->lctx->inp);

        CTR5(KTR_CXGBE, "%s: synqe %p (0x%x), tid %d%s",
            __func__, synqe, synqe->flags, synqe->tid,
            synqe->flags & TPF_ABORT_SHUTDOWN ?
            " (abort already in progress)" : "");
        if (synqe->flags & TPF_ABORT_SHUTDOWN)
                return; /* abort already in progress */
        synqe->flags |= TPF_ABORT_SHUTDOWN;

        get_qids_from_mbuf(m, &txqid, &rxqid);
        ofld_txq = &sc->sge.ofld_txq[txqid];
        ofld_rxq = &sc->sge.ofld_rxq[rxqid];

        /* The wrqe will have two WRs - a flowc followed by an abort_req */
        flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);

        wr = alloc_wrqe(roundup2(flowclen, EQ_ESIZE) + sizeof(*req), ofld_txq);
        if (wr == NULL) {
                /* XXX */
                panic("%s: allocation failure.", __func__);
        }
        flowc = wrtod(wr);
        /* abort_req starts at the next EQ descriptor boundary after flowc */
        req = (void *)((caddr_t)flowc + roundup2(flowclen, EQ_ESIZE));

        /* First the flowc ... */
        memset(flowc, 0, wr->wr_len);
        flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
            V_FW_FLOWC_WR_NPARAMS(nparams));
        flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
            V_FW_WR_FLOWID(synqe->tid));
        flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
        flowc->mnemval[0].val = htobe32(pfvf);
        flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
        flowc->mnemval[1].val = htobe32(pi->tx_chan);
        flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT;
        flowc->mnemval[2].val = htobe32(pi->tx_chan);
        flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID;
        flowc->mnemval[3].val = htobe32(ofld_rxq->iq.abs_id);
        flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDBUF;
        flowc->mnemval[4].val = htobe32(512);
        flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_MSS;
        flowc->mnemval[5].val = htobe32(512);
        synqe->flags |= TPF_FLOWC_WR_SENT;

        /* ... then ABORT request */
        INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, synqe->tid);
        req->rsvd0 = 0;     /* don't have a snd_nxt */
        req->rsvd1 = 1;     /* no data sent yet */
        req->cmd = CPL_ABORT_SEND_RST;

        t4_l2t_send(sc, wr, e);
}

/*
 * Send a CPL_PASS_OPEN_REQ for an IPv4 listener.  The reply arrives at
 * do_pass_open_rpl.  Wildcard peer address/port; SYNs are steered to the
 * lctx's offload rx queue.
 */
static int
create_server(struct adapter *sc, struct listen_ctx *lctx)
{
        struct wrqe *wr;
        struct cpl_pass_open_req *req;
        struct inpcb *inp = lctx->inp;

        wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
        if (wr == NULL) {
                log(LOG_ERR, "%s: allocation failure", __func__);
                return (ENOMEM);
        }
        req = wrtod(wr);

        INIT_TP_WR(req, 0);
        OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, lctx->stid));
        req->local_port = inp->inp_lport;
        req->peer_port = 0;
        req->local_ip = inp->inp_laddr.s_addr;
        req->peer_ip = 0;
        req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan));
        req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) |
            F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id));

        t4_wrq_tx(sc, wr);
        return (0);
}

/*
 * IPv6 variant of create_server: CPL_PASS_OPEN_REQ6 with the 128-bit local
 * address split into two 64-bit halves as stored in in6p_laddr.
 */
static int
create_server6(struct adapter *sc, struct listen_ctx *lctx)
{
        struct wrqe *wr;
        struct cpl_pass_open_req6 *req;
        struct inpcb *inp = lctx->inp;

        wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
        if (wr == NULL) {
                log(LOG_ERR, "%s: allocation failure", __func__);
                return (ENOMEM);
        }
        req = wrtod(wr);

        INIT_TP_WR(req, 0);
        OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ6, lctx->stid));
        req->local_port = inp->inp_lport;
        req->peer_port = 0;
        req->local_ip_hi = *(uint64_t *)&inp->in6p_laddr.s6_addr[0];
        req->local_ip_lo = *(uint64_t *)&inp->in6p_laddr.s6_addr[8];
        req->peer_ip_hi = 0;
        req->peer_ip_lo = 0;
        req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan));
        req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) |
            F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id));

        t4_wrq_tx(sc, wr);
        return (0);
}

/*
 * Ask the chip to stop the hardware listener.  The reply arrives at
 * do_close_server_rpl, which drops the lctx reference.
 */
static int
destroy_server(struct adapter *sc, struct listen_ctx *lctx)
{
        struct wrqe *wr;
        struct cpl_close_listsvr_req *req;

        wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
        if (wr == NULL) {
                /* XXX */
                panic("%s: allocation failure.", __func__);
        }
        req = wrtod(wr);

        INIT_TP_WR(req, 0);
        OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ,
            lctx->stid));
        req->reply_ctrl = htobe16(lctx->ofld_rxq->iq.abs_id);
        req->rsvd = htobe16(0);

        t4_wrq_tx(sc, wr);
        return (0);
}

/*
 * Start a listening server by sending a passive open request to HW.
 *
 * Can't take adapter lock here and access to sc->flags,
 * sc->offload_map, if_capenable are all race prone.
 */
int
t4_listen_start(struct toedev *tod, struct tcpcb *tp)
{
        struct adapter *sc = tod->tod_softc;
        struct vi_info *vi;
        struct port_info *pi;
        struct inpcb *inp = tp->t_inpcb;
        struct listen_ctx *lctx;
        int i, rc, v;

        INP_WLOCK_ASSERT(inp);

        /* Don't start a hardware listener for any loopback address. */
        if (inp->inp_vflag & INP_IPV6 && IN6_IS_ADDR_LOOPBACK(&inp->in6p_laddr))
                return (0);
        if (!(inp->inp_vflag & INP_IPV6) &&
            IN_LOOPBACK(ntohl(inp->inp_laddr.s_addr)))
                return (0);
#if 0
        ADAPTER_LOCK(sc);
        if (IS_BUSY(sc)) {
                log(LOG_ERR, "%s: listen request ignored, %s is busy",
                    __func__, device_get_nameunit(sc->dev));
                goto done;
        }

        KASSERT(uld_active(sc, ULD_TOM),
            ("%s: TOM not initialized", __func__));
#endif

        /*
         * Find an initialized VI with IFCAP_TOE (4 or 6).  We'll use the first
         * such VI's queues to send the passive open and receive the reply to
         * it.
         *
         * XXX: need a way to mark a port in use by offload.  if_cxgbe should
         * then reject any attempt to bring down such a port (and maybe reject
         * attempts to disable IFCAP_TOE on that port too?).
         */
        for_each_port(sc, i) {
                pi = sc->port[i];
                for_each_vi(pi, v, vi) {
                        if (vi->flags & VI_INIT_DONE &&
                            vi->ifp->if_capenable & IFCAP_TOE)
                                goto found;
                }
        }
        goto done;      /* no port that's UP with IFCAP_TOE enabled */
found:

        if (listen_hash_find(sc, inp) != NULL)
                goto done;      /* already setup */

        lctx = alloc_lctx(sc, inp, vi);
        if (lctx == NULL) {
                log(LOG_ERR,
                    "%s: listen request ignored, %s couldn't allocate lctx\n",
                    __func__, device_get_nameunit(sc->dev));
                goto done;
        }
        listen_hash_add(sc, lctx);

        CTR6(KTR_CXGBE, "%s: stid %u (%s), lctx %p, inp %p vflag 0x%x",
            __func__, lctx->stid, tcpstates[tp->t_state], lctx, inp,
            inp->inp_vflag);

        if (inp->inp_vflag & INP_IPV6)
                rc = create_server6(sc, lctx);
        else
                rc = create_server(sc, lctx);
        if (rc != 0) {
                log(LOG_ERR, "%s: %s failed to create hw listener: %d.\n",
                    __func__, device_get_nameunit(sc->dev), rc);
                (void) listen_hash_del(sc, inp);
                inp = release_lctx(sc, lctx);
                /* can't be freed, host stack has a reference */
                KASSERT(inp != NULL, ("%s: inp freed", __func__));
                goto done;
        }
        /* Cleared by do_pass_open_rpl when the chip acknowledges the open. */
        lctx->flags |= LCTX_RPL_PENDING;
done:
#if 0
        ADAPTER_UNLOCK(sc);
#endif
        return (0);
}

/*
 * Stop the hardware listener for this inp.  Returns ENOENT if there was no
 * hardware listener, EINPROGRESS if teardown is deferred to the pending
 * PASS_OPEN reply, 0 otherwise.
 */
int
t4_listen_stop(struct toedev *tod, struct tcpcb *tp)
{
        struct listen_ctx *lctx;
        struct adapter *sc = tod->tod_softc;
        struct inpcb *inp = tp->t_inpcb;
        struct synq_entry *synqe;

        INP_WLOCK_ASSERT(inp);

        lctx = listen_hash_del(sc, inp);
        if (lctx == NULL)
                return (ENOENT);        /* no hardware listener for this inp */

        CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, flags %x", __func__, lctx->stid,
            lctx, lctx->flags);

        /*
         * If the reply to the PASS_OPEN is still pending we'll wait for it to
         * arrive and clean up when it does.
         */
        if (lctx->flags & LCTX_RPL_PENDING) {
                KASSERT(TAILQ_EMPTY(&lctx->synq),
                    ("%s: synq not empty.", __func__));
                return (EINPROGRESS);
        }

        /*
         * The host stack will abort all the connections on the listening
         * socket's so_comp.  It doesn't know about the connections on the synq
         * so we need to take care of those.
         */
        TAILQ_FOREACH(synqe, &lctx->synq, link) {
                if (synqe->flags & TPF_SYNQE_HAS_L2TE)
                        send_reset_synqe(tod, synqe);
        }

        destroy_server(sc, lctx);
        return (0);
}

static inline void
hold_synqe(struct synq_entry *synqe)
{

        refcount_acquire(&synqe->refcnt);
}

/*
 * Drop a synqe reference.  On the last release the SYN mbuf is freed; the
 * synqe itself is freed only if it was malloc'ed (TPF_SYNQE_NEEDFREE) rather
 * than carved out of the SYN mbuf's trailing space by mbuf_to_synqe().
 */
static inline void
release_synqe(struct synq_entry *synqe)
{

        if (refcount_release(&synqe->refcnt)) {
                int needfree = synqe->flags & TPF_SYNQE_NEEDFREE;

                m_freem(synqe->syn);
                if (needfree)
                        free(synqe, M_CXGBE);
        }
}

/* toecore callback: syncache took a reference to our synqe. */
void
t4_syncache_added(struct toedev *tod __unused, void *arg)
{
        struct synq_entry *synqe = arg;

        hold_synqe(synqe);
}

/* toecore callback: syncache dropped its reference. */
void
t4_syncache_removed(struct toedev *tod __unused, void *arg)
{
        struct synq_entry *synqe = arg;

        release_synqe(synqe);
}

/*
 * toecore callback asking us to transmit the SYN|ACK.  The pass_accept_rpl
 * work request was pre-built and stashed in synqe->wr; send it at most once
 * (atomic swap with NULL) and return EALREADY on subsequent calls.  m is the
 * syncache's packet, used only to recover the iss and timestamp.
 */
int
t4_syncache_respond(struct toedev *tod, void *arg, struct mbuf *m)
{
        struct adapter *sc = tod->tod_softc;
        struct synq_entry *synqe = arg;
        struct wrqe *wr;
        struct l2t_entry *e;
        struct tcpopt to;
        struct ip *ip = mtod(m, struct ip *);
        struct tcphdr *th;

        wr = (struct wrqe *)atomic_readandclear_ptr(&synqe->wr);
        if (wr == NULL) {
                m_freem(m);
                return (EALREADY);
        }

        if (ip->ip_v == IPVERSION)
                th = (void *)(ip + 1);
        else
                th = (void *)((struct ip6_hdr *)ip + 1);
        bzero(&to, sizeof(to));
        tcp_dooptions(&to, (void *)(th + 1), (th->th_off << 2) - sizeof(*th),
            TO_SYN);

        /* save these for later */
        synqe->iss = be32toh(th->th_seq);
        synqe->ts = to.to_tsval;

        if (chip_id(sc) >= CHELSIO_T5) {
                struct cpl_t5_pass_accept_rpl *rpl5 = wrtod(wr);

                /* T5+ lets the driver pick the iss; hand it the stack's. */
                rpl5->iss = th->th_seq;
        }

        e = &sc->l2t->l2tab[synqe->l2e_idx];
        t4_l2t_send(sc, wr, e);

        m_freem(m);     /* don't need this any more */
        return (0);
}

/*
 * CPL handler for the reply to our CPL_PASS_OPEN_REQ(6).  Clears
 * LCTX_RPL_PENDING and reconciles the outcome with whatever happened to
 * the listening socket in the meantime.
 */
static int
do_pass_open_rpl(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
        struct adapter *sc = iq->adapter;
        const struct cpl_pass_open_rpl *cpl = (const void *)(rss + 1);
        int stid = GET_TID(cpl);
        unsigned int status = cpl->status;
        struct listen_ctx *lctx = lookup_stid(sc, stid);
        struct inpcb *inp = lctx->inp;
#ifdef INVARIANTS
        unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

        KASSERT(opcode == CPL_PASS_OPEN_RPL,
            ("%s: unexpected opcode 0x%x", __func__, opcode));
        KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
        KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));

        INP_WLOCK(inp);

        CTR4(KTR_CXGBE, "%s: stid %d, status %u, flags 0x%x",
            __func__, stid, status, lctx->flags);

        lctx->flags &= ~LCTX_RPL_PENDING;

        if (status != CPL_ERR_NONE)
                log(LOG_ERR, "listener (stid %u) failed: %d\n", stid, status);

#ifdef INVARIANTS
        /*
         * If the inp has been dropped (listening socket closed) then
         * listen_stop must have run and taken the inp out of the hash.
         */
        if (inp->inp_flags & INP_DROPPED) {
                KASSERT(listen_hash_del(sc, inp) == NULL,
                    ("%s: inp %p still in listen hash", __func__, inp));
        }
#endif

        /* Socket closed and the hardware open failed: just drop our ref. */
        if (inp->inp_flags & INP_DROPPED && status != CPL_ERR_NONE) {
                if (release_lctx(sc, lctx) != NULL)
                        INP_WUNLOCK(inp);
                return (status);
        }

        /*
         * Listening socket stopped listening earlier and now the chip tells us
         * it has started the hardware listener.  Stop it; the lctx will be
         * released in do_close_server_rpl.
         */
        if (inp->inp_flags & INP_DROPPED) {
                destroy_server(sc, lctx);
                INP_WUNLOCK(inp);
                return (status);
        }

        /*
         * Failed to start hardware listener.  Take inp out of the hash and
         * release our reference on it.  An error message has been logged
         * already.
         */
        if (status != CPL_ERR_NONE) {
                listen_hash_del(sc, inp);
                if (release_lctx(sc, lctx) != NULL)
                        INP_WUNLOCK(inp);
                return (status);
        }

        /* hardware listener open for business */

        INP_WUNLOCK(inp);
        return (status);
}

/*
 * CPL handler for the reply to our CPL_CLOSE_LISTSRV_REQ.  On success,
 * drops the lctx reference that was kept for the in-flight close.
 */
static int
do_close_server_rpl(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
        struct adapter *sc = iq->adapter;
        const struct cpl_close_listsvr_rpl *cpl = (const void *)(rss + 1);
        int stid = GET_TID(cpl);
        unsigned int status = cpl->status;
        struct listen_ctx *lctx = lookup_stid(sc, stid);
        struct inpcb *inp = lctx->inp;
#ifdef INVARIANTS
        unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

        KASSERT(opcode == CPL_CLOSE_LISTSRV_RPL,
            ("%s: unexpected opcode 0x%x", __func__, opcode));
        KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
        KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));

        CTR3(KTR_CXGBE, "%s: stid %u, status %u", __func__, stid, status);

        if (status != CPL_ERR_NONE) {
                log(LOG_ERR, "%s: failed (%u) to close listener for stid %u\n",
                    __func__, status, stid);
                return (status);
        }

        INP_WLOCK(inp);
        inp = release_lctx(sc, lctx);
        if (inp != NULL)
                INP_WUNLOCK(inp);

        return (status);
}

/*
 * Tear down an embryonic connection: unlink the synqe from the lctx synq,
 * return the hw tids and L2T entry, and drop the synq-list reference.
 * Called with the inp write-locked; the lock is released here (possibly by
 * release_lctx freeing the inp).
 */
static void
done_with_synqe(struct adapter *sc, struct synq_entry *synqe)
{
        struct listen_ctx *lctx = synqe->lctx;
        struct inpcb *inp = lctx->inp;
        struct vi_info *vi = synqe->syn->m_pkthdr.rcvif->if_softc;
        struct l2t_entry *e = &sc->l2t->l2tab[synqe->l2e_idx];
        int ntids;

        INP_WLOCK_ASSERT(inp);
        /* IPv6 connections occupy 2 tid slots, IPv4 just 1. */
        ntids = inp->inp_vflag & INP_IPV6 ? 2 : 1;

        TAILQ_REMOVE(&lctx->synq, synqe, link);
        inp = release_lctx(sc, lctx);
        if (inp)
                INP_WUNLOCK(inp);
        remove_tid(sc, synqe->tid, ntids);
        release_tid(sc, synqe->tid, &sc->sge.ctrlq[vi->pi->port_id]);
        t4_l2t_release(e);
        release_synqe(synqe);   /* removed from synq list */
}

/*
 * CPL handler: peer (or chip) aborted an embryonic connection.  We must
 * always answer with a CPL_ABORT_RPL; we only tear the synqe down here if
 * we had not already started an abort ourselves.
 */
int
do_abort_req_synqe(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
        struct adapter *sc = iq->adapter;
        const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1);
        unsigned int tid = GET_TID(cpl);
        struct synq_entry *synqe = lookup_tid(sc, tid);
        struct listen_ctx *lctx = synqe->lctx;
        struct inpcb *inp = lctx->inp;
        int txqid;
        struct sge_wrq *ofld_txq;
#ifdef INVARIANTS
        unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

        KASSERT(opcode == CPL_ABORT_REQ_RSS,
            ("%s: unexpected opcode 0x%x", __func__, opcode));
        KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
        KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__));

        CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d",
            __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status);

        if (negative_advice(cpl->status))
                return (0);     /* Ignore negative advice */

        INP_WLOCK(inp);

        get_qids_from_mbuf(synqe->syn, &txqid, NULL);
        ofld_txq = &sc->sge.ofld_txq[txqid];

        /*
         * If we'd initiated an abort earlier the reply to it is responsible for
         * cleaning up resources.  Otherwise we tear everything down right here
         * right now.  We owe the T4 a CPL_ABORT_RPL no matter what.
         */
        if (synqe->flags & TPF_ABORT_SHUTDOWN) {
                INP_WUNLOCK(inp);
                goto done;
        }

        done_with_synqe(sc, synqe);
        /* inp lock released by done_with_synqe */
done:
        send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST);
        return (0);
}

/*
 * CPL handler: reply to an abort we sent (see send_reset_synqe).  Finishes
 * the teardown that was deferred to this reply.
 */
int
do_abort_rpl_synqe(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
        struct adapter *sc = iq->adapter;
        const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1);
        unsigned int tid = GET_TID(cpl);
        struct synq_entry *synqe = lookup_tid(sc, tid);
        struct listen_ctx *lctx = synqe->lctx;
        struct inpcb *inp = lctx->inp;
#ifdef INVARIANTS
        unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

        KASSERT(opcode == CPL_ABORT_RPL_RSS,
            ("%s: unexpected opcode 0x%x", __func__, opcode));
        KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
        KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__));

        CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d",
            __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status);

        INP_WLOCK(inp);
        KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN,
            ("%s: wasn't expecting abort reply for synqe %p (0x%x)",
            __func__, synqe, synqe->flags));

        done_with_synqe(sc, synqe);
        /* inp lock released by done_with_synqe */

        return (0);
}

/*
 * toecore callback: the syncache is expanding this embryonic connection
 * into a full socket.  Promote the synqe's toepcb onto the new socket and
 * switch the tid's context from synqe to toepcb.
 */
void
t4_offload_socket(struct toedev *tod, void *arg, struct socket *so)
{
        struct adapter *sc = tod->tod_softc;
        struct synq_entry *synqe = arg;
#ifdef INVARIANTS
        struct inpcb *inp = sotoinpcb(so);
#endif
        struct cpl_pass_establish *cpl = mtod(synqe->syn, void *);
        struct toepcb *toep = *(struct toepcb **)(cpl + 1);

        INP_INFO_RLOCK_ASSERT(&V_tcbinfo); /* prevents bad race with accept() */
        INP_WLOCK_ASSERT(inp);
        KASSERT(synqe->flags & TPF_SYNQE,
            ("%s: %p not a synq_entry?", __func__, arg));

        offload_socket(so, toep);
        make_established(toep, cpl->snd_isn, cpl->rcv_isn, cpl->tcp_opt);
        toep->flags |= TPF_CPL_PENDING;
        update_tid(sc, synqe->tid, toep);
        synqe->flags |= TPF_SYNQE_EXPANDED;
}

/*
 * Pick random offload tx/rx queues for this connection and stash their
 * VI-relative indices in the mbuf's flowid (txq in the high 16 bits).
 */
static inline void
save_qids_in_mbuf(struct mbuf *m, struct vi_info *vi)
{
        uint32_t txqid, rxqid;

        txqid = (arc4random() % vi->nofldtxq) + vi->first_ofld_txq;
        rxqid = (arc4random() % vi->nofldrxq) + vi->first_ofld_rxq;

        m->m_pkthdr.flowid = (txqid << 16) | (rxqid & 0xffff);
}

/* Inverse of save_qids_in_mbuf; either output pointer may be NULL. */
static inline void
get_qids_from_mbuf(struct mbuf *m, int *txqid, int *rxqid)
{

        if (txqid)
                *txqid = m->m_pkthdr.flowid >> 16;
        if (rxqid)
                *rxqid = m->m_pkthdr.flowid & 0xffff;
}

/*
 * Use the trailing space in the mbuf in which the PASS_ACCEPT_REQ arrived to
 * store some state temporarily.
 */
static struct synq_entry *
mbuf_to_synqe(struct mbuf *m)
{
        int len = roundup2(sizeof (struct synq_entry), 8);
        int tspace = M_TRAILINGSPACE(m);
        struct synq_entry *synqe = NULL;

        if (tspace < len) {
                /* Not enough room in the mbuf; fall back to malloc. */
                synqe = malloc(sizeof(*synqe), M_CXGBE, M_NOWAIT);
                if (synqe == NULL)
                        return (NULL);
                synqe->flags = TPF_SYNQE | TPF_SYNQE_NEEDFREE;
        } else {
                synqe = (void *)(m->m_data + m->m_len + tspace - len);
                synqe->flags = TPF_SYNQE;
        }

        return (synqe);
}

/* Convert the chip's tcp_options (from a PASS_ACCEPT_REQ) to a tcpopt. */
static void
t4opt_to_tcpopt(const struct tcp_options *t4opt, struct tcpopt *to)
{
        bzero(to, sizeof(*to));

        if (t4opt->mss) {
                to->to_flags |= TOF_MSS;
                to->to_mss = be16toh(t4opt->mss);
        }

        if (t4opt->wsf) {
                to->to_flags |= TOF_SCALE;
                to->to_wscale = t4opt->wsf;
        }

        if (t4opt->tstamp)
                to->to_flags |= TOF_TS;

        if (t4opt->sack)
                to->to_flags |= TOF_SACKPERM;
}

/*
 * Options2 for passive open.
 */
static uint32_t
calc_opt2p(struct adapter *sc, struct port_info *pi, int rxqid,
    const struct tcp_options *tcpopt, struct tcphdr *th, int ulp_mode)
{
        struct sge_ofld_rxq *ofld_rxq = &sc->sge.ofld_rxq[rxqid];
        uint32_t opt2;

        opt2 = V_TX_QUEUE(sc->params.tp.tx_modq[pi->tx_chan]) |
            F_RSS_QUEUE_VALID | V_RSS_QUEUE(ofld_rxq->iq.abs_id);

        /* Honor the peer's SYN options only if RFC 1323 is enabled. */
        if (V_tcp_do_rfc1323) {
                if (tcpopt->tstamp)
                        opt2 |= F_TSTAMPS_EN;
                if (tcpopt->sack)
                        opt2 |= F_SACK_EN;
                if (tcpopt->wsf <= 14)
                        opt2 |= F_WND_SCALE_EN;
        }

        if (V_tcp_do_ecn && th->th_flags & (TH_ECE | TH_CWR))
                opt2 |= F_CCTRL_ECN;

        /* RX_COALESCE is always a valid value (0 or M_RX_COALESCE). */
        if (is_t4(sc))
                opt2 |= F_RX_COALESCE_VALID;
        else {
                opt2 |= F_T5_OPT_2_VALID;
                opt2 |= F_T5_ISS;
        }
        if (sc->tt.rx_coalesce)
                opt2 |= V_RX_COALESCE(M_RX_COALESCE);

        /* -1 means "leave congestion control choice to the firmware". */
        if (sc->tt.cong_algorithm != -1)
                opt2 |= V_CONG_CNTRL(sc->tt.cong_algorithm & M_CONG_CNTRL);

#ifdef USE_DDP_RX_FLOW_CONTROL
        if (ulp_mode == ULP_MODE_TCPDDP)
                opt2 |= F_RX_FC_VALID | F_RX_FC_DDP;
#endif

        return htobe32(opt2);
}

/*
 * Extract the connection 4-tuple (into inc) and/or TCP header (into th)
 * from the packet headers the chip appended to a CPL_PASS_ACCEPT_REQ.
 * Header lengths are encoded differently on T6+.  Either output pointer
 * may be NULL.
 */
static void
pass_accept_req_to_protohdrs(struct adapter *sc, const struct mbuf *m,
    struct in_conninfo *inc, struct tcphdr *th)
{
        const struct cpl_pass_accept_req *cpl = mtod(m, const void *);
        const struct ether_header *eh;
        unsigned int hlen = be32toh(cpl->hdr_len);
        uintptr_t l3hdr;
        const struct tcphdr *tcp;

        eh = (const void *)(cpl + 1);
        if (chip_id(sc) >= CHELSIO_T6) {
                l3hdr = ((uintptr_t)eh + G_T6_ETH_HDR_LEN(hlen));
                tcp = (const void *)(l3hdr + G_T6_IP_HDR_LEN(hlen));
        } else {
                l3hdr = ((uintptr_t)eh + G_ETH_HDR_LEN(hlen));
                tcp = (const void *)(l3hdr + G_IP_HDR_LEN(hlen));
        }

        if (inc) {
                bzero(inc, sizeof(*inc));
                inc->inc_fport = tcp->th_sport;
                inc->inc_lport = tcp->th_dport;
                if (((struct ip *)l3hdr)->ip_v == IPVERSION) {
                        const struct ip *ip = (const void *)l3hdr;

                        inc->inc_faddr = ip->ip_src;
                        inc->inc_laddr = ip->ip_dst;
                } else {
                        const struct ip6_hdr *ip6 = (const void *)l3hdr;

                        inc->inc_flags |= INC_ISIPV6;
                        inc->inc6_faddr = ip6->ip6_src;
                        inc->inc6_laddr = ip6->ip6_dst;
                }
        }

        if (th) {
                bcopy(tcp, th, sizeof(*th));
                tcp_fields_to_host(th);         /* just like tcp_input */
        }
}

/*
 * Resolve the L2 (next hop) entry to use when replying to the peer in inc.
 * The route back to the peer must go out via ifp, otherwise NULL is
 * returned.  Link-local IPv6 peers skip the FIB lookup entirely.
 * sin6 provides storage for both the IPv4 and IPv6 sockaddr cases.
 */
static struct l2t_entry *
get_l2te_for_nexthop(struct port_info *pi, struct ifnet *ifp,
    struct in_conninfo *inc)
{
        struct l2t_entry *e;
        struct sockaddr_in6 sin6;
        struct sockaddr *dst = (void *)&sin6;

        if (inc->inc_flags & INC_ISIPV6) {
                struct nhop6_basic nh6;

                bzero(dst, sizeof(struct sockaddr_in6));
                dst->sa_len = sizeof(struct sockaddr_in6);
                dst->sa_family = AF_INET6;

                if (IN6_IS_ADDR_LINKLOCAL(&inc->inc6_laddr)) {
                        /* no need for route lookup */
                        e = t4_l2t_get(pi, ifp, dst);
                        return (e);
                }

                if (fib6_lookup_nh_basic(RT_DEFAULT_FIB, &inc->inc6_faddr,
                    0, 0, 0, &nh6) != 0)
                        return (NULL);
                if (nh6.nh_ifp != ifp)
                        return (NULL);
                ((struct sockaddr_in6 *)dst)->sin6_addr = nh6.nh_addr;
        } else {
                struct nhop4_basic nh4;

                dst->sa_len = sizeof(struct sockaddr_in);
                dst->sa_family = AF_INET;

                if (fib4_lookup_nh_basic(RT_DEFAULT_FIB, inc->inc_faddr, 0, 0,
                    &nh4) != 0)
                        return (NULL);
                if (nh4.nh_ifp != ifp)
                        return (NULL);
                ((struct sockaddr_in *)dst)->sin_addr = nh4.nh_addr;
        }

        e = t4_l2t_get(pi, ifp, dst);
        return (e);
}

/* Record the rejecting line number so the CTR in the reject path shows why. */
#define REJECT_PASS_ACCEPT()    do { \
        reject_reason = __LINE__; \
        goto reject; \
} while (0)

/*
 * The context associated with a tid entry via insert_tid could be a synq_entry
 * or a toepcb.
 * The only way CPL handlers can tell is via a bit in these flags.
 */
CTASSERT(offsetof(struct toepcb, flags) == offsetof(struct synq_entry, flags));

/*
 * Incoming SYN on a listening socket.
 *
 * XXX: Every use of ifp in this routine has a bad race with up/down, toe/-toe,
 * etc.
 */
static int
do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	struct toedev *tod;
	const struct cpl_pass_accept_req *cpl = mtod(m, const void *);
	struct cpl_pass_accept_rpl *rpl;
	struct wrqe *wr;
	unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
	unsigned int tid = GET_TID(cpl);
	struct listen_ctx *lctx = lookup_stid(sc, stid);
	struct inpcb *inp;
	struct socket *so;
	struct in_conninfo inc;
	struct tcphdr th;
	struct tcpopt to;
	struct port_info *pi;
	struct vi_info *vi;
	struct ifnet *hw_ifp, *ifp;
	struct l2t_entry *e = NULL;
	int rscale, mtu_idx, rx_credits, rxqid, ulp_mode;
	struct synq_entry *synqe = NULL;
	int reject_reason, v, ntids;
	uint16_t vid;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_PASS_ACCEPT_REQ,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));

	CTR4(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p", __func__, stid, tid,
	    lctx);

	/* Recover the 4-tuple, TCP header, and TCP options from the SYN. */
	pass_accept_req_to_protohdrs(sc, m, &inc, &th);
	t4opt_to_tcpopt(&cpl->tcpopt, &to);

	pi = sc->port[G_SYN_INTF(be16toh(cpl->l2info))];

	CURVNET_SET(lctx->vnet);

	/*
	 * Use the MAC index to lookup the associated VI.  If this SYN
	 * didn't match a perfect MAC filter, punt.
	 */
	if (!(be16toh(cpl->l2info) & F_SYN_XACT_MATCH)) {
		m_freem(m);
		m = NULL;
		REJECT_PASS_ACCEPT();
	}
	for_each_vi(pi, v, vi) {
		if (vi->xact_addr_filt == G_SYN_MAC_IDX(be16toh(cpl->l2info)))
			goto found;
	}
	/* No VI claims this MAC index; hand the SYN back to the stack. */
	m_freem(m);
	m = NULL;
	REJECT_PASS_ACCEPT();

found:
	hw_ifp = vi->ifp;	/* the (v)cxgbeX ifnet */
	m->m_pkthdr.rcvif = hw_ifp;
	tod = TOEDEV(hw_ifp);

	/*
	 * Figure out if there is a pseudo interface (vlan, lagg, etc.)
	 * involved.  Don't offload if the SYN had a VLAN tag and the vid
	 * doesn't match anything on this interface.
	 *
	 * XXX: lagg support, lagg + vlan support.
	 */
	vid = EVL_VLANOFTAG(be16toh(cpl->vlan));
	if (vid != 0xfff) {
		ifp = VLAN_DEVAT(hw_ifp, vid);
		if (ifp == NULL)
			REJECT_PASS_ACCEPT();
	} else
		ifp = hw_ifp;

	/*
	 * Don't offload if the peer requested a TCP option that's not known to
	 * the silicon.
	 */
	if (cpl->tcpopt.unknown)
		REJECT_PASS_ACCEPT();

	if (inc.inc_flags & INC_ISIPV6) {

		/* Don't offload if the ifcap isn't enabled */
		if ((ifp->if_capenable & IFCAP_TOE6) == 0)
			REJECT_PASS_ACCEPT();

		/*
		 * SYN must be directed to an IP6 address on this ifnet.  This
		 * is more restrictive than in6_localip.
		 */
		if (!in6_ifhasaddr(ifp, &inc.inc6_laddr))
			REJECT_PASS_ACCEPT();

		/* IPv6 connections consume two tid slots. */
		ntids = 2;
	} else {

		/* Don't offload if the ifcap isn't enabled */
		if ((ifp->if_capenable & IFCAP_TOE4) == 0)
			REJECT_PASS_ACCEPT();

		/*
		 * SYN must be directed to an IP address on this ifnet.  This
		 * is more restrictive than in_localip.
		 */
		if (!in_ifhasaddr(ifp, inc.inc_laddr))
			REJECT_PASS_ACCEPT();

		ntids = 1;
	}

	/*
	 * Don't offload if the ifnet that the SYN came in on is not in the same
	 * vnet as the listening socket.
	 */
	if (lctx->vnet != ifp->if_vnet)
		REJECT_PASS_ACCEPT();

	e = get_l2te_for_nexthop(pi, ifp, &inc);
	if (e == NULL)
		REJECT_PASS_ACCEPT();

	synqe = mbuf_to_synqe(m);
	if (synqe == NULL)
		REJECT_PASS_ACCEPT();

	/* T5+ uses a larger PASS_ACCEPT_RPL; allocate accordingly. */
	wr = alloc_wrqe(is_t4(sc) ? sizeof(struct cpl_pass_accept_rpl) :
	    sizeof(struct cpl_t5_pass_accept_rpl), &sc->sge.ctrlq[pi->port_id]);
	if (wr == NULL)
		REJECT_PASS_ACCEPT();
	rpl = wrtod(wr);

	INP_INFO_RLOCK(&V_tcbinfo);	/* for 4-tuple check */

	/* Don't offload if the 4-tuple is already in use */
	if (toe_4tuple_check(&inc, &th, ifp) != 0) {
		INP_INFO_RUNLOCK(&V_tcbinfo);
		free(wr, M_CXGBE);
		REJECT_PASS_ACCEPT();
	}
	INP_INFO_RUNLOCK(&V_tcbinfo);

	inp = lctx->inp;		/* listening socket, not owned by TOE */
	INP_WLOCK(inp);

	/* Don't offload if the listening socket has closed */
	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
		/*
		 * The listening socket has closed.  The reply from the TOE to
		 * our CPL_CLOSE_LISTSRV_REQ will ultimately release all
		 * resources tied to this listen context.
		 */
		INP_WUNLOCK(inp);
		free(wr, M_CXGBE);
		REJECT_PASS_ACCEPT();
	}
	so = inp->inp_socket;

	mtu_idx = find_best_mtu_idx(sc, &inc, be16toh(cpl->tcpopt.mss));
	rscale = cpl->tcpopt.wsf && V_tcp_do_rfc1323 ?
	    select_rcv_wscale() : 0;
	SOCKBUF_LOCK(&so->so_rcv);
	/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
	rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ);
	SOCKBUF_UNLOCK(&so->so_rcv);

	save_qids_in_mbuf(m, vi);
	get_qids_from_mbuf(m, NULL, &rxqid);

	if (is_t4(sc))
		INIT_TP_WR_MIT_CPL(rpl, CPL_PASS_ACCEPT_RPL, tid);
	else {
		struct cpl_t5_pass_accept_rpl *rpl5 = (void *)rpl;

		INIT_TP_WR_MIT_CPL(rpl5, CPL_PASS_ACCEPT_RPL, tid);
	}
	/* DDP unless the administrator disabled it (globally or per-socket). */
	if (sc->tt.ddp && (so->so_options & SO_NO_DDP) == 0) {
		ulp_mode = ULP_MODE_TCPDDP;
		synqe->flags |= TPF_SYNQE_TCPDDP;
	} else
		ulp_mode = ULP_MODE_NONE;
	rpl->opt0 = calc_opt0(so, vi, e, mtu_idx, rscale, rx_credits, ulp_mode);
	rpl->opt2 = calc_opt2p(sc, pi, rxqid, &cpl->tcpopt, &th, ulp_mode);

	synqe->tid = tid;
	synqe->lctx = lctx;
	synqe->syn = m;
	m = NULL;	/* ownership transferred to the synqe */
	refcount_init(&synqe->refcnt, 1);	/* 1 means extra hold */
	synqe->l2e_idx = e->idx;
	synqe->rcv_bufsize = rx_credits;
	atomic_store_rel_ptr(&synqe->wr, (uintptr_t)wr);

	insert_tid(sc, tid, synqe, ntids);
	TAILQ_INSERT_TAIL(&lctx->synq, synqe, link);
	hold_synqe(synqe);	/* hold for the duration it's in the synq */
	hold_lctx(lctx);	/* A synqe on the list has a ref on its lctx */

	/*
	 * If all goes well t4_syncache_respond will get called during
	 * syncache_add.  Note that syncache_add releases the pcb lock.
	 */
	toe_syncache_add(&inc, &to, &th, inp, tod, synqe);
	INP_UNLOCK_ASSERT(inp);	/* ok to assert, we have a ref on the inp */

	/*
	 * If we replied during syncache_add (synqe->wr has been consumed),
	 * good.  Otherwise, set it to 0 so that further syncache_respond
	 * attempts by the kernel will be ignored.
	 */
	if (atomic_cmpset_ptr(&synqe->wr, (uintptr_t)wr, 0)) {

		/*
		 * syncache may or may not have a hold on the synqe, which may
		 * or may not be stashed in the original SYN mbuf passed to us.
		 * Just copy it over instead of dealing with all possibilities.
		 */
		m = m_dup(synqe->syn, M_NOWAIT);
		if (m)
			m->m_pkthdr.rcvif = hw_ifp;

		remove_tid(sc, synqe->tid, ntids);
		free(wr, M_CXGBE);

		/* Yank the synqe out of the lctx synq. */
		INP_WLOCK(inp);
		TAILQ_REMOVE(&lctx->synq, synqe, link);
		release_synqe(synqe);	/* removed from synq list */
		inp = release_lctx(sc, lctx);
		if (inp)
			INP_WUNLOCK(inp);

		release_synqe(synqe);	/* extra hold */
		REJECT_PASS_ACCEPT();
	}

	CTR5(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p, synqe %p, SYNACK",
	    __func__, stid, tid, lctx, synqe);

	INP_WLOCK(inp);
	synqe->flags |= TPF_SYNQE_HAS_L2TE;
	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
		/*
		 * Listening socket closed but tod_listen_stop did not abort
		 * this tid because there was no L2T entry for the tid at that
		 * time.  Abort it now.  The reply to the abort will clean up.
		 */
		CTR6(KTR_CXGBE,
		    "%s: stid %u, tid %u, lctx %p, synqe %p (0x%x), ABORT",
		    __func__, stid, tid, lctx, synqe, synqe->flags);
		if (!(synqe->flags & TPF_SYNQE_EXPANDED))
			send_reset_synqe(tod, synqe);
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();

		release_synqe(synqe);	/* extra hold */
		return (__LINE__);
	}
	INP_WUNLOCK(inp);
	CURVNET_RESTORE();

	release_synqe(synqe);	/* extra hold */
	return (0);
reject:
	/* Common path for all refusals to offload; reject_reason says why. */
	CURVNET_RESTORE();
	CTR4(KTR_CXGBE, "%s: stid %u, tid %u, REJECT (%d)", __func__, stid, tid,
	    reject_reason);

	if (e)
		t4_l2t_release(e);
	release_tid(sc, tid, lctx->ctrlq);

	/*
	 * Strip the CPL header off the SYN and feed the raw frame to the
	 * regular (non-offloaded) input path, with the checksums pre-marked
	 * valid since the hardware already verified them.
	 */
	if (__predict_true(m != NULL)) {
		m_adj(m, sizeof(*cpl));
		m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID |
		    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
		m->m_pkthdr.csum_data = 0xffff;
		hw_ifp->if_input(hw_ifp, m);
	}

	return (reject_reason);
}

/*
 * Build the in_conninfo, tcphdr, and tcpopt that syncache_expand expects,
 * made to look like the final ACK of the 3-way handshake arrived in the
 * normal way.
 */
static void
synqe_to_protohdrs(struct adapter *sc, struct synq_entry *synqe,
    const struct cpl_pass_establish *cpl, struct in_conninfo *inc,
    struct tcphdr *th, struct tcpopt *to)
{
	uint16_t tcp_opt = be16toh(cpl->tcp_opt);

	/* start off with the original SYN */
	pass_accept_req_to_protohdrs(sc, synqe->syn, inc, th);

	/* modify parts to make it look like the ACK to our SYN|ACK */
	th->th_flags = TH_ACK;
	th->th_ack = synqe->iss + 1;
	th->th_seq = be32toh(cpl->rcv_isn);
	bzero(to, sizeof(*to));
	if (G_TCPOPT_TSTAMP(tcp_opt)) {
		to->to_flags |= TOF_TS;
		to->to_tsecr = synqe->ts;
	}
}

/*
 * The hardware completed the passive open (peer ACKed our SYN|ACK).  Expand
 * the syncache entry into a full connection and take over with a toepcb.
 */
static int
do_pass_establish(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	struct vi_info *vi;
	struct ifnet *ifp;
	const struct cpl_pass_establish *cpl = (const void *)(rss + 1);
#if defined(KTR) || defined(INVARIANTS)
	unsigned int stid =
	    G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
#endif
	unsigned int tid = GET_TID(cpl);
	struct synq_entry *synqe = lookup_tid(sc, tid);
	struct listen_ctx *lctx = synqe->lctx;
	struct inpcb *inp = lctx->inp, *new_inp;
	struct socket *so;
	struct tcphdr th;
	struct tcpopt to;
	struct in_conninfo inc;
	struct toepcb *toep;
	u_int txqid, rxqid;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_PASS_ESTABLISH,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
	KASSERT(synqe->flags & TPF_SYNQE,
	    ("%s: tid %u (ctx %p) not a synqe", __func__, tid, synqe));

	CURVNET_SET(lctx->vnet);
	INP_INFO_RLOCK(&V_tcbinfo);	/* for syncache_expand */
	INP_WLOCK(inp);

	CTR6(KTR_CXGBE,
	    "%s: stid %u, tid %u, synqe %p (0x%x), inp_flags 0x%x",
	    __func__, stid, tid, synqe, synqe->flags, inp->inp_flags);

	/* Listening socket went away; the abort path will clean up. */
	if (__predict_false(inp->inp_flags & INP_DROPPED)) {

		if (synqe->flags & TPF_SYNQE_HAS_L2TE) {
			KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN,
			    ("%s: listen socket closed but tid %u not aborted.",
			    __func__, tid));
		}

		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return (0);
	}

	ifp = synqe->syn->m_pkthdr.rcvif;
	vi = ifp->if_softc;
	KASSERT(vi->pi->adapter == sc,
	    ("%s: vi %p, sc %p mismatch", __func__, vi, sc));

	get_qids_from_mbuf(synqe->syn, &txqid, &rxqid);
	KASSERT(rxqid == iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0],
	    ("%s: CPL arrived on unexpected rxq. %d %d", __func__, rxqid,
	    (int)(iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0])));

	toep = alloc_toepcb(vi, txqid, rxqid, M_NOWAIT);
	if (toep == NULL) {
reset:
		/*
		 * The reply to this abort will perform final cleanup.  There is
		 * no need to check for HAS_L2TE here.  We can be here only if
		 * we responded to the PASS_ACCEPT_REQ, and our response had the
		 * L2T idx.
		 */
		send_reset_synqe(TOEDEV(ifp), synqe);
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return (0);
	}
	toep->tid = tid;
	toep->l2te = &sc->l2t->l2tab[synqe->l2e_idx];
	if (synqe->flags & TPF_SYNQE_TCPDDP)
		set_tcpddp_ulp_mode(toep);
	else
		toep->ulp_mode = ULP_MODE_NONE;
	/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
	toep->rx_credits = synqe->rcv_bufsize;

	so = inp->inp_socket;
	KASSERT(so != NULL, ("%s: socket is NULL", __func__));

	/* Come up with something that syncache_expand should be ok with. */
	synqe_to_protohdrs(sc, synqe, cpl, &inc, &th, &to);

	/*
	 * No more need for anything in the mbuf that carried the
	 * CPL_PASS_ACCEPT_REQ.  Drop the CPL_PASS_ESTABLISH and toep pointer
	 * there.  XXX: bad form but I don't want to increase the size of synqe.
	 */
	m = synqe->syn;
	KASSERT(sizeof(*cpl) + sizeof(toep) <= m->m_len,
	    ("%s: no room in mbuf %p (m_len %d)", __func__, m, m->m_len));
	bcopy(cpl, mtod(m, void *), sizeof(*cpl));
	*(struct toepcb **)(mtod(m, struct cpl_pass_establish *) + 1) = toep;

	if (!toe_syncache_expand(&inc, &to, &th, &so) || so == NULL) {
		free_toepcb(toep);
		goto reset;
	}

	/* New connection inpcb is already locked by syncache_expand(). */
	new_inp = sotoinpcb(so);
	INP_WLOCK_ASSERT(new_inp);
	MPASS(so->so_vnet == lctx->vnet);
	toep->vnet = lctx->vnet;
	/* IPv6 connections keep a hold on the local IP (CLIP) entry. */
	if (inc.inc_flags & INC_ISIPV6)
		toep->ce = hold_lip(sc->tom_softc, &inc.inc6_laddr, lctx->ce);

	/*
	 * This is for the unlikely case where the syncache entry that we added
	 * has been evicted from the syncache, but the syncache_expand above
	 * works because of syncookies.
	 *
	 * XXX: we've held the tcbinfo lock throughout so there's no risk of
	 * anyone accept'ing a connection before we've installed our hooks, but
	 * this somewhat defeats the purpose of having a tod_offload_socket :-(
	 */
	if (__predict_false(!(synqe->flags & TPF_SYNQE_EXPANDED))) {
		tcp_timer_activate(intotcpcb(new_inp), TT_KEEP, 0);
		t4_offload_socket(TOEDEV(ifp), synqe, so);
	}

	INP_WUNLOCK(new_inp);

	/* Done with the synqe */
	TAILQ_REMOVE(&lctx->synq, synqe, link);
	inp = release_lctx(sc, lctx);
	if (inp != NULL)
		INP_WUNLOCK(inp);
	INP_INFO_RUNLOCK(&V_tcbinfo);
	CURVNET_RESTORE();
	release_synqe(synqe);

	return (0);
}

/*
 * Register the CPL handlers for the passive open (listen) path.
 */
void
t4_init_listen_cpl_handlers(void)
{

	t4_register_cpl_handler(CPL_PASS_OPEN_RPL, do_pass_open_rpl);
	t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl);
	t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
	t4_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
}

/*
 * Unregister the listen-path CPL handlers registered above.
 */
void
t4_uninit_listen_cpl_handlers(void)
{

	t4_register_cpl_handler(CPL_PASS_OPEN_RPL, NULL);
	t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, NULL);
	t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, NULL);
	t4_register_cpl_handler(CPL_PASS_ESTABLISH, NULL);
}
#endif