sdp_main.c revision 330897
1/*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 5 * The Regents of the University of California. All rights reserved. 6 * Copyright (c) 2004 The FreeBSD Foundation. All rights reserved. 7 * Copyright (c) 2004-2008 Robert N. M. Watson. All rights reserved. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 4. Neither the name of the University nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 * 33 * Excerpts taken from tcp_subr.c, tcp_usrreq.c, uipc_socket.c 34 */ 35 36/* 37 * 38 * Copyright (c) 2010 Isilon Systems, Inc. 
39 * Copyright (c) 2010 iX Systems, Inc. 40 * Copyright (c) 2010 Panasas, Inc. 41 * All rights reserved. 42 * 43 * Redistribution and use in source and binary forms, with or without 44 * modification, are permitted provided that the following conditions 45 * are met: 46 * 1. Redistributions of source code must retain the above copyright 47 * notice unmodified, this list of conditions, and the following 48 * disclaimer. 49 * 2. Redistributions in binary form must reproduce the above copyright 50 * notice, this list of conditions and the following disclaimer in the 51 * documentation and/or other materials provided with the distribution. 52 * 53 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 54 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 55 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 56 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 57 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 58 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 59 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 60 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 61 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 62 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
63 * 64 */ 65#include <sys/cdefs.h> 66__FBSDID("$FreeBSD$"); 67 68#include "sdp.h" 69 70#include <net/if.h> 71#include <net/route.h> 72#include <net/vnet.h> 73#include <sys/sysctl.h> 74 75uma_zone_t sdp_zone; 76struct rwlock sdp_lock; 77LIST_HEAD(, sdp_sock) sdp_list; 78 79struct workqueue_struct *rx_comp_wq; 80 81RW_SYSINIT(sdplockinit, &sdp_lock, "SDP lock"); 82#define SDP_LIST_WLOCK() rw_wlock(&sdp_lock) 83#define SDP_LIST_RLOCK() rw_rlock(&sdp_lock) 84#define SDP_LIST_WUNLOCK() rw_wunlock(&sdp_lock) 85#define SDP_LIST_RUNLOCK() rw_runlock(&sdp_lock) 86#define SDP_LIST_WLOCK_ASSERT() rw_assert(&sdp_lock, RW_WLOCKED) 87#define SDP_LIST_RLOCK_ASSERT() rw_assert(&sdp_lock, RW_RLOCKED) 88#define SDP_LIST_LOCK_ASSERT() rw_assert(&sdp_lock, RW_LOCKED) 89 90static MALLOC_DEFINE(M_SDP, "sdp", "Socket Direct Protocol"); 91 92static void sdp_stop_keepalive_timer(struct socket *so); 93 94/* 95 * SDP protocol interface to socket abstraction. 96 */ 97/* 98 * sdp_sendspace and sdp_recvspace are the default send and receive window 99 * sizes, respectively. 100 */ 101u_long sdp_sendspace = 1024*32; 102u_long sdp_recvspace = 1024*64; 103 104static int sdp_count; 105 106/* 107 * Disable async. CMA events for sockets which are being torn down. 108 */ 109static void 110sdp_destroy_cma(struct sdp_sock *ssk) 111{ 112 113 if (ssk->id == NULL) 114 return; 115 rdma_destroy_id(ssk->id); 116 ssk->id = NULL; 117} 118 119static int 120sdp_pcbbind(struct sdp_sock *ssk, struct sockaddr *nam, struct ucred *cred) 121{ 122 struct sockaddr_in *sin; 123 struct sockaddr_in null; 124 int error; 125 126 SDP_WLOCK_ASSERT(ssk); 127 128 if (ssk->lport != 0 || ssk->laddr != INADDR_ANY) 129 return (EINVAL); 130 /* rdma_bind_addr handles bind races. 
*/ 131 SDP_WUNLOCK(ssk); 132 if (ssk->id == NULL) 133 ssk->id = rdma_create_id(sdp_cma_handler, ssk, RDMA_PS_SDP, IB_QPT_RC); 134 if (ssk->id == NULL) { 135 SDP_WLOCK(ssk); 136 return (ENOMEM); 137 } 138 if (nam == NULL) { 139 null.sin_family = AF_INET; 140 null.sin_len = sizeof(null); 141 null.sin_addr.s_addr = INADDR_ANY; 142 null.sin_port = 0; 143 bzero(&null.sin_zero, sizeof(null.sin_zero)); 144 nam = (struct sockaddr *)&null; 145 } 146 error = -rdma_bind_addr(ssk->id, nam); 147 SDP_WLOCK(ssk); 148 if (error == 0) { 149 sin = (struct sockaddr_in *)&ssk->id->route.addr.src_addr; 150 ssk->laddr = sin->sin_addr.s_addr; 151 ssk->lport = sin->sin_port; 152 } else 153 sdp_destroy_cma(ssk); 154 return (error); 155} 156 157static void 158sdp_pcbfree(struct sdp_sock *ssk) 159{ 160 KASSERT(ssk->socket == NULL, ("ssk %p socket still attached", ssk)); 161 162 sdp_dbg(ssk->socket, "Freeing pcb"); 163 SDP_WLOCK_ASSERT(ssk); 164 ssk->flags |= SDP_DESTROY; 165 SDP_WUNLOCK(ssk); 166 SDP_LIST_WLOCK(); 167 sdp_count--; 168 LIST_REMOVE(ssk, list); 169 SDP_LIST_WUNLOCK(); 170 crfree(ssk->cred); 171 sdp_destroy_cma(ssk); 172 ssk->qp_active = 0; 173 if (ssk->qp) { 174 ib_destroy_qp(ssk->qp); 175 ssk->qp = NULL; 176 } 177 sdp_tx_ring_destroy(ssk); 178 sdp_rx_ring_destroy(ssk); 179 rw_destroy(&ssk->rx_ring.destroyed_lock); 180 uma_zfree(sdp_zone, ssk); 181 rw_destroy(&ssk->lock); 182} 183 184/* 185 * Common routines to return a socket address. 
186 */ 187static struct sockaddr * 188sdp_sockaddr(in_port_t port, struct in_addr *addr_p) 189{ 190 struct sockaddr_in *sin; 191 192 sin = malloc(sizeof *sin, M_SONAME, 193 M_WAITOK | M_ZERO); 194 sin->sin_family = AF_INET; 195 sin->sin_len = sizeof(*sin); 196 sin->sin_addr = *addr_p; 197 sin->sin_port = port; 198 199 return (struct sockaddr *)sin; 200} 201 202static int 203sdp_getsockaddr(struct socket *so, struct sockaddr **nam) 204{ 205 struct sdp_sock *ssk; 206 struct in_addr addr; 207 in_port_t port; 208 209 ssk = sdp_sk(so); 210 SDP_RLOCK(ssk); 211 port = ssk->lport; 212 addr.s_addr = ssk->laddr; 213 SDP_RUNLOCK(ssk); 214 215 *nam = sdp_sockaddr(port, &addr); 216 return 0; 217} 218 219static int 220sdp_getpeeraddr(struct socket *so, struct sockaddr **nam) 221{ 222 struct sdp_sock *ssk; 223 struct in_addr addr; 224 in_port_t port; 225 226 ssk = sdp_sk(so); 227 SDP_RLOCK(ssk); 228 port = ssk->fport; 229 addr.s_addr = ssk->faddr; 230 SDP_RUNLOCK(ssk); 231 232 *nam = sdp_sockaddr(port, &addr); 233 return 0; 234} 235 236static void 237sdp_pcbnotifyall(struct in_addr faddr, int errno, 238 struct sdp_sock *(*notify)(struct sdp_sock *, int)) 239{ 240 struct sdp_sock *ssk, *ssk_temp; 241 242 SDP_LIST_WLOCK(); 243 LIST_FOREACH_SAFE(ssk, &sdp_list, list, ssk_temp) { 244 SDP_WLOCK(ssk); 245 if (ssk->faddr != faddr.s_addr || ssk->socket == NULL) { 246 SDP_WUNLOCK(ssk); 247 continue; 248 } 249 if ((ssk->flags & SDP_DESTROY) == 0) 250 if ((*notify)(ssk, errno)) 251 SDP_WUNLOCK(ssk); 252 } 253 SDP_LIST_WUNLOCK(); 254} 255 256#if 0 257static void 258sdp_apply_all(void (*func)(struct sdp_sock *, void *), void *arg) 259{ 260 struct sdp_sock *ssk; 261 262 SDP_LIST_RLOCK(); 263 LIST_FOREACH(ssk, &sdp_list, list) { 264 SDP_WLOCK(ssk); 265 func(ssk, arg); 266 SDP_WUNLOCK(ssk); 267 } 268 SDP_LIST_RUNLOCK(); 269} 270#endif 271 272static void 273sdp_output_reset(struct sdp_sock *ssk) 274{ 275 struct rdma_cm_id *id; 276 277 SDP_WLOCK_ASSERT(ssk); 278 if (ssk->id) { 279 id = ssk->id; 
280 ssk->qp_active = 0; 281 SDP_WUNLOCK(ssk); 282 rdma_disconnect(id); 283 SDP_WLOCK(ssk); 284 } 285 ssk->state = TCPS_CLOSED; 286} 287 288/* 289 * Attempt to close a SDP socket, marking it as dropped, and freeing 290 * the socket if we hold the only reference. 291 */ 292static struct sdp_sock * 293sdp_closed(struct sdp_sock *ssk) 294{ 295 struct socket *so; 296 297 SDP_WLOCK_ASSERT(ssk); 298 299 ssk->flags |= SDP_DROPPED; 300 so = ssk->socket; 301 soisdisconnected(so); 302 if (ssk->flags & SDP_SOCKREF) { 303 KASSERT(so->so_state & SS_PROTOREF, 304 ("sdp_closed: !SS_PROTOREF")); 305 ssk->flags &= ~SDP_SOCKREF; 306 SDP_WUNLOCK(ssk); 307 ACCEPT_LOCK(); 308 SOCK_LOCK(so); 309 so->so_state &= ~SS_PROTOREF; 310 sofree(so); 311 return (NULL); 312 } 313 return (ssk); 314} 315 316/* 317 * Perform timer based shutdowns which can not operate in 318 * callout context. 319 */ 320static void 321sdp_shutdown_task(void *data, int pending) 322{ 323 struct sdp_sock *ssk; 324 325 ssk = data; 326 SDP_WLOCK(ssk); 327 /* 328 * I don't think this can race with another call to pcbfree() 329 * because SDP_TIMEWAIT protects it. SDP_DESTROY may be redundant. 330 */ 331 if (ssk->flags & SDP_DESTROY) 332 panic("sdp_shutdown_task: Racing with pcbfree for ssk %p", 333 ssk); 334 if (ssk->flags & SDP_DISCON) 335 sdp_output_reset(ssk); 336 /* We have to clear this so sdp_detach() will call pcbfree(). */ 337 ssk->flags &= ~(SDP_TIMEWAIT | SDP_DREQWAIT); 338 if ((ssk->flags & SDP_DROPPED) == 0 && 339 sdp_closed(ssk) == NULL) 340 return; 341 if (ssk->socket == NULL) { 342 sdp_pcbfree(ssk); 343 return; 344 } 345 SDP_WUNLOCK(ssk); 346} 347 348/* 349 * 2msl has expired, schedule the shutdown task. 350 */ 351static void 352sdp_2msl_timeout(void *data) 353{ 354 struct sdp_sock *ssk; 355 356 ssk = data; 357 /* Callout canceled. */ 358 if (!callout_active(&ssk->keep2msl)) 359 goto out; 360 callout_deactivate(&ssk->keep2msl); 361 /* Should be impossible, defensive programming. 
*/ 362 if ((ssk->flags & SDP_TIMEWAIT) == 0) 363 goto out; 364 taskqueue_enqueue(taskqueue_thread, &ssk->shutdown_task); 365out: 366 SDP_WUNLOCK(ssk); 367 return; 368} 369 370/* 371 * Schedule the 2msl wait timer. 372 */ 373static void 374sdp_2msl_wait(struct sdp_sock *ssk) 375{ 376 377 SDP_WLOCK_ASSERT(ssk); 378 ssk->flags |= SDP_TIMEWAIT; 379 ssk->state = TCPS_TIME_WAIT; 380 soisdisconnected(ssk->socket); 381 callout_reset(&ssk->keep2msl, TCPTV_MSL, sdp_2msl_timeout, ssk); 382} 383 384/* 385 * Timed out waiting for the final fin/ack from rdma_disconnect(). 386 */ 387static void 388sdp_dreq_timeout(void *data) 389{ 390 struct sdp_sock *ssk; 391 392 ssk = data; 393 /* Callout canceled. */ 394 if (!callout_active(&ssk->keep2msl)) 395 goto out; 396 /* Callout rescheduled, probably as a different timer. */ 397 if (callout_pending(&ssk->keep2msl)) 398 goto out; 399 callout_deactivate(&ssk->keep2msl); 400 if (ssk->state != TCPS_FIN_WAIT_1 && ssk->state != TCPS_LAST_ACK) 401 goto out; 402 if ((ssk->flags & SDP_DREQWAIT) == 0) 403 goto out; 404 ssk->flags &= ~SDP_DREQWAIT; 405 ssk->flags |= SDP_DISCON; 406 sdp_2msl_wait(ssk); 407 ssk->qp_active = 0; 408out: 409 SDP_WUNLOCK(ssk); 410} 411 412/* 413 * Received the final fin/ack. Cancel the 2msl. 
 */
void
sdp_cancel_dreq_wait_timeout(struct sdp_sock *ssk)
{
	sdp_dbg(ssk->socket, "cancelling dreq wait timeout\n");
	/* The dreq wait is over; move straight into TIME_WAIT. */
	ssk->flags &= ~SDP_DREQWAIT;
	sdp_2msl_wait(ssk);
}

/*
 * Initialize the per-socket SDP state: the 2msl callout (which returns
 * with the pcb lock released), the shutdown task, and the rx/tx rings.
 */
static int
sdp_init_sock(struct socket *sk)
{
	struct sdp_sock *ssk = sdp_sk(sk);

	sdp_dbg(sk, "%s\n", __func__);

	callout_init_rw(&ssk->keep2msl, &ssk->lock, CALLOUT_RETURNUNLOCKED);
	TASK_INIT(&ssk->shutdown_task, 0, sdp_shutdown_task, ssk);
#ifdef SDP_ZCOPY
	INIT_DELAYED_WORK(&ssk->srcavail_cancel_work, srcavail_cancel_timeout);
	ssk->zcopy_thresh = -1; /* use global sdp_zcopy_thresh */
	ssk->tx_ring.rdma_inflight = NULL;
#endif
	atomic_set(&ssk->mseq_ack, 0);
	sdp_rx_ring_init(ssk);
	ssk->tx_ring.buffer = NULL;

	return 0;
}

/*
 * Allocate an sdp_sock for the socket and reserve socket buffer space.
 */
static int
sdp_attach(struct socket *so, int proto, struct thread *td)
{
	struct sdp_sock *ssk;
	int error;

	ssk = sdp_sk(so);
	KASSERT(ssk == NULL, ("sdp_attach: ssk already set on so %p", so));
	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
		error = soreserve(so, sdp_sendspace, sdp_recvspace);
		if (error)
			return (error);
	}
	so->so_rcv.sb_flags |= SB_AUTOSIZE;
	so->so_snd.sb_flags |= SB_AUTOSIZE;
	/* M_NOWAIT: attach may be called in a context that cannot sleep. */
	ssk = uma_zalloc(sdp_zone, M_NOWAIT | M_ZERO);
	if (ssk == NULL)
		return (ENOBUFS);
	rw_init(&ssk->lock, "sdpsock");
	ssk->socket = so;
	ssk->cred = crhold(so->so_cred);
	so->so_pcb = (caddr_t)ssk;
	sdp_init_sock(so);
	ssk->flags = 0;
	ssk->qp_active = 0;
	ssk->state = TCPS_CLOSED;
	SDP_LIST_WLOCK();
	LIST_INSERT_HEAD(&sdp_list, ssk, list);
	sdp_count++;
	SDP_LIST_WUNLOCK();
	if ((so->so_options & SO_LINGER) && so->so_linger == 0)
		so->so_linger = TCP_LINGERTIME;

	return (0);
}

/*
 * Detach SDP from the socket, potentially leaving it around for the
 * timewait to expire.
 */
static void
sdp_detach(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	KASSERT(ssk->socket != NULL, ("sdp_detach: socket is NULL"));
	ssk->socket->so_pcb = NULL;
	ssk->socket = NULL;
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DREQWAIT))
		/* Timers still running; sdp_shutdown_task() frees later. */
		SDP_WUNLOCK(ssk);
	else if (ssk->flags & SDP_DROPPED || ssk->state < TCPS_SYN_SENT)
		sdp_pcbfree(ssk);
	else
		panic("sdp_detach: Unexpected state, ssk %p.\n", ssk);
}

/*
 * Allocate a local address for the socket.
 */
static int
sdp_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error = 0;
	struct sdp_sock *ssk;
	struct sockaddr_in *sin;

	/* Only AF_INET unicast addresses of the right size are accepted. */
	sin = (struct sockaddr_in *)nam;
	if (nam->sa_len != sizeof (*sin))
		return (EINVAL);
	if (sin->sin_family != AF_INET)
		return (EINVAL);
	if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
		return (EAFNOSUPPORT);

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = EINVAL;
		goto out;
	}
	error = sdp_pcbbind(ssk, nam, td->td_ucred);
out:
	SDP_WUNLOCK(ssk);

	return (error);
}

/*
 * Prepare to accept connections.
 */
static int
sdp_listen(struct socket *so, int backlog, struct thread *td)
{
	int error = 0;
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = EINVAL;
		goto out;
	}
	/* Implicitly bind to an ephemeral port if not yet bound. */
	if (error == 0 && ssk->lport == 0)
		error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred);
	SOCK_LOCK(so);
	if (error == 0)
		error = solisten_proto_check(so);
	if (error == 0) {
		solisten_proto(so, backlog);
		ssk->state = TCPS_LISTEN;
	}
	SOCK_UNLOCK(so);

out:
	SDP_WUNLOCK(ssk);
	/*
	 * NOTE(review): ssk->id is read after the lock is dropped —
	 * presumably safe because bind just set it; verify no teardown race.
	 */
	if (error == 0)
		error = -rdma_listen(ssk->id, backlog);
	return (error);
}

/*
 * Initiate a SDP connection to nam.
571 */ 572static int 573sdp_start_connect(struct sdp_sock *ssk, struct sockaddr *nam, struct thread *td) 574{ 575 struct sockaddr_in src; 576 struct socket *so; 577 int error; 578 579 so = ssk->socket; 580 581 SDP_WLOCK_ASSERT(ssk); 582 if (ssk->lport == 0) { 583 error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred); 584 if (error) 585 return error; 586 } 587 src.sin_family = AF_INET; 588 src.sin_len = sizeof(src); 589 bzero(&src.sin_zero, sizeof(src.sin_zero)); 590 src.sin_port = ssk->lport; 591 src.sin_addr.s_addr = ssk->laddr; 592 soisconnecting(so); 593 SDP_WUNLOCK(ssk); 594 error = -rdma_resolve_addr(ssk->id, (struct sockaddr *)&src, nam, 595 SDP_RESOLVE_TIMEOUT); 596 SDP_WLOCK(ssk); 597 if (error == 0) 598 ssk->state = TCPS_SYN_SENT; 599 600 return 0; 601} 602 603/* 604 * Initiate SDP connection. 605 */ 606static int 607sdp_connect(struct socket *so, struct sockaddr *nam, struct thread *td) 608{ 609 int error = 0; 610 struct sdp_sock *ssk; 611 struct sockaddr_in *sin; 612 613 sin = (struct sockaddr_in *)nam; 614 if (nam->sa_len != sizeof (*sin)) 615 return (EINVAL); 616 if (sin->sin_family != AF_INET) 617 return (EINVAL); 618 if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) 619 return (EAFNOSUPPORT); 620 if ((error = prison_remote_ip4(td->td_ucred, &sin->sin_addr)) != 0) 621 return (error); 622 ssk = sdp_sk(so); 623 SDP_WLOCK(ssk); 624 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) 625 error = EINVAL; 626 else 627 error = sdp_start_connect(ssk, nam, td); 628 SDP_WUNLOCK(ssk); 629 return (error); 630} 631 632/* 633 * Drop a SDP socket, reporting 634 * the specified error. If connection is synchronized, 635 * then send a RST to peer. 
 */
static struct sdp_sock *
sdp_drop(struct sdp_sock *ssk, int errno)
{
	struct socket *so;

	SDP_WLOCK_ASSERT(ssk);
	so = ssk->socket;
	if (TCPS_HAVERCVDSYN(ssk->state))
		sdp_output_reset(ssk);
	/* Report a stored soft error in preference to a plain timeout. */
	if (errno == ETIMEDOUT && ssk->softerror)
		errno = ssk->softerror;
	so->so_error = errno;
	/* Returns NULL if sdp_closed() released the pcb lock. */
	return (sdp_closed(ssk));
}

/*
 * User issued close, and wish to trail through shutdown states:
 * if never received SYN, just forget it.  If got a SYN from peer,
 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
 * If already got a FIN from peer, then almost done; go to LAST_ACK
 * state.  In all other cases, have already sent FIN to peer (e.g.
 * after PRU_SHUTDOWN), and just have to play tedious game waiting
 * for peer to send FIN or not respond to keep-alives, etc.
 * We can let the user exit from the close as soon as the FIN is acked.
 */
static void
sdp_usrclosed(struct sdp_sock *ssk)
{

	SDP_WLOCK_ASSERT(ssk);

	switch (ssk->state) {
	case TCPS_LISTEN:
		ssk->state = TCPS_CLOSED;
		/* Drop the lock around the sleepable CMA teardown. */
		SDP_WUNLOCK(ssk);
		sdp_destroy_cma(ssk);
		SDP_WLOCK(ssk);
		/* FALLTHROUGH */
	case TCPS_CLOSED:
		ssk = sdp_closed(ssk);
		/*
		 * sdp_closed() should never return NULL here as the socket is
		 * still open.
		 */
		KASSERT(ssk != NULL,
		    ("sdp_usrclosed: sdp_closed() returned NULL"));
		break;

	case TCPS_SYN_SENT:
		/* FALLTHROUGH */
	case TCPS_SYN_RECEIVED:
		ssk->flags |= SDP_NEEDFIN;
		break;

	case TCPS_ESTABLISHED:
		ssk->flags |= SDP_NEEDFIN;
		ssk->state = TCPS_FIN_WAIT_1;
		break;

	case TCPS_CLOSE_WAIT:
		ssk->state = TCPS_LAST_ACK;
		break;
	}
	if (ssk->state >= TCPS_FIN_WAIT_2) {
		/* Prevent the connection hanging in FIN_WAIT_2 forever. */
		if (ssk->state == TCPS_FIN_WAIT_2)
			sdp_2msl_wait(ssk);
		else
			soisdisconnected(ssk->socket);
	}
}

/*
 * Orderly disconnect: arm the dreq timeout, mark that a FIN (DREQ) must
 * be sent, and kick the transmit path.
 */
static void
sdp_output_disconnect(struct sdp_sock *ssk)
{

	SDP_WLOCK_ASSERT(ssk);
	callout_reset(&ssk->keep2msl, SDP_FIN_WAIT_TIMEOUT,
	    sdp_dreq_timeout, ssk);
	ssk->flags |= SDP_NEEDFIN | SDP_DREQWAIT;
	sdp_post_sends(ssk, M_NOWAIT);
}

/*
 * Initiate or continue a disconnect.
 * If embryonic state, just send reset (once).
 * If in ``let data drain'' option and linger null, just drop.
 * Otherwise (hard), mark socket disconnecting and drop
 * current input data; switch states based on user close, and
 * send segment to peer (with FIN).
 */
static void
sdp_start_disconnect(struct sdp_sock *ssk)
{
	struct socket *so;
	int unread;

	so = ssk->socket;
	SDP_WLOCK_ASSERT(ssk);
	sdp_stop_keepalive_timer(so);
	/*
	 * Neither sdp_closed() nor sdp_drop() should return NULL, as the
	 * socket is still open.
	 */
	if (ssk->state < TCPS_ESTABLISHED) {
		ssk = sdp_closed(ssk);
		KASSERT(ssk != NULL,
		    ("sdp_start_disconnect: sdp_close() returned NULL"));
	} else if ((so->so_options & SO_LINGER) && so->so_linger == 0) {
		ssk = sdp_drop(ssk, 0);
		KASSERT(ssk != NULL,
		    ("sdp_start_disconnect: sdp_drop() returned NULL"));
	} else {
		soisdisconnecting(so);
		/* Unread data forces an abortive reset instead of a FIN. */
		unread = sbused(&so->so_rcv);
		sbflush(&so->so_rcv);
		sdp_usrclosed(ssk);
		if (!(ssk->flags & SDP_DROPPED)) {
			if (unread)
				sdp_output_reset(ssk);
			else
				sdp_output_disconnect(ssk);
		}
	}
}

/*
 * User initiated disconnect.
 */
static int
sdp_disconnect(struct socket *so)
{
	struct sdp_sock *ssk;
	int error = 0;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = ECONNRESET;
		goto out;
	}
	sdp_start_disconnect(ssk);
out:
	SDP_WUNLOCK(ssk);
	return (error);
}

/*
 * Accept a connection.  Essentially all the work is done at higher levels;
 * just return the address of the peer, storing through addr.
 *
 *
 * XXX This is broken XXX
 *
 * The rationale for acquiring the sdp lock here is somewhat complicated,
 * and is described in detail in the commit log entry for r175612.  Acquiring
 * it delays an accept(2) racing with sonewconn(), which inserts the socket
 * before the address/port fields are initialized.  A better fix would
 * prevent the socket from being placed in the listen queue until all fields
 * are fully initialized.
 */
static int
sdp_accept(struct socket *so, struct sockaddr **nam)
{
	struct sdp_sock *ssk = NULL;
	struct in_addr addr;
	in_port_t port;
	int error;

	if (so->so_state & SS_ISDISCONNECTED)
		return (ECONNABORTED);

	port = 0;
	addr.s_addr = 0;
	error = 0;
	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = ECONNABORTED;
		goto out;
	}
	port = ssk->fport;
	addr.s_addr = ssk->faddr;
out:
	SDP_WUNLOCK(ssk);
	if (error == 0)
		*nam = sdp_sockaddr(port, &addr);
	return error;
}

/*
 * Mark the connection as being incapable of further output.
 */
static int
sdp_shutdown(struct socket *so)
{
	int error = 0;
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = ECONNRESET;
		goto out;
	}
	socantsendmore(so);
	sdp_usrclosed(ssk);
	if (!(ssk->flags & SDP_DROPPED))
		sdp_output_disconnect(ssk);

out:
	SDP_WUNLOCK(ssk);

	return (error);
}

/*
 * Append an SDP-framed mbuf chain (packet header + SDP_HEAD_SIZE bsdh)
 * to the send sockbuf, coalescing with the last unsent record when the
 * combined chain still fits in one SDP packet.  cnt is the number of
 * mbuf links in mb's chain.  Called with the sockbuf lock held.
 */
static void
sdp_append(struct sdp_sock *ssk, struct sockbuf *sb, struct mbuf *mb, int cnt)
{
	struct mbuf *n;
	int ncnt;

	SOCKBUF_LOCK_ASSERT(sb);
	SBLASTRECORDCHK(sb);
	KASSERT(mb->m_flags & M_PKTHDR,
	    ("sdp_append: %p Missing packet header.\n", mb));
	n = sb->sb_lastrecord;
	/*
	 * If the queue is empty just set all pointers and proceed.
	 */
	if (n == NULL) {
		sb->sb_lastrecord = sb->sb_mb = sb->sb_sndptr = mb;
		for (; mb; mb = mb->m_next) {
			sb->sb_mbtail = mb;
			sballoc(sb, mb);
		}
		return;
	}
	/*
	 * Count the number of mbufs in the current tail.
	 */
	for (ncnt = 0; n->m_next; n = n->m_next)
		ncnt++;
	n = sb->sb_lastrecord;
	/*
	 * If the two chains can fit in a single sdp packet and
	 * the last record has not been sent yet (WRITABLE) coalesce
	 * them.  The lastrecord remains the same but we must strip the
	 * packet header and then let sbcompress do the hard part.
	 */
	if (M_WRITABLE(n) && ncnt + cnt < SDP_MAX_SEND_SGES &&
	    n->m_pkthdr.len + mb->m_pkthdr.len - SDP_HEAD_SIZE <
	    ssk->xmit_size_goal) {
		m_adj(mb, SDP_HEAD_SIZE);
		n->m_pkthdr.len += mb->m_pkthdr.len;
		n->m_flags |= mb->m_flags & (M_PUSH | M_URG);
		m_demote(mb, 1, 0);
		sbcompress(sb, mb, sb->sb_mbtail);
		return;
	}
	/*
	 * Not compressible, just append to the end and adjust counters.
	 */
	sb->sb_lastrecord->m_flags |= M_PUSH;
	sb->sb_lastrecord->m_nextpkt = mb;
	sb->sb_lastrecord = mb;
	if (sb->sb_sndptr == NULL)
		sb->sb_sndptr = mb;
	for (; mb; mb = mb->m_next) {
		sb->sb_mbtail = mb;
		sballoc(sb, mb);
	}
}

/*
 * Do a send by putting data in output queue and updating urgent
 * marker if URG set.  Possibly send more data.  Unlike the other
 * pru_*() routines, the mbuf chains are our responsibility.  We
 * must either enqueue them or free them.  The other pru_* routines
 * generally are caller-frees.
 *
 * This comes from sendfile, normal sends will come from sdp_sosend().
 */
static int
sdp_send(struct socket *so, int flags, struct mbuf *m,
    struct sockaddr *nam, struct mbuf *control, struct thread *td)
{
	struct sdp_sock *ssk;
	struct mbuf *n;
	int error;
	int cnt;

	error = 0;
	ssk = sdp_sk(so);
	KASSERT(m->m_flags & M_PKTHDR,
	    ("sdp_send: %p no packet header", m));
	/* Prepend room for the SDP base sockets direct header. */
	M_PREPEND(m, SDP_HEAD_SIZE, M_WAITOK);
	mtod(m, struct sdp_bsdh *)->mid = SDP_MID_DATA;
	for (n = m, cnt = 0; n->m_next; n = n->m_next)
		cnt++;
	/* Too many segments for one work request; collapse the chain. */
	if (cnt > SDP_MAX_SEND_SGES) {
		n = m_collapse(m, M_WAITOK, SDP_MAX_SEND_SGES);
		if (n == NULL) {
			m_freem(m);
			return (EMSGSIZE);
		}
		m = n;
		for (cnt = 0; n->m_next; n = n->m_next)
			cnt++;
	}
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		if (control)
			m_freem(control);
		if (m)
			m_freem(m);
		error = ECONNRESET;
		goto out;
	}
	if (control) {
		/* SDP doesn't support control messages. */
		if (control->m_len) {
			m_freem(control);
			if (m)
				m_freem(m);
			error = EINVAL;
			goto out;
		}
		m_freem(control);	/* empty control, just free it */
	}
	if (!(flags & PRUS_OOB)) {
		SOCKBUF_LOCK(&so->so_snd);
		sdp_append(ssk, &so->so_snd, m, cnt);
		SOCKBUF_UNLOCK(&so->so_snd);
		if (nam && ssk->state < TCPS_SYN_SENT) {
			/*
			 * Do implied connect if not yet connected.
			 */
			error = sdp_start_connect(ssk, nam, td);
			if (error)
				goto out;
		}
		if (flags & PRUS_EOF) {
			/*
			 * Close the send side of the connection after
			 * the data is sent.
			 */
			socantsendmore(so);
			sdp_usrclosed(ssk);
			if (!(ssk->flags & SDP_DROPPED))
				sdp_output_disconnect(ssk);
		} else if (!(ssk->flags & SDP_DROPPED) &&
		    !(flags & PRUS_MORETOCOME))
			sdp_post_sends(ssk, M_NOWAIT);
		SDP_WUNLOCK(ssk);
		return (0);
	} else {
		/* Out-of-band path: allow a small overcommit of the sb. */
		SOCKBUF_LOCK(&so->so_snd);
		if (sbspace(&so->so_snd) < -512) {
			SOCKBUF_UNLOCK(&so->so_snd);
			m_freem(m);
			error = ENOBUFS;
			goto out;
		}
		/*
		 * According to RFC961 (Assigned Protocols),
		 * the urgent pointer points to the last octet
		 * of urgent data.  We continue, however,
		 * to consider it to indicate the first octet
		 * of data past the urgent section.
		 * Otherwise, snd_up should be one lower.
		 */
		m->m_flags |= M_URG | M_PUSH;
		sdp_append(ssk, &so->so_snd, m, cnt);
		SOCKBUF_UNLOCK(&so->so_snd);
		if (nam && ssk->state < TCPS_SYN_SENT) {
			/*
			 * Do implied connect if not yet connected.
			 */
			error = sdp_start_connect(ssk, nam, td);
			if (error)
				goto out;
		}
		sdp_post_sends(ssk, M_NOWAIT);
		SDP_WUNLOCK(ssk);
		return (0);
	}
out:
	SDP_WUNLOCK(ssk);
	return (error);
}

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)

/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.  Lock against other senders.  If must go
 * all at once and not enough room now, then inform user that this would
 * block and do nothing.  Otherwise, if nonblocking, send as much as
 * possible.  The data to be sent is described by "uio" if nonzero, otherwise
 * by the mbuf chain "top" (which must be null if uio is not).  Data provided
 * in mbuf chain must be small enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers must check for short
 * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
 * on return.
 */
static int
sdp_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	struct sdp_sock *ssk;
	long space, resid;
	int atomic;
	int error;
	int copy;

	if (uio != NULL)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	atomic = top != NULL;
	/* SDP has no control messages; a non-empty one is an error. */
	if (control != NULL) {
		if (control->m_len) {
			m_freem(control);
			if (top)
				m_freem(top);
			return (EINVAL);
		}
		m_freem(control);
		control = NULL;
	}
	/*
	 * In theory resid should be unsigned.  However, space must be
	 * signed, as it might be less than 0 if we over-committed, and we
	 * must use a signed comparison of space and resid.  On the other
	 * hand, a negative resid causes us to loop sending 0-length
	 * segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out;
	}
	if (td != NULL)
		td->td_ru.ru_msgsnd++;

	ssk = sdp_sk(so);
	/* Serialize against other senders on this socket. */
	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out;

restart:
	do {
		SOCKBUF_LOCK(&so->so_snd);
		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EPIPE;
			goto release;
		}
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			SOCKBUF_UNLOCK(&so->so_snd);
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0 && addr == NULL) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = ENOTCONN;
			goto release;
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		/* Atomic sends must fit in a single SDP payload. */
		if (atomic && resid > ssk->xmit_size_goal - SDP_HEAD_SIZE) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EMSGSIZE;
			goto release;
		}
		if (space < resid &&
		    (atomic || space < so->so_snd.sb_lowat)) {
			if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) {
				SOCKBUF_UNLOCK(&so->so_snd);
				error = EWOULDBLOCK;
				goto release;
			}
			/* Wait for space; sbwait drops/retakes the sb lock. */
			error = sbwait(&so->so_snd);
			SOCKBUF_UNLOCK(&so->so_snd);
			if (error)
				goto release;
			goto restart;
		}
		SOCKBUF_UNLOCK(&so->so_snd);
		do {
			if (uio == NULL) {
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
				/*
				 * Copy the data from userland into a mbuf
				 * chain.  If no data is to be copied in,
				 * a single empty mbuf is returned.
				 */
				copy = min(space,
				    ssk->xmit_size_goal - SDP_HEAD_SIZE);
				top = m_uiotombuf(uio, M_WAITOK, copy,
				    0, M_PKTHDR |
				    ((flags & MSG_EOR) ? M_EOR : 0));
				if (top == NULL) {
					/* only possible error */
					error = EFAULT;
					goto release;
				}
				space -= resid - uio->uio_resid;
				resid = uio->uio_resid;
			}
			/*
			 * XXX all the SBS_CANTSENDMORE checks previously
			 * done could be out of date after dropping the
			 * socket lock.
			 */
			error = sdp_send(so, (flags & MSG_OOB) ? PRUS_OOB :
			    /*
			     * Set EOF on the last send if the user specified
			     * MSG_EOF.
			     */
			    ((flags & MSG_EOF) && (resid <= 0)) ? PRUS_EOF :
			    /* If there is more to send set PRUS_MORETOCOME. */
			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
			    top, addr, NULL, td);
			/* sdp_send() consumed top — don't free it below. */
			top = NULL;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	sbunlock(&so->so_snd);
out:
	if (top != NULL)
		m_freem(top);
	return (error);
}

/*
 * The part of soreceive() that implements reading non-inline out-of-band
 * data from a socket.  For more complete comments, see soreceive(), from
 * which this code originated.
 *
 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
 * unable to return an mbuf chain to the caller.
 */
static int
soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
{
	struct protosw *pr = so->so_proto;
	struct mbuf *m;
	int error;

	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));

	m = m_get(M_WAITOK, MT_DATA);
	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
	if (error)
		goto bad;
	do {
		error = uiomove(mtod(m, void *),
		    (int) min(uio->uio_resid, m->m_len), uio);
		/* m_free() returns the next mbuf in the chain, if any. */
		m = m_free(m);
	} while (uio->uio_resid && error == 0 && m);
bad:
	if (m != NULL)
		m_freem(m);
	return (error);
}

/*
 * Optimized version of soreceive() for stream (TCP) sockets.
 */
static int
sdp_sorecv(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	int len = 0, error = 0, flags, oresid;
	struct sockbuf *sb;
	struct mbuf *m, *n = NULL;
	struct sdp_sock *ssk;

	/* We only do stream sockets. */
	if (so->so_type != SOCK_STREAM)
		return (EINVAL);
	if (psa != NULL)
		*psa = NULL;
	/* SDP carries no control data. */
	if (controlp != NULL)
		return (EINVAL);
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB)
		return (soreceive_rcvoob(so, uio, flags));
	if (mp0 != NULL)
		*mp0 = NULL;

	sb = &so->so_rcv;
	ssk = sdp_sk(so);

	/* Prevent other readers from entering the socket. */
	error = sblock(sb, SBLOCKWAIT(flags));
	if (error)
		goto out;
	SOCKBUF_LOCK(sb);

	/* Easy one, no space to copyout anything. */
	if (uio->uio_resid == 0) {
		error = EINVAL;
		goto out;
	}
	oresid = uio->uio_resid;

	/* We will never ever get anything unless we are connected. */
	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
		/* When disconnecting there may be still some data left. */
		if (sbavail(sb))
			goto deliver;
		if (!(so->so_state & SS_ISDISCONNECTED))
			error = ENOTCONN;
		goto out;
	}

	/* Socket buffer is empty and we shall not block. */
	if (sbavail(sb) == 0 &&
	    ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
		error = EAGAIN;
		goto out;
	}

restart:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);

	/* Abort if socket has reported problems. */
	if (so->so_error) {
		if (sbavail(sb))
			goto deliver;
		if (oresid > uio->uio_resid)
			goto out;
		error = so->so_error;
		if (!(flags & MSG_PEEK))
			so->so_error = 0;
		goto out;
	}

	/* Door is closed.  Deliver what is left, if any.
 */
	if (sb->sb_state & SBS_CANTRCVMORE) {
		if (sbavail(sb))
			goto deliver;
		else
			goto out;
	}

	/* Socket buffer got some data that we shall deliver now. */
	if (sbavail(sb) && !(flags & MSG_WAITALL) &&
	    ((so->so_state & SS_NBIO) ||
	     (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
	     sbavail(sb) >= sb->sb_lowat ||
	     sbavail(sb) >= uio->uio_resid ||
	     sbavail(sb) >= sb->sb_hiwat) ) {
		goto deliver;
	}

	/* On MSG_WAITALL we must wait until all data or error arrives. */
	if ((flags & MSG_WAITALL) &&
	    (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_lowat))
		goto deliver;

	/*
	 * Wait and block until (more) data comes in.
	 * NB: Drops the sockbuf lock during wait.
	 */
	error = sbwait(sb);
	if (error)
		goto out;
	goto restart;

deliver:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	KASSERT(sbavail(sb), ("%s: sockbuf empty", __func__));
	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));

	/* Statistics. */
	if (uio->uio_td)
		uio->uio_td->td_ru.ru_msgrcv++;

	/* Fill uio until full or current end of socket buffer is reached. */
	len = min(uio->uio_resid, sbavail(sb));
	if (mp0 != NULL) {
		/* Dequeue as many mbufs as possible. */
		if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
			for (*mp0 = m = sb->sb_mb;
			     m != NULL && m->m_len <= len;
			     m = m->m_next) {
				len -= m->m_len;
				uio->uio_resid -= m->m_len;
				sbfree(sb, m);
				n = m;
			}
			sb->sb_mb = m;
			if (sb->sb_mb == NULL)
				SB_EMPTY_FIXUP(sb);
			/*
			 * n is the last dequeued mbuf; the loop above ran at
			 * least once because len >= sb_mb->m_len was checked,
			 * so n is non-NULL here.
			 */
			n->m_next = NULL;
		}
		/* Copy the remainder. */
		if (len > 0) {
			KASSERT(sb->sb_mb != NULL,
			    ("%s: len > 0 && sb->sb_mb empty", __func__));

			m = m_copym(sb->sb_mb, 0, len, M_NOWAIT);
			if (m == NULL)
				len = 0;	/* Don't flush data from sockbuf.
 */
			else
				uio->uio_resid -= m->m_len;
			if (*mp0 != NULL)
				n->m_next = m;
			else
				*mp0 = m;
			if (*mp0 == NULL) {
				error = ENOBUFS;
				goto out;
			}
		}
	} else {
		/* NB: Must unlock socket buffer as uiomove may sleep. */
		SOCKBUF_UNLOCK(sb);
		error = m_mbuftouio(uio, sb->sb_mb, len);
		SOCKBUF_LOCK(sb);
		if (error)
			goto out;
	}
	SBLASTRECORDCHK(sb);
	SBLASTMBUFCHK(sb);

	/*
	 * Remove the delivered data from the socket buffer unless we
	 * were only peeking.
	 */
	if (!(flags & MSG_PEEK)) {
		if (len > 0)
			sbdrop_locked(sb, len);

		/* Notify protocol that we drained some data. */
		SOCKBUF_UNLOCK(sb);
		SDP_WLOCK(ssk);
		sdp_do_posts(ssk);
		SDP_WUNLOCK(ssk);
		SOCKBUF_LOCK(sb);
	}

	/*
	 * For MSG_WAITALL we may have to loop again and wait for
	 * more data to come in.
	 */
	if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
		goto restart;
out:
	SOCKBUF_LOCK_ASSERT(sb);
	SBLASTRECORDCHK(sb);
	SBLASTMBUFCHK(sb);
	SOCKBUF_UNLOCK(sb);
	sbunlock(sb);
	return (error);
}

/*
 * Abort is used to teardown a connection typically while sitting in
 * the accept queue.
 */
void
sdp_abort(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	/*
	 * If we have not yet dropped, do it now.
	 */
	if (!(ssk->flags & SDP_TIMEWAIT) &&
	    !(ssk->flags & SDP_DROPPED))
		sdp_drop(ssk, ECONNABORTED);
	KASSERT(ssk->flags & SDP_DROPPED, ("sdp_abort: %p not dropped 0x%X",
	    ssk, ssk->flags));
	SDP_WUNLOCK(ssk);
}

/*
 * Close a SDP socket and initiate a friendly disconnect.
 */
static void
sdp_close(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	/*
	 * If we have not yet dropped, do it now.
	 */
	if (!(ssk->flags & SDP_TIMEWAIT) &&
	    !(ssk->flags & SDP_DROPPED))
		sdp_start_disconnect(ssk);

	/*
	 * If we've still not dropped let the socket layer know we're
	 * holding on to the socket and pcb for a while.
	 */
	if (!(ssk->flags & SDP_DROPPED)) {
		SOCK_LOCK(so);
		so->so_state |= SS_PROTOREF;
		SOCK_UNLOCK(so);
		ssk->flags |= SDP_SOCKREF;
	}
	SDP_WUNLOCK(ssk);
}

/*
 * User requests out-of-band data.
 */
static int
sdp_rcvoob(struct socket *so, struct mbuf *m, int flags)
{
	int error = 0;
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (!rx_ring_trylock(&ssk->rx_ring)) {
		SDP_WUNLOCK(ssk);
		return (ECONNRESET);
	}
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = ECONNRESET;
		goto out;
	}
	/*
	 * Reject the request if we are not at the OOB mark, if OOB data is
	 * delivered inline, or if the stashed byte was already consumed.
	 */
	if ((so->so_oobmark == 0 &&
	     (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) ||
	    so->so_options & SO_OOBINLINE ||
	    ssk->oobflags & SDP_HADOOB) {
		error = EINVAL;
		goto out;
	}
	if ((ssk->oobflags & SDP_HAVEOOB) == 0) {
		error = EWOULDBLOCK;
		goto out;
	}
	/* Hand back the single stashed urgent byte. */
	m->m_len = 1;
	*mtod(m, caddr_t) = ssk->iobc;
	/* Unless peeking, mark the byte consumed (HAVEOOB -> HADOOB). */
	if ((flags & MSG_PEEK) == 0)
		ssk->oobflags ^= (SDP_HAVEOOB | SDP_HADOOB);
out:
	rx_ring_unlock(&ssk->rx_ring);
	SDP_WUNLOCK(ssk);
	return (error);
}

/*
 * Urgent data arrived in mb; record the OOB mark, notify the socket layer,
 * and, unless SO_OOBINLINE is set, pull the final byte of the chain out of
 * the mbuf and stash it in ssk->iobc for later sdp_rcvoob() delivery.
 */
void
sdp_urg(struct sdp_sock *ssk, struct mbuf *mb)
{
	struct mbuf *m;
	struct socket *so;

	so = ssk->socket;
	if (so == NULL)
		return;

	/* The urgent byte is the last byte of this segment. */
	so->so_oobmark = sbused(&so->so_rcv) + mb->m_pkthdr.len - 1;
	sohasoutofband(so);
	ssk->oobflags &= ~(SDP_HAVEOOB | SDP_HADOOB);
	if (!(so->so_options & SO_OOBINLINE)) {
		/* Walk to the last mbuf of the chain. */
		for (m = mb; m->m_next != NULL; m = m->m_next);
		ssk->iobc = *(mtod(m, char *) + m->m_len - 1);
		ssk->oobflags |= SDP_HAVEOOB;
		/* Trim the stashed byte out of the inline data stream. */
		m->m_len--;
		mb->m_pkthdr.len--;
	}
}

/*
 * Notify a
sdp socket of an asynchronous error. 1528 * 1529 * Do not wake up user since there currently is no mechanism for 1530 * reporting soft errors (yet - a kqueue filter may be added). 1531 */ 1532struct sdp_sock * 1533sdp_notify(struct sdp_sock *ssk, int error) 1534{ 1535 1536 SDP_WLOCK_ASSERT(ssk); 1537 1538 if ((ssk->flags & SDP_TIMEWAIT) || 1539 (ssk->flags & SDP_DROPPED)) 1540 return (ssk); 1541 1542 /* 1543 * Ignore some errors if we are hooked up. 1544 */ 1545 if (ssk->state == TCPS_ESTABLISHED && 1546 (error == EHOSTUNREACH || error == ENETUNREACH || 1547 error == EHOSTDOWN)) 1548 return (ssk); 1549 ssk->softerror = error; 1550 return sdp_drop(ssk, error); 1551} 1552 1553static void 1554sdp_ctlinput(int cmd, struct sockaddr *sa, void *vip) 1555{ 1556 struct in_addr faddr; 1557 1558 faddr = ((struct sockaddr_in *)sa)->sin_addr; 1559 if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) 1560 return; 1561 1562 sdp_pcbnotifyall(faddr, inetctlerrmap[cmd], sdp_notify); 1563} 1564 1565static int 1566sdp_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, 1567 struct thread *td) 1568{ 1569 return (EOPNOTSUPP); 1570} 1571 1572static void 1573sdp_keepalive_timeout(void *data) 1574{ 1575 struct sdp_sock *ssk; 1576 1577 ssk = data; 1578 /* Callout canceled. */ 1579 if (!callout_active(&ssk->keep2msl)) 1580 return; 1581 /* Callout rescheduled as a different kind of timer. 
 */
	if (callout_pending(&ssk->keep2msl))
		goto out;
	callout_deactivate(&ssk->keep2msl);
	/* Nothing to do if the pcb dropped or keepalives were disabled. */
	if (ssk->flags & SDP_DROPPED ||
	    (ssk->socket->so_options & SO_KEEPALIVE) == 0)
		goto out;
	sdp_post_keepalive(ssk);
	/* Re-arm ourselves for the next keepalive interval. */
	callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME,
	    sdp_keepalive_timeout, ssk);
out:
	SDP_WUNLOCK(ssk);
}


/* Arm the keepalive callout unless one is already pending. */
void
sdp_start_keepalive_timer(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	if (!callout_pending(&ssk->keep2msl))
		callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME,
		    sdp_keepalive_timeout, ssk);
}

/* Cancel a pending keepalive callout, if any. */
static void
sdp_stop_keepalive_timer(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	callout_stop(&ssk->keep2msl);
}

/*
 * sdp_ctloutput() must drop the inpcb lock before performing copyin on
 * socket option arguments.  When it re-acquires the lock after the copy, it
 * has to revalidate that the connection is still valid for the socket
 * option.
1621 */ 1622#define SDP_WLOCK_RECHECK(inp) do { \ 1623 SDP_WLOCK(ssk); \ 1624 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { \ 1625 SDP_WUNLOCK(ssk); \ 1626 return (ECONNRESET); \ 1627 } \ 1628} while(0) 1629 1630static int 1631sdp_ctloutput(struct socket *so, struct sockopt *sopt) 1632{ 1633 int error, opt, optval; 1634 struct sdp_sock *ssk; 1635 1636 error = 0; 1637 ssk = sdp_sk(so); 1638 if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_KEEPALIVE) { 1639 SDP_WLOCK(ssk); 1640 if (so->so_options & SO_KEEPALIVE) 1641 sdp_start_keepalive_timer(so); 1642 else 1643 sdp_stop_keepalive_timer(so); 1644 SDP_WUNLOCK(ssk); 1645 } 1646 if (sopt->sopt_level != IPPROTO_TCP) 1647 return (error); 1648 1649 SDP_WLOCK(ssk); 1650 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { 1651 SDP_WUNLOCK(ssk); 1652 return (ECONNRESET); 1653 } 1654 1655 switch (sopt->sopt_dir) { 1656 case SOPT_SET: 1657 switch (sopt->sopt_name) { 1658 case TCP_NODELAY: 1659 SDP_WUNLOCK(ssk); 1660 error = sooptcopyin(sopt, &optval, sizeof optval, 1661 sizeof optval); 1662 if (error) 1663 return (error); 1664 1665 SDP_WLOCK_RECHECK(ssk); 1666 opt = SDP_NODELAY; 1667 if (optval) 1668 ssk->flags |= opt; 1669 else 1670 ssk->flags &= ~opt; 1671 sdp_do_posts(ssk); 1672 SDP_WUNLOCK(ssk); 1673 break; 1674 1675 default: 1676 SDP_WUNLOCK(ssk); 1677 error = ENOPROTOOPT; 1678 break; 1679 } 1680 break; 1681 1682 case SOPT_GET: 1683 switch (sopt->sopt_name) { 1684 case TCP_NODELAY: 1685 optval = ssk->flags & SDP_NODELAY; 1686 SDP_WUNLOCK(ssk); 1687 error = sooptcopyout(sopt, &optval, sizeof optval); 1688 break; 1689 default: 1690 SDP_WUNLOCK(ssk); 1691 error = ENOPROTOOPT; 1692 break; 1693 } 1694 break; 1695 } 1696 return (error); 1697} 1698#undef SDP_WLOCK_RECHECK 1699 1700int sdp_mod_count = 0; 1701int sdp_mod_usec = 0; 1702 1703void 1704sdp_set_default_moderation(struct sdp_sock *ssk) 1705{ 1706 struct ib_cq_attr attr; 1707 if (sdp_mod_count <= 0 || sdp_mod_usec <= 0) 1708 return; 1709 memset(&attr, 0, 
	    sizeof(attr));
	attr.moderation.cq_count = sdp_mod_count;
	attr.moderation.cq_period = sdp_mod_usec;

	ib_modify_cq(ssk->rx_ring.cq, &attr, IB_CQ_MODERATION);
}

/*
 * IB client "add" callback: allocate the per-device protection domain,
 * DMA memory region and FMR pool, and attach them as client data.  On any
 * failure the partially constructed state is unwound and the device is
 * left without SDP client data.
 */
static void
sdp_dev_add(struct ib_device *device)
{
	struct ib_fmr_pool_param param;
	struct sdp_device *sdp_dev;

	sdp_dev = malloc(sizeof(*sdp_dev), M_SDP, M_WAITOK | M_ZERO);
	sdp_dev->pd = ib_alloc_pd(device);
	if (IS_ERR(sdp_dev->pd))
		goto out_pd;
	sdp_dev->mr = ib_get_dma_mr(sdp_dev->pd, IB_ACCESS_LOCAL_WRITE);
	if (IS_ERR(sdp_dev->mr))
		goto out_mr;
	memset(&param, 0, sizeof param);
	param.max_pages_per_fmr = SDP_FMR_SIZE;
	param.page_shift = PAGE_SHIFT;
	param.access = (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ);
	param.pool_size = SDP_FMR_POOL_SIZE;
	param.dirty_watermark = SDP_FMR_DIRTY_SIZE;
	param.cache = 1;
	sdp_dev->fmr_pool = ib_create_fmr_pool(sdp_dev->pd, &param);
	if (IS_ERR(sdp_dev->fmr_pool))
		goto out_fmr;
	ib_set_client_data(device, &sdp_client, sdp_dev);
	return;

out_fmr:
	ib_dereg_mr(sdp_dev->mr);
out_mr:
	ib_dealloc_pd(sdp_dev->pd);
out_pd:
	free(sdp_dev, M_SDP);
}

/*
 * IB client "remove" callback: reset every SDP connection that was using
 * this device, then tear down the per-device IB resources.
 */
static void
sdp_dev_rem(struct ib_device *device)
{
	struct sdp_device *sdp_dev;
	struct sdp_sock *ssk;

	SDP_LIST_WLOCK();
	LIST_FOREACH(ssk, &sdp_list, list) {
		if (ssk->ib_device != device)
			continue;
		SDP_WLOCK(ssk);
		/*
		 * NOTE(review): sdp_notify() may return NULL (via
		 * sdp_drop); the loop then advances from a NULL ssk —
		 * verify sdp_drop cannot return NULL while the entry is
		 * still linked in sdp_list.
		 */
		if ((ssk->flags & SDP_DESTROY) == 0)
			ssk = sdp_notify(ssk, ECONNRESET);
		if (ssk)
			SDP_WUNLOCK(ssk);
	}
	SDP_LIST_WUNLOCK();
	/*
	 * XXX Do I need to wait between these two?
	 */
	sdp_dev = ib_get_client_data(device, &sdp_client);
	if (!sdp_dev)
		return;
	ib_flush_fmr_pool(sdp_dev->fmr_pool);
	ib_destroy_fmr_pool(sdp_dev->fmr_pool);
	ib_dereg_mr(sdp_dev->mr);
	ib_dealloc_pd(sdp_dev->pd);
	free(sdp_dev, M_SDP);
}

struct ib_client sdp_client =
    { .name = "sdp", .add = sdp_dev_add, .remove = sdp_dev_rem };


/*
 * Sysctl handler exporting the SDP connection list in the same xtcpcb
 * format used for TCP, so existing tools (e.g. netstat) can display it.
 */
static int
sdp_pcblist(SYSCTL_HANDLER_ARGS)
{
	int error, n, i;
	struct sdp_sock *ssk;
	struct xinpgen xig;

	/*
	 * The process of preparing the TCB list is too time-consuming and
	 * resource-intensive to repeat twice on every request.
	 */
	if (req->oldptr == NULL) {
		n = sdp_count;
		/* Leave slack for connections created while we run. */
		n += imax(n / 8, 10);
		req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb);
		return (0);
	}

	/* This is a read-only sysctl. */
	if (req->newptr != NULL)
		return (EPERM);

	/*
	 * OK, now we're committed to doing something.
	 */
	SDP_LIST_RLOCK();
	n = sdp_count;
	SDP_LIST_RUNLOCK();

	error = sysctl_wire_old_buffer(req, 2 * (sizeof xig)
		+ n * sizeof(struct xtcpcb));
	if (error != 0)
		return (error);

	/* SDP has no generation counters; report zero. */
	xig.xig_len = sizeof xig;
	xig.xig_count = n;
	xig.xig_gen = 0;
	xig.xig_sogen = so_gencnt;
	error = SYSCTL_OUT(req, &xig, sizeof xig);
	if (error)
		return (error);

	SDP_LIST_RLOCK();
	for (ssk = LIST_FIRST(&sdp_list), i = 0;
	    ssk != NULL && i < n; ssk = LIST_NEXT(ssk, list)) {
		struct xtcpcb xt;

		SDP_RLOCK(ssk);
		/* Skip entries the requesting credential may not see. */
		if (ssk->flags & SDP_TIMEWAIT) {
			if (ssk->cred != NULL)
				error = cr_cansee(req->td->td_ucred,
				    ssk->cred);
			else
				error = EINVAL;	/* Skip this inp.
 */
		} else if (ssk->socket)
			error = cr_canseesocket(req->td->td_ucred,
			    ssk->socket);
		else
			error = EINVAL;
		if (error) {
			error = 0;
			goto next;
		}

		/* Fake up an xtcpcb from the SDP connection state. */
		bzero(&xt, sizeof(xt));
		xt.xt_len = sizeof xt;
		xt.xt_inp.inp_gencnt = 0;
		xt.xt_inp.inp_vflag = INP_IPV4;
		memcpy(&xt.xt_inp.inp_laddr, &ssk->laddr, sizeof(ssk->laddr));
		xt.xt_inp.inp_lport = ssk->lport;
		memcpy(&xt.xt_inp.inp_faddr, &ssk->faddr, sizeof(ssk->faddr));
		xt.xt_inp.inp_fport = ssk->fport;
		xt.xt_tp.t_state = ssk->state;
		if (ssk->socket != NULL)
			sotoxsocket(ssk->socket, &xt.xt_socket);
		else
			bzero(&xt.xt_socket, sizeof xt.xt_socket);
		xt.xt_socket.xso_protocol = IPPROTO_TCP;
		SDP_RUNLOCK(ssk);
		error = SYSCTL_OUT(req, &xt, sizeof xt);
		if (error)
			break;
		i++;
		continue;
next:
		SDP_RUNLOCK(ssk);
	}
	if (!error) {
		/*
		 * Give the user an updated idea of our state.
		 * If the generation differs from what we told
		 * her before, she knows that something happened
		 * while we were processing this request, and it
		 * might be necessary to retry.
		 */
		xig.xig_gen = 0;
		xig.xig_sogen = so_gencnt;
		xig.xig_count = sdp_count;
		error = SYSCTL_OUT(req, &xig, sizeof xig);
	}
	SDP_LIST_RUNLOCK();
	return (error);
}

static SYSCTL_NODE(_net_inet, -1, sdp, CTLFLAG_RW, 0, "SDP");

SYSCTL_PROC(_net_inet_sdp, TCPCTL_PCBLIST, pcblist,
    CTLFLAG_RD | CTLTYPE_STRUCT, 0, 0, sdp_pcblist, "S,xtcpcb",
    "List of active SDP connections");

/* maxsockets_change event handler: track the socket zone limit. */
static void
sdp_zone_change(void *tag)
{

	uma_zone_set_max(sdp_zone, maxsockets);
}

/* Domain init: set up the pcb list, UMA zone, workqueue and IB client. */
static void
sdp_init(void)
{

	LIST_INIT(&sdp_list);
	sdp_zone = uma_zcreate("sdp_sock", sizeof(struct sdp_sock),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	uma_zone_set_max(sdp_zone, maxsockets);
	EVENTHANDLER_REGISTER(maxsockets_change, sdp_zone_change, NULL,
	    EVENTHANDLER_PRI_ANY);
	rx_comp_wq = create_singlethread_workqueue("rx_comp_wq");
	ib_register_client(&sdp_client);
}

extern struct domain sdpdomain;

struct pr_usrreqs sdp_usrreqs = {
	.pru_abort =		sdp_abort,
	.pru_accept =		sdp_accept,
	.pru_attach =		sdp_attach,
	.pru_bind =		sdp_bind,
	.pru_connect =		sdp_connect,
	.pru_control =		sdp_control,
	.pru_detach =		sdp_detach,
	.pru_disconnect =	sdp_disconnect,
	.pru_listen =		sdp_listen,
	.pru_peeraddr =		sdp_getpeeraddr,
	.pru_rcvoob =		sdp_rcvoob,
	.pru_send =		sdp_send,
	.pru_sosend =		sdp_sosend,
	.pru_soreceive =	sdp_sorecv,
	.pru_shutdown =		sdp_shutdown,
	.pru_sockaddr =		sdp_getsockaddr,
	.pru_close =		sdp_close,
};

struct protosw sdpsw[] = {
{
	.pr_type =		SOCK_STREAM,
	.pr_domain =		&sdpdomain,
	.pr_protocol =		IPPROTO_IP,
	.pr_flags =		PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD,
	.pr_ctlinput =		sdp_ctlinput,
	.pr_ctloutput =		sdp_ctloutput,
	.pr_usrreqs =		&sdp_usrreqs
},
{
	.pr_type =		SOCK_STREAM,
	.pr_domain =
				&sdpdomain,
	/*
	 * Second switch entry: same handlers, registered under
	 * IPPROTO_TCP — presumably so socket(AF_INET_SDP, SOCK_STREAM,
	 * IPPROTO_TCP) also works; confirm against domain lookup code.
	 */
	.pr_protocol =		IPPROTO_TCP,
	.pr_flags =		PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD,
	.pr_ctlinput =		sdp_ctlinput,
	.pr_ctloutput =		sdp_ctloutput,
	.pr_usrreqs =		&sdp_usrreqs
},
};

struct domain sdpdomain = {
	.dom_family =		AF_INET_SDP,
	.dom_name =		"SDP",
	.dom_init =		sdp_init,
	.dom_protosw =		sdpsw,
	.dom_protoswNPROTOSW =	&sdpsw[sizeof(sdpsw)/sizeof(sdpsw[0])],
};

DOMAIN_SET(sdp);

/* Debug verbosity knobs; data tracing is off by default. */
int sdp_debug_level = 1;
int sdp_data_debug_level = 0;