/* icl.c revision 265501 */
1/*- 2 * Copyright (c) 2012 The FreeBSD Foundation 3 * All rights reserved. 4 * 5 * This software was developed by Edward Tomasz Napierala under sponsorship 6 * from the FreeBSD Foundation. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 * 29 * $FreeBSD: stable/10/sys/dev/iscsi/icl.c 265501 2014-05-07 06:46:59Z trasz $ 30 */ 31 32/* 33 * iSCSI Common Layer. It's used by both the initiator and target to send 34 * and receive iSCSI PDUs. 
35 */ 36 37#include <sys/param.h> 38#include <sys/capability.h> 39#include <sys/condvar.h> 40#include <sys/conf.h> 41#include <sys/file.h> 42#include <sys/kernel.h> 43#include <sys/kthread.h> 44#include <sys/lock.h> 45#include <sys/mbuf.h> 46#include <sys/mutex.h> 47#include <sys/module.h> 48#include <sys/socket.h> 49#include <sys/socketvar.h> 50#include <sys/sysctl.h> 51#include <sys/systm.h> 52#include <sys/sx.h> 53#include <sys/uio.h> 54#include <vm/uma.h> 55#include <netinet/in.h> 56#include <netinet/tcp.h> 57 58#include "icl.h" 59#include "iscsi_proto.h" 60 61SYSCTL_NODE(_kern, OID_AUTO, icl, CTLFLAG_RD, 0, "iSCSI Common Layer"); 62static int debug = 1; 63TUNABLE_INT("kern.icl.debug", &debug); 64SYSCTL_INT(_kern_icl, OID_AUTO, debug, CTLFLAG_RWTUN, 65 &debug, 1, "Enable debug messages"); 66static int partial_receive_len = 1 * 1024; /* XXX: More? */ 67TUNABLE_INT("kern.icl.partial_receive_len", &partial_receive_len); 68SYSCTL_INT(_kern_icl, OID_AUTO, partial_receive_len, CTLFLAG_RWTUN, 69 &partial_receive_len, 1 * 1024, "Minimum read size for partially received " 70 "data segment"); 71static int sendspace = 1048576; 72TUNABLE_INT("kern.icl.sendspace", &sendspace); 73SYSCTL_INT(_kern_icl, OID_AUTO, sendspace, CTLFLAG_RWTUN, 74 &sendspace, 1048576, "Default send socket buffer size"); 75static int recvspace = 1048576; 76TUNABLE_INT("kern.icl.recvspace", &recvspace); 77SYSCTL_INT(_kern_icl, OID_AUTO, recvspace, CTLFLAG_RWTUN, 78 &recvspace, 1048576, "Default receive socket buffer size"); 79 80static uma_zone_t icl_conn_zone; 81static uma_zone_t icl_pdu_zone; 82 83static volatile u_int icl_ncons; 84 85#define ICL_DEBUG(X, ...) \ 86 if (debug > 1) { \ 87 printf("%s: " X "\n", __func__, ## __VA_ARGS__);\ 88 } while (0) 89 90#define ICL_WARN(X, ...) 
\ 91 if (debug > 0) { \ 92 printf("WARNING: %s: " X "\n", \ 93 __func__, ## __VA_ARGS__); \ 94 } while (0) 95 96#define ICL_CONN_LOCK(X) mtx_lock(X->ic_lock) 97#define ICL_CONN_UNLOCK(X) mtx_unlock(X->ic_lock) 98#define ICL_CONN_LOCK_ASSERT(X) mtx_assert(X->ic_lock, MA_OWNED) 99#define ICL_CONN_LOCK_ASSERT_NOT(X) mtx_assert(X->ic_lock, MA_NOTOWNED) 100 101static void 102icl_conn_fail(struct icl_conn *ic) 103{ 104 if (ic->ic_socket == NULL) 105 return; 106 107 /* 108 * XXX 109 */ 110 ic->ic_socket->so_error = EDOOFUS; 111 (ic->ic_error)(ic); 112} 113 114static struct mbuf * 115icl_conn_receive(struct icl_conn *ic, size_t len) 116{ 117 struct uio uio; 118 struct socket *so; 119 struct mbuf *m; 120 int error, flags; 121 122 so = ic->ic_socket; 123 124 memset(&uio, 0, sizeof(uio)); 125 uio.uio_resid = len; 126 127 flags = MSG_DONTWAIT; 128 error = soreceive(so, NULL, &uio, &m, NULL, &flags); 129 if (error != 0) { 130 ICL_DEBUG("soreceive error %d", error); 131 return (NULL); 132 } 133 if (uio.uio_resid != 0) { 134 m_freem(m); 135 ICL_DEBUG("short read"); 136 return (NULL); 137 } 138 139 return (m); 140} 141 142static struct icl_pdu * 143icl_pdu_new(struct icl_conn *ic, int flags) 144{ 145 struct icl_pdu *ip; 146 147#ifdef DIAGNOSTIC 148 refcount_acquire(&ic->ic_outstanding_pdus); 149#endif 150 ip = uma_zalloc(icl_pdu_zone, flags | M_ZERO); 151 if (ip == NULL) { 152 ICL_WARN("failed to allocate %zd bytes", sizeof(*ip)); 153#ifdef DIAGNOSTIC 154 refcount_release(&ic->ic_outstanding_pdus); 155#endif 156 return (NULL); 157 } 158 159 ip->ip_conn = ic; 160 161 return (ip); 162} 163 164void 165icl_pdu_free(struct icl_pdu *ip) 166{ 167 struct icl_conn *ic; 168 169 ic = ip->ip_conn; 170 171 m_freem(ip->ip_bhs_mbuf); 172 m_freem(ip->ip_ahs_mbuf); 173 m_freem(ip->ip_data_mbuf); 174 uma_zfree(icl_pdu_zone, ip); 175#ifdef DIAGNOSTIC 176 refcount_release(&ic->ic_outstanding_pdus); 177#endif 178} 179 180/* 181 * Allocate icl_pdu with empty BHS to fill up by the caller. 
182 */ 183struct icl_pdu * 184icl_pdu_new_bhs(struct icl_conn *ic, int flags) 185{ 186 struct icl_pdu *ip; 187 188 ip = icl_pdu_new(ic, flags); 189 if (ip == NULL) 190 return (NULL); 191 192 ip->ip_bhs_mbuf = m_getm2(NULL, sizeof(struct iscsi_bhs), 193 flags, MT_DATA, M_PKTHDR); 194 if (ip->ip_bhs_mbuf == NULL) { 195 ICL_WARN("failed to allocate %zd bytes", sizeof(*ip)); 196 icl_pdu_free(ip); 197 return (NULL); 198 } 199 ip->ip_bhs = mtod(ip->ip_bhs_mbuf, struct iscsi_bhs *); 200 memset(ip->ip_bhs, 0, sizeof(struct iscsi_bhs)); 201 ip->ip_bhs_mbuf->m_len = sizeof(struct iscsi_bhs); 202 203 return (ip); 204} 205 206static int 207icl_pdu_ahs_length(const struct icl_pdu *request) 208{ 209 210 return (request->ip_bhs->bhs_total_ahs_len * 4); 211} 212 213size_t 214icl_pdu_data_segment_length(const struct icl_pdu *request) 215{ 216 uint32_t len = 0; 217 218 len += request->ip_bhs->bhs_data_segment_len[0]; 219 len <<= 8; 220 len += request->ip_bhs->bhs_data_segment_len[1]; 221 len <<= 8; 222 len += request->ip_bhs->bhs_data_segment_len[2]; 223 224 return (len); 225} 226 227static void 228icl_pdu_set_data_segment_length(struct icl_pdu *response, uint32_t len) 229{ 230 231 response->ip_bhs->bhs_data_segment_len[2] = len; 232 response->ip_bhs->bhs_data_segment_len[1] = len >> 8; 233 response->ip_bhs->bhs_data_segment_len[0] = len >> 16; 234} 235 236static size_t 237icl_pdu_padding(const struct icl_pdu *ip) 238{ 239 240 if ((ip->ip_data_len % 4) != 0) 241 return (4 - (ip->ip_data_len % 4)); 242 243 return (0); 244} 245 246static size_t 247icl_pdu_size(const struct icl_pdu *response) 248{ 249 size_t len; 250 251 KASSERT(response->ip_ahs_len == 0, ("responding with AHS")); 252 253 len = sizeof(struct iscsi_bhs) + response->ip_data_len + 254 icl_pdu_padding(response); 255 if (response->ip_conn->ic_header_crc32c) 256 len += ISCSI_HEADER_DIGEST_SIZE; 257 if (response->ip_data_len != 0 && response->ip_conn->ic_data_crc32c) 258 len += ISCSI_DATA_DIGEST_SIZE; 259 260 return (len); 
261} 262 263static int 264icl_pdu_receive_bhs(struct icl_pdu *request, size_t *availablep) 265{ 266 struct mbuf *m; 267 268 m = icl_conn_receive(request->ip_conn, sizeof(struct iscsi_bhs)); 269 if (m == NULL) { 270 ICL_DEBUG("failed to receive BHS"); 271 return (-1); 272 } 273 274 request->ip_bhs_mbuf = m_pullup(m, sizeof(struct iscsi_bhs)); 275 if (request->ip_bhs_mbuf == NULL) { 276 ICL_WARN("m_pullup failed"); 277 return (-1); 278 } 279 request->ip_bhs = mtod(request->ip_bhs_mbuf, struct iscsi_bhs *); 280 281 /* 282 * XXX: For architectures with strict alignment requirements 283 * we may need to allocate ip_bhs and copy the data into it. 284 * For some reason, though, not doing this doesn't seem 285 * to cause problems; tested on sparc64. 286 */ 287 288 *availablep -= sizeof(struct iscsi_bhs); 289 return (0); 290} 291 292static int 293icl_pdu_receive_ahs(struct icl_pdu *request, size_t *availablep) 294{ 295 296 request->ip_ahs_len = icl_pdu_ahs_length(request); 297 if (request->ip_ahs_len == 0) 298 return (0); 299 300 request->ip_ahs_mbuf = icl_conn_receive(request->ip_conn, 301 request->ip_ahs_len); 302 if (request->ip_ahs_mbuf == NULL) { 303 ICL_DEBUG("failed to receive AHS"); 304 return (-1); 305 } 306 307 *availablep -= request->ip_ahs_len; 308 return (0); 309} 310 311static uint32_t 312icl_mbuf_to_crc32c(const struct mbuf *m0) 313{ 314 uint32_t digest = 0xffffffff; 315 const struct mbuf *m; 316 317 for (m = m0; m != NULL; m = m->m_next) 318 digest = calculate_crc32c(digest, 319 mtod(m, const void *), m->m_len); 320 321 digest = digest ^ 0xffffffff; 322 323 return (digest); 324} 325 326static int 327icl_pdu_check_header_digest(struct icl_pdu *request, size_t *availablep) 328{ 329 struct mbuf *m; 330 uint32_t received_digest, valid_digest; 331 332 if (request->ip_conn->ic_header_crc32c == false) 333 return (0); 334 335 m = icl_conn_receive(request->ip_conn, ISCSI_HEADER_DIGEST_SIZE); 336 if (m == NULL) { 337 ICL_DEBUG("failed to receive header digest"); 338 
return (-1); 339 } 340 341 CTASSERT(sizeof(received_digest) == ISCSI_HEADER_DIGEST_SIZE); 342 m_copydata(m, 0, ISCSI_HEADER_DIGEST_SIZE, (void *)&received_digest); 343 m_freem(m); 344 345 *availablep -= ISCSI_HEADER_DIGEST_SIZE; 346 347 /* 348 * XXX: Handle AHS. 349 */ 350 valid_digest = icl_mbuf_to_crc32c(request->ip_bhs_mbuf); 351 if (received_digest != valid_digest) { 352 ICL_WARN("header digest check failed; got 0x%x, " 353 "should be 0x%x", received_digest, valid_digest); 354 return (-1); 355 } 356 357 return (0); 358} 359 360/* 361 * Return the number of bytes that should be waiting in the receive socket 362 * before icl_pdu_receive_data_segment() gets called. 363 */ 364static size_t 365icl_pdu_data_segment_receive_len(const struct icl_pdu *request) 366{ 367 size_t len; 368 369 len = icl_pdu_data_segment_length(request); 370 if (len == 0) 371 return (0); 372 373 /* 374 * Account for the parts of data segment already read from 375 * the socket buffer. 376 */ 377 KASSERT(len > request->ip_data_len, ("len <= request->ip_data_len")); 378 len -= request->ip_data_len; 379 380 /* 381 * Don't always wait for the full data segment to be delivered 382 * to the socket; this might badly affect performance due to 383 * TCP window scaling. 384 */ 385 if (len > partial_receive_len) { 386#if 0 387 ICL_DEBUG("need %zd bytes of data, limiting to %zd", 388 len, partial_receive_len)); 389#endif 390 len = partial_receive_len; 391 392 return (len); 393 } 394 395 /* 396 * Account for padding. Note that due to the way code is written, 397 * the icl_pdu_receive_data_segment() must always receive padding 398 * along with the last part of data segment, because it would be 399 * impossible to tell whether we've already received the full data 400 * segment including padding, or without it. 
401 */ 402 if ((len % 4) != 0) 403 len += 4 - (len % 4); 404 405#if 0 406 ICL_DEBUG("need %zd bytes of data", len)); 407#endif 408 409 return (len); 410} 411 412static int 413icl_pdu_receive_data_segment(struct icl_pdu *request, 414 size_t *availablep, bool *more_neededp) 415{ 416 struct icl_conn *ic; 417 size_t len, padding = 0; 418 struct mbuf *m; 419 420 ic = request->ip_conn; 421 422 *more_neededp = false; 423 ic->ic_receive_len = 0; 424 425 len = icl_pdu_data_segment_length(request); 426 if (len == 0) 427 return (0); 428 429 if ((len % 4) != 0) 430 padding = 4 - (len % 4); 431 432 /* 433 * Account for already received parts of data segment. 434 */ 435 KASSERT(len > request->ip_data_len, ("len <= request->ip_data_len")); 436 len -= request->ip_data_len; 437 438 if (len + padding > *availablep) { 439 /* 440 * Not enough data in the socket buffer. Receive as much 441 * as we can. Don't receive padding, since, obviously, it's 442 * not the end of data segment yet. 443 */ 444#if 0 445 ICL_DEBUG("limited from %zd to %zd", 446 len + padding, *availablep - padding)); 447#endif 448 len = *availablep - padding; 449 *more_neededp = true; 450 padding = 0; 451 } 452 453 /* 454 * Must not try to receive padding without at least one byte 455 * of actual data segment. 
456 */ 457 if (len > 0) { 458 m = icl_conn_receive(request->ip_conn, len + padding); 459 if (m == NULL) { 460 ICL_DEBUG("failed to receive data segment"); 461 return (-1); 462 } 463 464 if (request->ip_data_mbuf == NULL) 465 request->ip_data_mbuf = m; 466 else 467 m_cat(request->ip_data_mbuf, m); 468 469 request->ip_data_len += len; 470 *availablep -= len + padding; 471 } else 472 ICL_DEBUG("len 0"); 473 474 if (*more_neededp) 475 ic->ic_receive_len = 476 icl_pdu_data_segment_receive_len(request); 477 478 return (0); 479} 480 481static int 482icl_pdu_check_data_digest(struct icl_pdu *request, size_t *availablep) 483{ 484 struct mbuf *m; 485 uint32_t received_digest, valid_digest; 486 487 if (request->ip_conn->ic_data_crc32c == false) 488 return (0); 489 490 if (request->ip_data_len == 0) 491 return (0); 492 493 m = icl_conn_receive(request->ip_conn, ISCSI_DATA_DIGEST_SIZE); 494 if (m == NULL) { 495 ICL_DEBUG("failed to receive data digest"); 496 return (-1); 497 } 498 499 CTASSERT(sizeof(received_digest) == ISCSI_DATA_DIGEST_SIZE); 500 m_copydata(m, 0, ISCSI_DATA_DIGEST_SIZE, (void *)&received_digest); 501 m_freem(m); 502 503 *availablep -= ISCSI_DATA_DIGEST_SIZE; 504 505 /* 506 * Note that ip_data_mbuf also contains padding; since digest 507 * calculation is supposed to include that, we iterate over 508 * the entire ip_data_mbuf chain, not just ip_data_len bytes of it. 509 */ 510 valid_digest = icl_mbuf_to_crc32c(request->ip_data_mbuf); 511 if (received_digest != valid_digest) { 512 ICL_WARN("data digest check failed; got 0x%x, " 513 "should be 0x%x", received_digest, valid_digest); 514 return (-1); 515 } 516 517 return (0); 518} 519 520/* 521 * Somewhat contrary to the name, this attempts to receive only one 522 * "part" of PDU at a time; call it repeatedly until it returns non-NULL. 
523 */ 524static struct icl_pdu * 525icl_conn_receive_pdu(struct icl_conn *ic, size_t *availablep) 526{ 527 struct icl_pdu *request; 528 struct socket *so; 529 size_t len; 530 int error; 531 bool more_needed; 532 533 so = ic->ic_socket; 534 535 if (ic->ic_receive_state == ICL_CONN_STATE_BHS) { 536 KASSERT(ic->ic_receive_pdu == NULL, 537 ("ic->ic_receive_pdu != NULL")); 538 request = icl_pdu_new(ic, M_NOWAIT); 539 if (request == NULL) { 540 ICL_DEBUG("failed to allocate PDU; " 541 "dropping connection"); 542 icl_conn_fail(ic); 543 return (NULL); 544 } 545 ic->ic_receive_pdu = request; 546 } else { 547 KASSERT(ic->ic_receive_pdu != NULL, 548 ("ic->ic_receive_pdu == NULL")); 549 request = ic->ic_receive_pdu; 550 } 551 552 if (*availablep < ic->ic_receive_len) { 553#if 0 554 ICL_DEBUG("not enough data; need %zd, " 555 "have %zd", ic->ic_receive_len, *availablep); 556#endif 557 return (NULL); 558 } 559 560 switch (ic->ic_receive_state) { 561 case ICL_CONN_STATE_BHS: 562 //ICL_DEBUG("receiving BHS"); 563 error = icl_pdu_receive_bhs(request, availablep); 564 if (error != 0) { 565 ICL_DEBUG("failed to receive BHS; " 566 "dropping connection"); 567 break; 568 } 569 570 /* 571 * We don't enforce any limit for AHS length; 572 * its length is stored in 8 bit field. 
573 */ 574 575 len = icl_pdu_data_segment_length(request); 576 if (len > ic->ic_max_data_segment_length) { 577 ICL_WARN("received data segment " 578 "length %zd is larger than negotiated " 579 "MaxDataSegmentLength %zd; " 580 "dropping connection", 581 len, ic->ic_max_data_segment_length); 582 error = EINVAL; 583 break; 584 } 585 586 ic->ic_receive_state = ICL_CONN_STATE_AHS; 587 ic->ic_receive_len = icl_pdu_ahs_length(request); 588 break; 589 590 case ICL_CONN_STATE_AHS: 591 //ICL_DEBUG("receiving AHS"); 592 error = icl_pdu_receive_ahs(request, availablep); 593 if (error != 0) { 594 ICL_DEBUG("failed to receive AHS; " 595 "dropping connection"); 596 break; 597 } 598 ic->ic_receive_state = ICL_CONN_STATE_HEADER_DIGEST; 599 if (ic->ic_header_crc32c == false) 600 ic->ic_receive_len = 0; 601 else 602 ic->ic_receive_len = ISCSI_HEADER_DIGEST_SIZE; 603 break; 604 605 case ICL_CONN_STATE_HEADER_DIGEST: 606 //ICL_DEBUG("receiving header digest"); 607 error = icl_pdu_check_header_digest(request, availablep); 608 if (error != 0) { 609 ICL_DEBUG("header digest failed; " 610 "dropping connection"); 611 break; 612 } 613 614 ic->ic_receive_state = ICL_CONN_STATE_DATA; 615 ic->ic_receive_len = 616 icl_pdu_data_segment_receive_len(request); 617 break; 618 619 case ICL_CONN_STATE_DATA: 620 //ICL_DEBUG("receiving data segment"); 621 error = icl_pdu_receive_data_segment(request, availablep, 622 &more_needed); 623 if (error != 0) { 624 ICL_DEBUG("failed to receive data segment;" 625 "dropping connection"); 626 break; 627 } 628 629 if (more_needed) 630 break; 631 632 ic->ic_receive_state = ICL_CONN_STATE_DATA_DIGEST; 633 if (request->ip_data_len == 0 || ic->ic_data_crc32c == false) 634 ic->ic_receive_len = 0; 635 else 636 ic->ic_receive_len = ISCSI_DATA_DIGEST_SIZE; 637 break; 638 639 case ICL_CONN_STATE_DATA_DIGEST: 640 //ICL_DEBUG("receiving data digest"); 641 error = icl_pdu_check_data_digest(request, availablep); 642 if (error != 0) { 643 ICL_DEBUG("data digest failed; " 644 
"dropping connection"); 645 break; 646 } 647 648 /* 649 * We've received complete PDU; reset the receive state machine 650 * and return the PDU. 651 */ 652 ic->ic_receive_state = ICL_CONN_STATE_BHS; 653 ic->ic_receive_len = sizeof(struct iscsi_bhs); 654 ic->ic_receive_pdu = NULL; 655 return (request); 656 657 default: 658 panic("invalid ic_receive_state %d\n", ic->ic_receive_state); 659 } 660 661 if (error != 0) { 662 icl_pdu_free(request); 663 icl_conn_fail(ic); 664 } 665 666 return (NULL); 667} 668 669static void 670icl_conn_receive_pdus(struct icl_conn *ic, size_t available) 671{ 672 struct icl_pdu *response; 673 struct socket *so; 674 675 so = ic->ic_socket; 676 677 /* 678 * This can never happen; we're careful to only mess with ic->ic_socket 679 * pointer when the send/receive threads are not running. 680 */ 681 KASSERT(so != NULL, ("NULL socket")); 682 683 for (;;) { 684 if (ic->ic_disconnecting) 685 return; 686 687 if (so->so_error != 0) { 688 ICL_DEBUG("connection error %d; " 689 "dropping connection", so->so_error); 690 icl_conn_fail(ic); 691 return; 692 } 693 694 /* 695 * Loop until we have a complete PDU or there is not enough 696 * data in the socket buffer. 
697 */ 698 if (available < ic->ic_receive_len) { 699#if 0 700 ICL_DEBUG("not enough data; have %zd, " 701 "need %zd", available, 702 ic->ic_receive_len); 703#endif 704 return; 705 } 706 707 response = icl_conn_receive_pdu(ic, &available); 708 if (response == NULL) 709 continue; 710 711 if (response->ip_ahs_len > 0) { 712 ICL_WARN("received PDU with unsupported " 713 "AHS; opcode 0x%x; dropping connection", 714 response->ip_bhs->bhs_opcode); 715 icl_pdu_free(response); 716 icl_conn_fail(ic); 717 return; 718 } 719 720 (ic->ic_receive)(response); 721 } 722} 723 724static void 725icl_receive_thread(void *arg) 726{ 727 struct icl_conn *ic; 728 size_t available; 729 struct socket *so; 730 731 ic = arg; 732 so = ic->ic_socket; 733 734 ICL_CONN_LOCK(ic); 735 ic->ic_receive_running = true; 736 ICL_CONN_UNLOCK(ic); 737 738 for (;;) { 739 if (ic->ic_disconnecting) { 740 //ICL_DEBUG("terminating"); 741 break; 742 } 743 744 SOCKBUF_LOCK(&so->so_rcv); 745 available = so->so_rcv.sb_cc; 746 if (available < ic->ic_receive_len) { 747 so->so_rcv.sb_lowat = ic->ic_receive_len; 748 cv_wait(&ic->ic_receive_cv, &so->so_rcv.sb_mtx); 749 } 750 SOCKBUF_UNLOCK(&so->so_rcv); 751 752 icl_conn_receive_pdus(ic, available); 753 } 754 755 ICL_CONN_LOCK(ic); 756 ic->ic_receive_running = false; 757 ICL_CONN_UNLOCK(ic); 758 kthread_exit(); 759} 760 761static int 762icl_soupcall_receive(struct socket *so, void *arg, int waitflag) 763{ 764 struct icl_conn *ic; 765 766 ic = arg; 767 cv_signal(&ic->ic_receive_cv); 768 return (SU_OK); 769} 770 771static int 772icl_pdu_send(struct icl_pdu *request) 773{ 774 size_t padding, pdu_len; 775 uint32_t digest, zero = 0; 776 int error, ok; 777 struct socket *so; 778 struct icl_conn *ic; 779 780 ic = request->ip_conn; 781 so = request->ip_conn->ic_socket; 782 783 ICL_CONN_LOCK_ASSERT(ic); 784 785 icl_pdu_set_data_segment_length(request, request->ip_data_len); 786 787 pdu_len = icl_pdu_size(request); 788 789 if (ic->ic_header_crc32c) { 790 digest = 
icl_mbuf_to_crc32c(request->ip_bhs_mbuf); 791 ok = m_append(request->ip_bhs_mbuf, sizeof(digest), 792 (void *)&digest); 793 if (ok != 1) { 794 ICL_WARN("failed to append header digest"); 795 return (1); 796 } 797 } 798 799 if (request->ip_data_len != 0) { 800 padding = icl_pdu_padding(request); 801 if (padding > 0) { 802 ok = m_append(request->ip_data_mbuf, padding, 803 (void *)&zero); 804 if (ok != 1) { 805 ICL_WARN("failed to append padding"); 806 return (1); 807 } 808 } 809 810 if (ic->ic_data_crc32c) { 811 digest = icl_mbuf_to_crc32c(request->ip_data_mbuf); 812 813 ok = m_append(request->ip_data_mbuf, sizeof(digest), 814 (void *)&digest); 815 if (ok != 1) { 816 ICL_WARN("failed to append header digest"); 817 return (1); 818 } 819 } 820 821 m_cat(request->ip_bhs_mbuf, request->ip_data_mbuf); 822 request->ip_data_mbuf = NULL; 823 } 824 825 request->ip_bhs_mbuf->m_pkthdr.len = pdu_len; 826 827 error = sosend(so, NULL, NULL, request->ip_bhs_mbuf, 828 NULL, MSG_DONTWAIT, curthread); 829 request->ip_bhs_mbuf = NULL; /* Sosend consumes the mbuf. */ 830 if (error != 0) { 831 ICL_DEBUG("sosend error %d", error); 832 return (error); 833 } 834 835 return (0); 836} 837 838static void 839icl_conn_send_pdus(struct icl_conn *ic) 840{ 841 struct icl_pdu *request; 842 struct socket *so; 843 size_t available, size; 844 int error; 845 846 ICL_CONN_LOCK_ASSERT(ic); 847 848 so = ic->ic_socket; 849 850 SOCKBUF_LOCK(&so->so_snd); 851 available = sbspace(&so->so_snd); 852 SOCKBUF_UNLOCK(&so->so_snd); 853 854 while (!STAILQ_EMPTY(&ic->ic_to_send)) { 855 if (ic->ic_disconnecting) 856 return; 857 858 request = STAILQ_FIRST(&ic->ic_to_send); 859 size = icl_pdu_size(request); 860 if (available < size) { 861 /* 862 * Set the low watermark on the socket, 863 * to avoid waking up until there is enough 864 * space. 
865 */ 866 SOCKBUF_LOCK(&so->so_snd); 867 so->so_snd.sb_lowat = size; 868 SOCKBUF_UNLOCK(&so->so_snd); 869#if 1 870 ICL_DEBUG("no space to send; " 871 "have %zd, need %zd", 872 available, size); 873#endif 874 return; 875 } 876 available -= size; 877 STAILQ_REMOVE_HEAD(&ic->ic_to_send, ip_next); 878 error = icl_pdu_send(request); 879 if (error != 0) { 880 ICL_DEBUG("failed to send PDU; " 881 "dropping connection"); 882 icl_conn_fail(ic); 883 return; 884 } 885 icl_pdu_free(request); 886 } 887} 888 889static void 890icl_send_thread(void *arg) 891{ 892 struct icl_conn *ic; 893 894 ic = arg; 895 896 ICL_CONN_LOCK(ic); 897 ic->ic_send_running = true; 898 899 for (;;) { 900 if (ic->ic_disconnecting) { 901 //ICL_DEBUG("terminating"); 902 break; 903 } 904 icl_conn_send_pdus(ic); 905 cv_wait(&ic->ic_send_cv, ic->ic_lock); 906 } 907 908 ic->ic_send_running = false; 909 ICL_CONN_UNLOCK(ic); 910 kthread_exit(); 911} 912 913static int 914icl_soupcall_send(struct socket *so, void *arg, int waitflag) 915{ 916 struct icl_conn *ic; 917 918 ic = arg; 919 cv_signal(&ic->ic_send_cv); 920 return (SU_OK); 921} 922 923int 924icl_pdu_append_data(struct icl_pdu *request, const void *addr, size_t len, int flags) 925{ 926 struct mbuf *mb, *newmb; 927 size_t copylen, off = 0; 928 929 KASSERT(len > 0, ("len == 0")); 930 931 newmb = m_getm2(NULL, len, flags, MT_DATA, M_PKTHDR); 932 if (newmb == NULL) { 933 ICL_WARN("failed to allocate mbuf for %zd bytes", len); 934 return (ENOMEM); 935 } 936 937 for (mb = newmb; mb != NULL; mb = mb->m_next) { 938 copylen = min(M_TRAILINGSPACE(mb), len - off); 939 memcpy(mtod(mb, char *), (const char *)addr + off, copylen); 940 mb->m_len = copylen; 941 off += copylen; 942 } 943 KASSERT(off == len, ("%s: off != len", __func__)); 944 945 if (request->ip_data_mbuf == NULL) { 946 request->ip_data_mbuf = newmb; 947 request->ip_data_len = len; 948 } else { 949 m_cat(request->ip_data_mbuf, newmb); 950 request->ip_data_len += len; 951 } 952 953 return (0); 954} 955 
956void 957icl_pdu_get_data(struct icl_pdu *ip, size_t off, void *addr, size_t len) 958{ 959 960 m_copydata(ip->ip_data_mbuf, off, len, addr); 961} 962 963void 964icl_pdu_queue(struct icl_pdu *ip) 965{ 966 struct icl_conn *ic; 967 968 ic = ip->ip_conn; 969 970 ICL_CONN_LOCK_ASSERT(ic); 971 972 if (ic->ic_disconnecting || ic->ic_socket == NULL) { 973 ICL_DEBUG("icl_pdu_queue on closed connection"); 974 icl_pdu_free(ip); 975 return; 976 } 977 STAILQ_INSERT_TAIL(&ic->ic_to_send, ip, ip_next); 978 cv_signal(&ic->ic_send_cv); 979} 980 981struct icl_conn * 982icl_conn_new(const char *name, struct mtx *lock) 983{ 984 struct icl_conn *ic; 985 986 refcount_acquire(&icl_ncons); 987 988 ic = uma_zalloc(icl_conn_zone, M_WAITOK | M_ZERO); 989 990 STAILQ_INIT(&ic->ic_to_send); 991 ic->ic_lock = lock; 992 cv_init(&ic->ic_send_cv, "icl_tx"); 993 cv_init(&ic->ic_receive_cv, "icl_rx"); 994#ifdef DIAGNOSTIC 995 refcount_init(&ic->ic_outstanding_pdus, 0); 996#endif 997 ic->ic_max_data_segment_length = ICL_MAX_DATA_SEGMENT_LENGTH; 998 ic->ic_name = name; 999 1000 return (ic); 1001} 1002 1003void 1004icl_conn_free(struct icl_conn *ic) 1005{ 1006 1007 cv_destroy(&ic->ic_send_cv); 1008 cv_destroy(&ic->ic_receive_cv); 1009 uma_zfree(icl_conn_zone, ic); 1010 refcount_release(&icl_ncons); 1011} 1012 1013static int 1014icl_conn_start(struct icl_conn *ic) 1015{ 1016 size_t minspace; 1017 struct sockopt opt; 1018 int error, one = 1; 1019 1020 ICL_CONN_LOCK(ic); 1021 1022 /* 1023 * XXX: Ugly hack. 1024 */ 1025 if (ic->ic_socket == NULL) { 1026 ICL_CONN_UNLOCK(ic); 1027 return (EINVAL); 1028 } 1029 1030 ic->ic_receive_state = ICL_CONN_STATE_BHS; 1031 ic->ic_receive_len = sizeof(struct iscsi_bhs); 1032 ic->ic_disconnecting = false; 1033 1034 ICL_CONN_UNLOCK(ic); 1035 1036 /* 1037 * For sendspace, this is required because the current code cannot 1038 * send a PDU in pieces; thus, the minimum buffer size is equal 1039 * to the maximum PDU size. "+4" is to account for possible padding. 
1040 * 1041 * What we should actually do here is to use autoscaling, but set 1042 * some minimal buffer size to "minspace". I don't know a way to do 1043 * that, though. 1044 */ 1045 minspace = sizeof(struct iscsi_bhs) + ic->ic_max_data_segment_length + 1046 ISCSI_HEADER_DIGEST_SIZE + ISCSI_DATA_DIGEST_SIZE + 4; 1047 if (sendspace < minspace) { 1048 ICL_WARN("kern.icl.sendspace too low; must be at least %zd", 1049 minspace); 1050 sendspace = minspace; 1051 } 1052 if (recvspace < minspace) { 1053 ICL_WARN("kern.icl.recvspace too low; must be at least %zd", 1054 minspace); 1055 recvspace = minspace; 1056 } 1057 1058 error = soreserve(ic->ic_socket, sendspace, recvspace); 1059 if (error != 0) { 1060 ICL_WARN("soreserve failed with error %d", error); 1061 icl_conn_close(ic); 1062 return (error); 1063 } 1064 1065 /* 1066 * Disable Nagle. 1067 */ 1068 bzero(&opt, sizeof(opt)); 1069 opt.sopt_dir = SOPT_SET; 1070 opt.sopt_level = IPPROTO_TCP; 1071 opt.sopt_name = TCP_NODELAY; 1072 opt.sopt_val = &one; 1073 opt.sopt_valsize = sizeof(one); 1074 error = sosetopt(ic->ic_socket, &opt); 1075 if (error != 0) { 1076 ICL_WARN("disabling TCP_NODELAY failed with error %d", error); 1077 icl_conn_close(ic); 1078 return (error); 1079 } 1080 1081 /* 1082 * Start threads. 1083 */ 1084 error = kthread_add(icl_send_thread, ic, NULL, NULL, 0, 0, "%stx", 1085 ic->ic_name); 1086 if (error != 0) { 1087 ICL_WARN("kthread_add(9) failed with error %d", error); 1088 icl_conn_close(ic); 1089 return (error); 1090 } 1091 1092 error = kthread_add(icl_receive_thread, ic, NULL, NULL, 0, 0, "%srx", 1093 ic->ic_name); 1094 if (error != 0) { 1095 ICL_WARN("kthread_add(9) failed with error %d", error); 1096 icl_conn_close(ic); 1097 return (error); 1098 } 1099 1100 /* 1101 * Register socket upcall, to get notified about incoming PDUs 1102 * and free space to send outgoing ones. 
1103 */ 1104 SOCKBUF_LOCK(&ic->ic_socket->so_snd); 1105 soupcall_set(ic->ic_socket, SO_SND, icl_soupcall_send, ic); 1106 SOCKBUF_UNLOCK(&ic->ic_socket->so_snd); 1107 SOCKBUF_LOCK(&ic->ic_socket->so_rcv); 1108 soupcall_set(ic->ic_socket, SO_RCV, icl_soupcall_receive, ic); 1109 SOCKBUF_UNLOCK(&ic->ic_socket->so_rcv); 1110 1111 return (0); 1112} 1113 1114int 1115icl_conn_handoff(struct icl_conn *ic, int fd) 1116{ 1117 struct file *fp; 1118 struct socket *so; 1119 cap_rights_t rights; 1120 int error; 1121 1122 ICL_CONN_LOCK_ASSERT_NOT(ic); 1123 1124 /* 1125 * Steal the socket from userland. 1126 */ 1127 error = fget(curthread, fd, 1128 cap_rights_init(&rights, CAP_SOCK_CLIENT), &fp); 1129 if (error != 0) 1130 return (error); 1131 if (fp->f_type != DTYPE_SOCKET) { 1132 fdrop(fp, curthread); 1133 return (EINVAL); 1134 } 1135 so = fp->f_data; 1136 if (so->so_type != SOCK_STREAM) { 1137 fdrop(fp, curthread); 1138 return (EINVAL); 1139 } 1140 1141 ICL_CONN_LOCK(ic); 1142 1143 if (ic->ic_socket != NULL) { 1144 ICL_CONN_UNLOCK(ic); 1145 fdrop(fp, curthread); 1146 return (EBUSY); 1147 } 1148 1149 ic->ic_socket = fp->f_data; 1150 fp->f_ops = &badfileops; 1151 fp->f_data = NULL; 1152 fdrop(fp, curthread); 1153 ICL_CONN_UNLOCK(ic); 1154 1155 error = icl_conn_start(ic); 1156 1157 return (error); 1158} 1159 1160void 1161icl_conn_shutdown(struct icl_conn *ic) 1162{ 1163 ICL_CONN_LOCK_ASSERT_NOT(ic); 1164 1165 ICL_CONN_LOCK(ic); 1166 if (ic->ic_socket == NULL) { 1167 ICL_CONN_UNLOCK(ic); 1168 return; 1169 } 1170 ICL_CONN_UNLOCK(ic); 1171 1172 soshutdown(ic->ic_socket, SHUT_RDWR); 1173} 1174 1175void 1176icl_conn_close(struct icl_conn *ic) 1177{ 1178 struct icl_pdu *pdu; 1179 1180 ICL_CONN_LOCK_ASSERT_NOT(ic); 1181 1182 ICL_CONN_LOCK(ic); 1183 if (ic->ic_socket == NULL) { 1184 ICL_CONN_UNLOCK(ic); 1185 return; 1186 } 1187 1188 ic->ic_disconnecting = true; 1189 1190 /* 1191 * Wake up the threads, so they can properly terminate. 
1192 */ 1193 cv_signal(&ic->ic_receive_cv); 1194 cv_signal(&ic->ic_send_cv); 1195 while (ic->ic_receive_running || ic->ic_send_running) { 1196 //ICL_DEBUG("waiting for send/receive threads to terminate"); 1197 ICL_CONN_UNLOCK(ic); 1198 cv_signal(&ic->ic_receive_cv); 1199 cv_signal(&ic->ic_send_cv); 1200 pause("icl_close", 1 * hz); 1201 ICL_CONN_LOCK(ic); 1202 } 1203 //ICL_DEBUG("send/receive threads terminated"); 1204 1205 soclose(ic->ic_socket); 1206 ic->ic_socket = NULL; 1207 1208 if (ic->ic_receive_pdu != NULL) { 1209 //ICL_DEBUG("freeing partially received PDU"); 1210 icl_pdu_free(ic->ic_receive_pdu); 1211 ic->ic_receive_pdu = NULL; 1212 } 1213 1214 /* 1215 * Remove any outstanding PDUs from the send queue. 1216 */ 1217 while (!STAILQ_EMPTY(&ic->ic_to_send)) { 1218 pdu = STAILQ_FIRST(&ic->ic_to_send); 1219 STAILQ_REMOVE_HEAD(&ic->ic_to_send, ip_next); 1220 icl_pdu_free(pdu); 1221 } 1222 1223 KASSERT(STAILQ_EMPTY(&ic->ic_to_send), 1224 ("destroying session with non-empty send queue")); 1225#ifdef DIAGNOSTIC 1226 KASSERT(ic->ic_outstanding_pdus == 0, 1227 ("destroying session with %d outstanding PDUs", 1228 ic->ic_outstanding_pdus)); 1229#endif 1230 ICL_CONN_UNLOCK(ic); 1231} 1232 1233bool 1234icl_conn_connected(struct icl_conn *ic) 1235{ 1236 ICL_CONN_LOCK_ASSERT_NOT(ic); 1237 1238 ICL_CONN_LOCK(ic); 1239 if (ic->ic_socket == NULL) { 1240 ICL_CONN_UNLOCK(ic); 1241 return (false); 1242 } 1243 if (ic->ic_socket->so_error != 0) { 1244 ICL_CONN_UNLOCK(ic); 1245 return (false); 1246 } 1247 ICL_CONN_UNLOCK(ic); 1248 return (true); 1249} 1250 1251#ifdef ICL_KERNEL_PROXY 1252int 1253icl_conn_handoff_sock(struct icl_conn *ic, struct socket *so) 1254{ 1255 int error; 1256 1257 ICL_CONN_LOCK_ASSERT_NOT(ic); 1258 1259 if (so->so_type != SOCK_STREAM) 1260 return (EINVAL); 1261 1262 ICL_CONN_LOCK(ic); 1263 if (ic->ic_socket != NULL) { 1264 ICL_CONN_UNLOCK(ic); 1265 return (EBUSY); 1266 } 1267 ic->ic_socket = so; 1268 ICL_CONN_UNLOCK(ic); 1269 1270 error = icl_conn_start(ic); 
1271 1272 return (error); 1273} 1274#endif /* ICL_KERNEL_PROXY */ 1275 1276static int 1277icl_unload(void) 1278{ 1279 1280 if (icl_ncons != 0) 1281 return (EBUSY); 1282 1283 uma_zdestroy(icl_conn_zone); 1284 uma_zdestroy(icl_pdu_zone); 1285 1286 return (0); 1287} 1288 1289static void 1290icl_load(void) 1291{ 1292 1293 icl_conn_zone = uma_zcreate("icl_conn", 1294 sizeof(struct icl_conn), NULL, NULL, NULL, NULL, 1295 UMA_ALIGN_PTR, 0); 1296 icl_pdu_zone = uma_zcreate("icl_pdu", 1297 sizeof(struct icl_pdu), NULL, NULL, NULL, NULL, 1298 UMA_ALIGN_PTR, 0); 1299 1300 refcount_init(&icl_ncons, 0); 1301} 1302 1303static int 1304icl_modevent(module_t mod, int what, void *arg) 1305{ 1306 1307 switch (what) { 1308 case MOD_LOAD: 1309 icl_load(); 1310 return (0); 1311 case MOD_UNLOAD: 1312 return (icl_unload()); 1313 default: 1314 return (EINVAL); 1315 } 1316} 1317 1318moduledata_t icl_data = { 1319 "icl", 1320 icl_modevent, 1321 0 1322}; 1323 1324DECLARE_MODULE(icl, icl_data, SI_SUB_DRIVERS, SI_ORDER_FIRST); 1325MODULE_VERSION(icl, 1); 1326