icl.c revision 265496
/*-
 * Copyright (c) 2012 The FreeBSD Foundation
 * All rights reserved.
 *
 * This software was developed by Edward Tomasz Napierala under sponsorship
 * from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: stable/10/sys/dev/iscsi/icl.c 265496 2014-05-07 06:31:45Z trasz $
 */

/*
 * iSCSI Common Layer.  It's used by both the initiator and target to send
 * and receive iSCSI PDUs.
35 */ 36 37#include <sys/param.h> 38#include <sys/capability.h> 39#include <sys/condvar.h> 40#include <sys/conf.h> 41#include <sys/file.h> 42#include <sys/kernel.h> 43#include <sys/kthread.h> 44#include <sys/lock.h> 45#include <sys/mbuf.h> 46#include <sys/mutex.h> 47#include <sys/module.h> 48#include <sys/socket.h> 49#include <sys/socketvar.h> 50#include <sys/sysctl.h> 51#include <sys/systm.h> 52#include <sys/sx.h> 53#include <sys/uio.h> 54#include <vm/uma.h> 55#include <netinet/in.h> 56#include <netinet/tcp.h> 57 58#include "icl.h" 59#include "iscsi_proto.h" 60 61SYSCTL_NODE(_kern, OID_AUTO, icl, CTLFLAG_RD, 0, "iSCSI Common Layer"); 62static int debug = 1; 63TUNABLE_INT("kern.icl.debug", &debug); 64SYSCTL_INT(_kern_icl, OID_AUTO, debug, CTLFLAG_RW, 65 &debug, 1, "Enable debug messages"); 66static int partial_receive_len = 1 * 1024; /* XXX: More? */ 67TUNABLE_INT("kern.icl.partial_receive_len", &partial_receive_len); 68SYSCTL_INT(_kern_icl, OID_AUTO, partial_receive_len, CTLFLAG_RW, 69 &partial_receive_len, 1 * 1024, "Minimum read size for partially received " 70 "data segment"); 71 72static uma_zone_t icl_conn_zone; 73static uma_zone_t icl_pdu_zone; 74 75static volatile u_int icl_ncons; 76 77#define ICL_DEBUG(X, ...) \ 78 if (debug > 1) { \ 79 printf("%s: " X "\n", __func__, ## __VA_ARGS__);\ 80 } while (0) 81 82#define ICL_WARN(X, ...) 
\ 83 if (debug > 0) { \ 84 printf("WARNING: %s: " X "\n", \ 85 __func__, ## __VA_ARGS__); \ 86 } while (0) 87 88#define ICL_CONN_LOCK(X) mtx_lock(X->ic_lock) 89#define ICL_CONN_UNLOCK(X) mtx_unlock(X->ic_lock) 90#define ICL_CONN_LOCK_ASSERT(X) mtx_assert(X->ic_lock, MA_OWNED) 91#define ICL_CONN_LOCK_ASSERT_NOT(X) mtx_assert(X->ic_lock, MA_NOTOWNED) 92 93static void 94icl_conn_fail(struct icl_conn *ic) 95{ 96 if (ic->ic_socket == NULL) 97 return; 98 99 /* 100 * XXX 101 */ 102 ic->ic_socket->so_error = EDOOFUS; 103 (ic->ic_error)(ic); 104} 105 106static struct mbuf * 107icl_conn_receive(struct icl_conn *ic, size_t len) 108{ 109 struct uio uio; 110 struct socket *so; 111 struct mbuf *m; 112 int error, flags; 113 114 so = ic->ic_socket; 115 116 memset(&uio, 0, sizeof(uio)); 117 uio.uio_resid = len; 118 119 flags = MSG_DONTWAIT; 120 error = soreceive(so, NULL, &uio, &m, NULL, &flags); 121 if (error != 0) { 122 ICL_DEBUG("soreceive error %d", error); 123 return (NULL); 124 } 125 if (uio.uio_resid != 0) { 126 m_freem(m); 127 ICL_DEBUG("short read"); 128 return (NULL); 129 } 130 131 return (m); 132} 133 134static struct icl_pdu * 135icl_pdu_new(struct icl_conn *ic, int flags) 136{ 137 struct icl_pdu *ip; 138 139#ifdef DIAGNOSTIC 140 refcount_acquire(&ic->ic_outstanding_pdus); 141#endif 142 ip = uma_zalloc(icl_pdu_zone, flags | M_ZERO); 143 if (ip == NULL) { 144 ICL_WARN("failed to allocate %zd bytes", sizeof(*ip)); 145#ifdef DIAGNOSTIC 146 refcount_release(&ic->ic_outstanding_pdus); 147#endif 148 return (NULL); 149 } 150 151 ip->ip_conn = ic; 152 153 return (ip); 154} 155 156void 157icl_pdu_free(struct icl_pdu *ip) 158{ 159 struct icl_conn *ic; 160 161 ic = ip->ip_conn; 162 163 m_freem(ip->ip_bhs_mbuf); 164 m_freem(ip->ip_ahs_mbuf); 165 m_freem(ip->ip_data_mbuf); 166 uma_zfree(icl_pdu_zone, ip); 167#ifdef DIAGNOSTIC 168 refcount_release(&ic->ic_outstanding_pdus); 169#endif 170} 171 172/* 173 * Allocate icl_pdu with empty BHS to fill up by the caller. 
174 */ 175struct icl_pdu * 176icl_pdu_new_bhs(struct icl_conn *ic, int flags) 177{ 178 struct icl_pdu *ip; 179 180 ip = icl_pdu_new(ic, flags); 181 if (ip == NULL) 182 return (NULL); 183 184 ip->ip_bhs_mbuf = m_getm2(NULL, sizeof(struct iscsi_bhs), 185 flags, MT_DATA, M_PKTHDR); 186 if (ip->ip_bhs_mbuf == NULL) { 187 ICL_WARN("failed to allocate %zd bytes", sizeof(*ip)); 188 icl_pdu_free(ip); 189 return (NULL); 190 } 191 ip->ip_bhs = mtod(ip->ip_bhs_mbuf, struct iscsi_bhs *); 192 memset(ip->ip_bhs, 0, sizeof(struct iscsi_bhs)); 193 ip->ip_bhs_mbuf->m_len = sizeof(struct iscsi_bhs); 194 195 return (ip); 196} 197 198static int 199icl_pdu_ahs_length(const struct icl_pdu *request) 200{ 201 202 return (request->ip_bhs->bhs_total_ahs_len * 4); 203} 204 205size_t 206icl_pdu_data_segment_length(const struct icl_pdu *request) 207{ 208 uint32_t len = 0; 209 210 len += request->ip_bhs->bhs_data_segment_len[0]; 211 len <<= 8; 212 len += request->ip_bhs->bhs_data_segment_len[1]; 213 len <<= 8; 214 len += request->ip_bhs->bhs_data_segment_len[2]; 215 216 return (len); 217} 218 219static void 220icl_pdu_set_data_segment_length(struct icl_pdu *response, uint32_t len) 221{ 222 223 response->ip_bhs->bhs_data_segment_len[2] = len; 224 response->ip_bhs->bhs_data_segment_len[1] = len >> 8; 225 response->ip_bhs->bhs_data_segment_len[0] = len >> 16; 226} 227 228static size_t 229icl_pdu_padding(const struct icl_pdu *ip) 230{ 231 232 if ((ip->ip_data_len % 4) != 0) 233 return (4 - (ip->ip_data_len % 4)); 234 235 return (0); 236} 237 238static size_t 239icl_pdu_size(const struct icl_pdu *response) 240{ 241 size_t len; 242 243 KASSERT(response->ip_ahs_len == 0, ("responding with AHS")); 244 245 len = sizeof(struct iscsi_bhs) + response->ip_data_len + 246 icl_pdu_padding(response); 247 if (response->ip_conn->ic_header_crc32c) 248 len += ISCSI_HEADER_DIGEST_SIZE; 249 if (response->ip_data_len != 0 && response->ip_conn->ic_data_crc32c) 250 len += ISCSI_DATA_DIGEST_SIZE; 251 252 return (len); 
253} 254 255static int 256icl_pdu_receive_bhs(struct icl_pdu *request, size_t *availablep) 257{ 258 struct mbuf *m; 259 260 m = icl_conn_receive(request->ip_conn, sizeof(struct iscsi_bhs)); 261 if (m == NULL) { 262 ICL_DEBUG("failed to receive BHS"); 263 return (-1); 264 } 265 266 request->ip_bhs_mbuf = m_pullup(m, sizeof(struct iscsi_bhs)); 267 if (request->ip_bhs_mbuf == NULL) { 268 ICL_WARN("m_pullup failed"); 269 return (-1); 270 } 271 request->ip_bhs = mtod(request->ip_bhs_mbuf, struct iscsi_bhs *); 272 273 /* 274 * XXX: For architectures with strict alignment requirements 275 * we may need to allocate ip_bhs and copy the data into it. 276 * For some reason, though, not doing this doesn't seem 277 * to cause problems; tested on sparc64. 278 */ 279 280 *availablep -= sizeof(struct iscsi_bhs); 281 return (0); 282} 283 284static int 285icl_pdu_receive_ahs(struct icl_pdu *request, size_t *availablep) 286{ 287 288 request->ip_ahs_len = icl_pdu_ahs_length(request); 289 if (request->ip_ahs_len == 0) 290 return (0); 291 292 request->ip_ahs_mbuf = icl_conn_receive(request->ip_conn, 293 request->ip_ahs_len); 294 if (request->ip_ahs_mbuf == NULL) { 295 ICL_DEBUG("failed to receive AHS"); 296 return (-1); 297 } 298 299 *availablep -= request->ip_ahs_len; 300 return (0); 301} 302 303static uint32_t 304icl_mbuf_to_crc32c(const struct mbuf *m0) 305{ 306 uint32_t digest = 0xffffffff; 307 const struct mbuf *m; 308 309 for (m = m0; m != NULL; m = m->m_next) 310 digest = calculate_crc32c(digest, 311 mtod(m, const void *), m->m_len); 312 313 digest = digest ^ 0xffffffff; 314 315 return (digest); 316} 317 318static int 319icl_pdu_check_header_digest(struct icl_pdu *request, size_t *availablep) 320{ 321 struct mbuf *m; 322 uint32_t received_digest, valid_digest; 323 324 if (request->ip_conn->ic_header_crc32c == false) 325 return (0); 326 327 m = icl_conn_receive(request->ip_conn, ISCSI_HEADER_DIGEST_SIZE); 328 if (m == NULL) { 329 ICL_DEBUG("failed to receive header digest"); 330 
return (-1); 331 } 332 333 CTASSERT(sizeof(received_digest) == ISCSI_HEADER_DIGEST_SIZE); 334 m_copydata(m, 0, ISCSI_HEADER_DIGEST_SIZE, (void *)&received_digest); 335 m_freem(m); 336 337 *availablep -= ISCSI_HEADER_DIGEST_SIZE; 338 339 /* 340 * XXX: Handle AHS. 341 */ 342 valid_digest = icl_mbuf_to_crc32c(request->ip_bhs_mbuf); 343 if (received_digest != valid_digest) { 344 ICL_WARN("header digest check failed; got 0x%x, " 345 "should be 0x%x", received_digest, valid_digest); 346 return (-1); 347 } 348 349 return (0); 350} 351 352/* 353 * Return the number of bytes that should be waiting in the receive socket 354 * before icl_pdu_receive_data_segment() gets called. 355 */ 356static size_t 357icl_pdu_data_segment_receive_len(const struct icl_pdu *request) 358{ 359 size_t len; 360 361 len = icl_pdu_data_segment_length(request); 362 if (len == 0) 363 return (0); 364 365 /* 366 * Account for the parts of data segment already read from 367 * the socket buffer. 368 */ 369 KASSERT(len > request->ip_data_len, ("len <= request->ip_data_len")); 370 len -= request->ip_data_len; 371 372 /* 373 * Don't always wait for the full data segment to be delivered 374 * to the socket; this might badly affect performance due to 375 * TCP window scaling. 376 */ 377 if (len > partial_receive_len) { 378#if 0 379 ICL_DEBUG("need %zd bytes of data, limiting to %zd", 380 len, partial_receive_len)); 381#endif 382 len = partial_receive_len; 383 384 return (len); 385 } 386 387 /* 388 * Account for padding. Note that due to the way code is written, 389 * the icl_pdu_receive_data_segment() must always receive padding 390 * along with the last part of data segment, because it would be 391 * impossible to tell whether we've already received the full data 392 * segment including padding, or without it. 
393 */ 394 if ((len % 4) != 0) 395 len += 4 - (len % 4); 396 397#if 0 398 ICL_DEBUG("need %zd bytes of data", len)); 399#endif 400 401 return (len); 402} 403 404static int 405icl_pdu_receive_data_segment(struct icl_pdu *request, 406 size_t *availablep, bool *more_neededp) 407{ 408 struct icl_conn *ic; 409 size_t len, padding = 0; 410 struct mbuf *m; 411 412 ic = request->ip_conn; 413 414 *more_neededp = false; 415 ic->ic_receive_len = 0; 416 417 len = icl_pdu_data_segment_length(request); 418 if (len == 0) 419 return (0); 420 421 if ((len % 4) != 0) 422 padding = 4 - (len % 4); 423 424 /* 425 * Account for already received parts of data segment. 426 */ 427 KASSERT(len > request->ip_data_len, ("len <= request->ip_data_len")); 428 len -= request->ip_data_len; 429 430 if (len + padding > *availablep) { 431 /* 432 * Not enough data in the socket buffer. Receive as much 433 * as we can. Don't receive padding, since, obviously, it's 434 * not the end of data segment yet. 435 */ 436#if 0 437 ICL_DEBUG("limited from %zd to %zd", 438 len + padding, *availablep - padding)); 439#endif 440 len = *availablep - padding; 441 *more_neededp = true; 442 padding = 0; 443 } 444 445 /* 446 * Must not try to receive padding without at least one byte 447 * of actual data segment. 
448 */ 449 if (len > 0) { 450 m = icl_conn_receive(request->ip_conn, len + padding); 451 if (m == NULL) { 452 ICL_DEBUG("failed to receive data segment"); 453 return (-1); 454 } 455 456 if (request->ip_data_mbuf == NULL) 457 request->ip_data_mbuf = m; 458 else 459 m_cat(request->ip_data_mbuf, m); 460 461 request->ip_data_len += len; 462 *availablep -= len + padding; 463 } else 464 ICL_DEBUG("len 0"); 465 466 if (*more_neededp) 467 ic->ic_receive_len = 468 icl_pdu_data_segment_receive_len(request); 469 470 return (0); 471} 472 473static int 474icl_pdu_check_data_digest(struct icl_pdu *request, size_t *availablep) 475{ 476 struct mbuf *m; 477 uint32_t received_digest, valid_digest; 478 479 if (request->ip_conn->ic_data_crc32c == false) 480 return (0); 481 482 if (request->ip_data_len == 0) 483 return (0); 484 485 m = icl_conn_receive(request->ip_conn, ISCSI_DATA_DIGEST_SIZE); 486 if (m == NULL) { 487 ICL_DEBUG("failed to receive data digest"); 488 return (-1); 489 } 490 491 CTASSERT(sizeof(received_digest) == ISCSI_DATA_DIGEST_SIZE); 492 m_copydata(m, 0, ISCSI_DATA_DIGEST_SIZE, (void *)&received_digest); 493 m_freem(m); 494 495 *availablep -= ISCSI_DATA_DIGEST_SIZE; 496 497 /* 498 * Note that ip_data_mbuf also contains padding; since digest 499 * calculation is supposed to include that, we iterate over 500 * the entire ip_data_mbuf chain, not just ip_data_len bytes of it. 501 */ 502 valid_digest = icl_mbuf_to_crc32c(request->ip_data_mbuf); 503 if (received_digest != valid_digest) { 504 ICL_WARN("data digest check failed; got 0x%x, " 505 "should be 0x%x", received_digest, valid_digest); 506 return (-1); 507 } 508 509 return (0); 510} 511 512/* 513 * Somewhat contrary to the name, this attempts to receive only one 514 * "part" of PDU at a time; call it repeatedly until it returns non-NULL. 
515 */ 516static struct icl_pdu * 517icl_conn_receive_pdu(struct icl_conn *ic, size_t *availablep) 518{ 519 struct icl_pdu *request; 520 struct socket *so; 521 size_t len; 522 int error; 523 bool more_needed; 524 525 so = ic->ic_socket; 526 527 if (ic->ic_receive_state == ICL_CONN_STATE_BHS) { 528 KASSERT(ic->ic_receive_pdu == NULL, 529 ("ic->ic_receive_pdu != NULL")); 530 request = icl_pdu_new(ic, M_NOWAIT); 531 if (request == NULL) { 532 ICL_DEBUG("failed to allocate PDU; " 533 "dropping connection"); 534 icl_conn_fail(ic); 535 return (NULL); 536 } 537 ic->ic_receive_pdu = request; 538 } else { 539 KASSERT(ic->ic_receive_pdu != NULL, 540 ("ic->ic_receive_pdu == NULL")); 541 request = ic->ic_receive_pdu; 542 } 543 544 if (*availablep < ic->ic_receive_len) { 545#if 0 546 ICL_DEBUG("not enough data; need %zd, " 547 "have %zd", ic->ic_receive_len, *availablep); 548#endif 549 return (NULL); 550 } 551 552 switch (ic->ic_receive_state) { 553 case ICL_CONN_STATE_BHS: 554 //ICL_DEBUG("receiving BHS"); 555 error = icl_pdu_receive_bhs(request, availablep); 556 if (error != 0) { 557 ICL_DEBUG("failed to receive BHS; " 558 "dropping connection"); 559 break; 560 } 561 562 /* 563 * We don't enforce any limit for AHS length; 564 * its length is stored in 8 bit field. 
565 */ 566 567 len = icl_pdu_data_segment_length(request); 568 if (len > ic->ic_max_data_segment_length) { 569 ICL_WARN("received data segment " 570 "length %zd is larger than negotiated " 571 "MaxDataSegmentLength %zd; " 572 "dropping connection", 573 len, ic->ic_max_data_segment_length); 574 error = EINVAL; 575 break; 576 } 577 578 ic->ic_receive_state = ICL_CONN_STATE_AHS; 579 ic->ic_receive_len = icl_pdu_ahs_length(request); 580 break; 581 582 case ICL_CONN_STATE_AHS: 583 //ICL_DEBUG("receiving AHS"); 584 error = icl_pdu_receive_ahs(request, availablep); 585 if (error != 0) { 586 ICL_DEBUG("failed to receive AHS; " 587 "dropping connection"); 588 break; 589 } 590 ic->ic_receive_state = ICL_CONN_STATE_HEADER_DIGEST; 591 if (ic->ic_header_crc32c == false) 592 ic->ic_receive_len = 0; 593 else 594 ic->ic_receive_len = ISCSI_HEADER_DIGEST_SIZE; 595 break; 596 597 case ICL_CONN_STATE_HEADER_DIGEST: 598 //ICL_DEBUG("receiving header digest"); 599 error = icl_pdu_check_header_digest(request, availablep); 600 if (error != 0) { 601 ICL_DEBUG("header digest failed; " 602 "dropping connection"); 603 break; 604 } 605 606 ic->ic_receive_state = ICL_CONN_STATE_DATA; 607 ic->ic_receive_len = 608 icl_pdu_data_segment_receive_len(request); 609 break; 610 611 case ICL_CONN_STATE_DATA: 612 //ICL_DEBUG("receiving data segment"); 613 error = icl_pdu_receive_data_segment(request, availablep, 614 &more_needed); 615 if (error != 0) { 616 ICL_DEBUG("failed to receive data segment;" 617 "dropping connection"); 618 break; 619 } 620 621 if (more_needed) 622 break; 623 624 ic->ic_receive_state = ICL_CONN_STATE_DATA_DIGEST; 625 if (request->ip_data_len == 0 || ic->ic_data_crc32c == false) 626 ic->ic_receive_len = 0; 627 else 628 ic->ic_receive_len = ISCSI_DATA_DIGEST_SIZE; 629 break; 630 631 case ICL_CONN_STATE_DATA_DIGEST: 632 //ICL_DEBUG("receiving data digest"); 633 error = icl_pdu_check_data_digest(request, availablep); 634 if (error != 0) { 635 ICL_DEBUG("data digest failed; " 636 
"dropping connection"); 637 break; 638 } 639 640 /* 641 * We've received complete PDU; reset the receive state machine 642 * and return the PDU. 643 */ 644 ic->ic_receive_state = ICL_CONN_STATE_BHS; 645 ic->ic_receive_len = sizeof(struct iscsi_bhs); 646 ic->ic_receive_pdu = NULL; 647 return (request); 648 649 default: 650 panic("invalid ic_receive_state %d\n", ic->ic_receive_state); 651 } 652 653 if (error != 0) { 654 icl_pdu_free(request); 655 icl_conn_fail(ic); 656 } 657 658 return (NULL); 659} 660 661static void 662icl_conn_receive_pdus(struct icl_conn *ic, size_t available) 663{ 664 struct icl_pdu *response; 665 struct socket *so; 666 667 so = ic->ic_socket; 668 669 /* 670 * This can never happen; we're careful to only mess with ic->ic_socket 671 * pointer when the send/receive threads are not running. 672 */ 673 KASSERT(so != NULL, ("NULL socket")); 674 675 for (;;) { 676 if (ic->ic_disconnecting) 677 return; 678 679 if (so->so_error != 0) { 680 ICL_DEBUG("connection error %d; " 681 "dropping connection", so->so_error); 682 icl_conn_fail(ic); 683 return; 684 } 685 686 /* 687 * Loop until we have a complete PDU or there is not enough 688 * data in the socket buffer. 
689 */ 690 if (available < ic->ic_receive_len) { 691#if 0 692 ICL_DEBUG("not enough data; have %zd, " 693 "need %zd", available, 694 ic->ic_receive_len); 695#endif 696 return; 697 } 698 699 response = icl_conn_receive_pdu(ic, &available); 700 if (response == NULL) 701 continue; 702 703 if (response->ip_ahs_len > 0) { 704 ICL_WARN("received PDU with unsupported " 705 "AHS; opcode 0x%x; dropping connection", 706 response->ip_bhs->bhs_opcode); 707 icl_pdu_free(response); 708 icl_conn_fail(ic); 709 return; 710 } 711 712 (ic->ic_receive)(response); 713 } 714} 715 716static void 717icl_receive_thread(void *arg) 718{ 719 struct icl_conn *ic; 720 size_t available; 721 struct socket *so; 722 723 ic = arg; 724 so = ic->ic_socket; 725 726 ICL_CONN_LOCK(ic); 727 ic->ic_receive_running = true; 728 ICL_CONN_UNLOCK(ic); 729 730 for (;;) { 731 if (ic->ic_disconnecting) { 732 //ICL_DEBUG("terminating"); 733 break; 734 } 735 736 SOCKBUF_LOCK(&so->so_rcv); 737 available = so->so_rcv.sb_cc; 738 if (available < ic->ic_receive_len) { 739 so->so_rcv.sb_lowat = ic->ic_receive_len; 740 cv_wait(&ic->ic_receive_cv, &so->so_rcv.sb_mtx); 741 } 742 SOCKBUF_UNLOCK(&so->so_rcv); 743 744 icl_conn_receive_pdus(ic, available); 745 } 746 747 ICL_CONN_LOCK(ic); 748 ic->ic_receive_running = false; 749 ICL_CONN_UNLOCK(ic); 750 kthread_exit(); 751} 752 753static int 754icl_soupcall_receive(struct socket *so, void *arg, int waitflag) 755{ 756 struct icl_conn *ic; 757 758 ic = arg; 759 cv_signal(&ic->ic_receive_cv); 760 return (SU_OK); 761} 762 763static int 764icl_pdu_send(struct icl_pdu *request) 765{ 766 size_t padding, pdu_len; 767 uint32_t digest, zero = 0; 768 int error, ok; 769 struct socket *so; 770 struct icl_conn *ic; 771 772 ic = request->ip_conn; 773 so = request->ip_conn->ic_socket; 774 775 ICL_CONN_LOCK_ASSERT(ic); 776 777 icl_pdu_set_data_segment_length(request, request->ip_data_len); 778 779 pdu_len = icl_pdu_size(request); 780 781 if (ic->ic_header_crc32c) { 782 digest = 
icl_mbuf_to_crc32c(request->ip_bhs_mbuf); 783 ok = m_append(request->ip_bhs_mbuf, sizeof(digest), 784 (void *)&digest); 785 if (ok != 1) { 786 ICL_WARN("failed to append header digest"); 787 return (1); 788 } 789 } 790 791 if (request->ip_data_len != 0) { 792 padding = icl_pdu_padding(request); 793 if (padding > 0) { 794 ok = m_append(request->ip_data_mbuf, padding, 795 (void *)&zero); 796 if (ok != 1) { 797 ICL_WARN("failed to append padding"); 798 return (1); 799 } 800 } 801 802 if (ic->ic_data_crc32c) { 803 digest = icl_mbuf_to_crc32c(request->ip_data_mbuf); 804 805 ok = m_append(request->ip_data_mbuf, sizeof(digest), 806 (void *)&digest); 807 if (ok != 1) { 808 ICL_WARN("failed to append header digest"); 809 return (1); 810 } 811 } 812 813 m_cat(request->ip_bhs_mbuf, request->ip_data_mbuf); 814 request->ip_data_mbuf = NULL; 815 } 816 817 request->ip_bhs_mbuf->m_pkthdr.len = pdu_len; 818 819 error = sosend(so, NULL, NULL, request->ip_bhs_mbuf, 820 NULL, MSG_DONTWAIT, curthread); 821 request->ip_bhs_mbuf = NULL; /* Sosend consumes the mbuf. */ 822 if (error != 0) { 823 ICL_DEBUG("sosend error %d", error); 824 return (error); 825 } 826 827 return (0); 828} 829 830static void 831icl_conn_send_pdus(struct icl_conn *ic) 832{ 833 struct icl_pdu *request; 834 struct socket *so; 835 size_t available, size; 836 int error; 837 838 ICL_CONN_LOCK_ASSERT(ic); 839 840 so = ic->ic_socket; 841 842 SOCKBUF_LOCK(&so->so_snd); 843 available = sbspace(&so->so_snd); 844 SOCKBUF_UNLOCK(&so->so_snd); 845 846 while (!TAILQ_EMPTY(&ic->ic_to_send)) { 847 if (ic->ic_disconnecting) 848 return; 849 850 request = TAILQ_FIRST(&ic->ic_to_send); 851 size = icl_pdu_size(request); 852 if (available < size) { 853 /* 854 * Set the low watermark on the socket, 855 * to avoid waking up until there is enough 856 * space. 
857 */ 858 SOCKBUF_LOCK(&so->so_snd); 859 so->so_snd.sb_lowat = size; 860 SOCKBUF_UNLOCK(&so->so_snd); 861#if 1 862 ICL_DEBUG("no space to send; " 863 "have %zd, need %zd", 864 available, size); 865#endif 866 return; 867 } 868 available -= size; 869 TAILQ_REMOVE(&ic->ic_to_send, request, ip_next); 870 error = icl_pdu_send(request); 871 if (error != 0) { 872 ICL_DEBUG("failed to send PDU; " 873 "dropping connection"); 874 icl_conn_fail(ic); 875 return; 876 } 877 icl_pdu_free(request); 878 } 879} 880 881static void 882icl_send_thread(void *arg) 883{ 884 struct icl_conn *ic; 885 886 ic = arg; 887 888 ICL_CONN_LOCK(ic); 889 ic->ic_send_running = true; 890 891 for (;;) { 892 if (ic->ic_disconnecting) { 893 //ICL_DEBUG("terminating"); 894 break; 895 } 896 icl_conn_send_pdus(ic); 897 cv_wait(&ic->ic_send_cv, ic->ic_lock); 898 } 899 900 ic->ic_send_running = false; 901 ICL_CONN_UNLOCK(ic); 902 kthread_exit(); 903} 904 905static int 906icl_soupcall_send(struct socket *so, void *arg, int waitflag) 907{ 908 struct icl_conn *ic; 909 910 ic = arg; 911 cv_signal(&ic->ic_send_cv); 912 return (SU_OK); 913} 914 915int 916icl_pdu_append_data(struct icl_pdu *request, const void *addr, size_t len, int flags) 917{ 918 struct mbuf *mb, *newmb; 919 size_t copylen, off = 0; 920 921 KASSERT(len > 0, ("len == 0")); 922 923 newmb = m_getm2(NULL, len, flags, MT_DATA, M_PKTHDR); 924 if (newmb == NULL) { 925 ICL_WARN("failed to allocate mbuf for %zd bytes", len); 926 return (ENOMEM); 927 } 928 929 for (mb = newmb; mb != NULL; mb = mb->m_next) { 930 copylen = min(M_TRAILINGSPACE(mb), len - off); 931 memcpy(mtod(mb, char *), (const char *)addr + off, copylen); 932 mb->m_len = copylen; 933 off += copylen; 934 } 935 KASSERT(off == len, ("%s: off != len", __func__)); 936 937 if (request->ip_data_mbuf == NULL) { 938 request->ip_data_mbuf = newmb; 939 request->ip_data_len = len; 940 } else { 941 m_cat(request->ip_data_mbuf, newmb); 942 request->ip_data_len += len; 943 } 944 945 return (0); 946} 947 
948void 949icl_pdu_get_data(struct icl_pdu *ip, size_t off, void *addr, size_t len) 950{ 951 952 m_copydata(ip->ip_data_mbuf, off, len, addr); 953} 954 955void 956icl_pdu_queue(struct icl_pdu *ip) 957{ 958 struct icl_conn *ic; 959 960 ic = ip->ip_conn; 961 962 ICL_CONN_LOCK_ASSERT(ic); 963 964 if (ic->ic_disconnecting || ic->ic_socket == NULL) { 965 ICL_DEBUG("icl_pdu_queue on closed connection"); 966 icl_pdu_free(ip); 967 return; 968 } 969 TAILQ_INSERT_TAIL(&ic->ic_to_send, ip, ip_next); 970 cv_signal(&ic->ic_send_cv); 971} 972 973struct icl_conn * 974icl_conn_new(const char *name, struct mtx *lock) 975{ 976 struct icl_conn *ic; 977 978 refcount_acquire(&icl_ncons); 979 980 ic = uma_zalloc(icl_conn_zone, M_WAITOK | M_ZERO); 981 982 TAILQ_INIT(&ic->ic_to_send); 983 ic->ic_lock = lock; 984 cv_init(&ic->ic_send_cv, "icl_tx"); 985 cv_init(&ic->ic_receive_cv, "icl_rx"); 986#ifdef DIAGNOSTIC 987 refcount_init(&ic->ic_outstanding_pdus, 0); 988#endif 989 ic->ic_max_data_segment_length = ICL_MAX_DATA_SEGMENT_LENGTH; 990 ic->ic_name = name; 991 992 return (ic); 993} 994 995void 996icl_conn_free(struct icl_conn *ic) 997{ 998 999 cv_destroy(&ic->ic_send_cv); 1000 cv_destroy(&ic->ic_receive_cv); 1001 uma_zfree(icl_conn_zone, ic); 1002 refcount_release(&icl_ncons); 1003} 1004 1005static int 1006icl_conn_start(struct icl_conn *ic) 1007{ 1008 size_t bufsize; 1009 struct sockopt opt; 1010 int error, one = 1; 1011 1012 ICL_CONN_LOCK(ic); 1013 1014 /* 1015 * XXX: Ugly hack. 1016 */ 1017 if (ic->ic_socket == NULL) { 1018 ICL_CONN_UNLOCK(ic); 1019 return (EINVAL); 1020 } 1021 1022 ic->ic_receive_state = ICL_CONN_STATE_BHS; 1023 ic->ic_receive_len = sizeof(struct iscsi_bhs); 1024 ic->ic_disconnecting = false; 1025 1026 ICL_CONN_UNLOCK(ic); 1027 1028 /* 1029 * Use max available sockbuf size for sending. Do it manually 1030 * instead of sbreserve(9) to work around resource limits. 1031 * 1032 * XXX: This kind of sucks. 
On one hand, we don't currently support 1033 * sending a part of data segment; we always do it in one piece, 1034 * so we have to make sure it can fit in the socket buffer. 1035 * Once I've implemented partial send, we'll get rid of this 1036 * and use autoscaling. 1037 */ 1038 bufsize = (sizeof(struct iscsi_bhs) + 1039 ic->ic_max_data_segment_length) * 8; 1040 error = soreserve(ic->ic_socket, bufsize, bufsize); 1041 if (error != 0) { 1042 ICL_WARN("soreserve failed with error %d", error); 1043 icl_conn_close(ic); 1044 return (error); 1045 } 1046 1047 /* 1048 * Disable Nagle. 1049 */ 1050 bzero(&opt, sizeof(opt)); 1051 opt.sopt_dir = SOPT_SET; 1052 opt.sopt_level = IPPROTO_TCP; 1053 opt.sopt_name = TCP_NODELAY; 1054 opt.sopt_val = &one; 1055 opt.sopt_valsize = sizeof(one); 1056 error = sosetopt(ic->ic_socket, &opt); 1057 if (error != 0) { 1058 ICL_WARN("disabling TCP_NODELAY failed with error %d", error); 1059 icl_conn_close(ic); 1060 return (error); 1061 } 1062 1063 /* 1064 * Start threads. 1065 */ 1066 error = kthread_add(icl_send_thread, ic, NULL, NULL, 0, 0, "%stx", 1067 ic->ic_name); 1068 if (error != 0) { 1069 ICL_WARN("kthread_add(9) failed with error %d", error); 1070 icl_conn_close(ic); 1071 return (error); 1072 } 1073 1074 error = kthread_add(icl_receive_thread, ic, NULL, NULL, 0, 0, "%srx", 1075 ic->ic_name); 1076 if (error != 0) { 1077 ICL_WARN("kthread_add(9) failed with error %d", error); 1078 icl_conn_close(ic); 1079 return (error); 1080 } 1081 1082 /* 1083 * Register socket upcall, to get notified about incoming PDUs 1084 * and free space to send outgoing ones. 
1085 */ 1086 SOCKBUF_LOCK(&ic->ic_socket->so_snd); 1087 soupcall_set(ic->ic_socket, SO_SND, icl_soupcall_send, ic); 1088 SOCKBUF_UNLOCK(&ic->ic_socket->so_snd); 1089 SOCKBUF_LOCK(&ic->ic_socket->so_rcv); 1090 soupcall_set(ic->ic_socket, SO_RCV, icl_soupcall_receive, ic); 1091 SOCKBUF_UNLOCK(&ic->ic_socket->so_rcv); 1092 1093 return (0); 1094} 1095 1096int 1097icl_conn_handoff(struct icl_conn *ic, int fd) 1098{ 1099 struct file *fp; 1100 struct socket *so; 1101 cap_rights_t rights; 1102 int error; 1103 1104 ICL_CONN_LOCK_ASSERT_NOT(ic); 1105 1106 /* 1107 * Steal the socket from userland. 1108 */ 1109 error = fget(curthread, fd, 1110 cap_rights_init(&rights, CAP_SOCK_CLIENT), &fp); 1111 if (error != 0) 1112 return (error); 1113 if (fp->f_type != DTYPE_SOCKET) { 1114 fdrop(fp, curthread); 1115 return (EINVAL); 1116 } 1117 so = fp->f_data; 1118 if (so->so_type != SOCK_STREAM) { 1119 fdrop(fp, curthread); 1120 return (EINVAL); 1121 } 1122 1123 ICL_CONN_LOCK(ic); 1124 1125 if (ic->ic_socket != NULL) { 1126 ICL_CONN_UNLOCK(ic); 1127 fdrop(fp, curthread); 1128 return (EBUSY); 1129 } 1130 1131 ic->ic_socket = fp->f_data; 1132 fp->f_ops = &badfileops; 1133 fp->f_data = NULL; 1134 fdrop(fp, curthread); 1135 ICL_CONN_UNLOCK(ic); 1136 1137 error = icl_conn_start(ic); 1138 1139 return (error); 1140} 1141 1142void 1143icl_conn_shutdown(struct icl_conn *ic) 1144{ 1145 ICL_CONN_LOCK_ASSERT_NOT(ic); 1146 1147 ICL_CONN_LOCK(ic); 1148 if (ic->ic_socket == NULL) { 1149 ICL_CONN_UNLOCK(ic); 1150 return; 1151 } 1152 ICL_CONN_UNLOCK(ic); 1153 1154 soshutdown(ic->ic_socket, SHUT_RDWR); 1155} 1156 1157void 1158icl_conn_close(struct icl_conn *ic) 1159{ 1160 struct icl_pdu *pdu; 1161 1162 ICL_CONN_LOCK_ASSERT_NOT(ic); 1163 1164 ICL_CONN_LOCK(ic); 1165 if (ic->ic_socket == NULL) { 1166 ICL_CONN_UNLOCK(ic); 1167 return; 1168 } 1169 1170 ic->ic_disconnecting = true; 1171 1172 /* 1173 * Wake up the threads, so they can properly terminate. 
1174 */ 1175 cv_signal(&ic->ic_receive_cv); 1176 cv_signal(&ic->ic_send_cv); 1177 while (ic->ic_receive_running || ic->ic_send_running) { 1178 //ICL_DEBUG("waiting for send/receive threads to terminate"); 1179 ICL_CONN_UNLOCK(ic); 1180 cv_signal(&ic->ic_receive_cv); 1181 cv_signal(&ic->ic_send_cv); 1182 pause("icl_close", 1 * hz); 1183 ICL_CONN_LOCK(ic); 1184 } 1185 //ICL_DEBUG("send/receive threads terminated"); 1186 1187 soclose(ic->ic_socket); 1188 ic->ic_socket = NULL; 1189 1190 if (ic->ic_receive_pdu != NULL) { 1191 //ICL_DEBUG("freeing partially received PDU"); 1192 icl_pdu_free(ic->ic_receive_pdu); 1193 ic->ic_receive_pdu = NULL; 1194 } 1195 1196 /* 1197 * Remove any outstanding PDUs from the send queue. 1198 */ 1199 while (!TAILQ_EMPTY(&ic->ic_to_send)) { 1200 pdu = TAILQ_FIRST(&ic->ic_to_send); 1201 TAILQ_REMOVE(&ic->ic_to_send, pdu, ip_next); 1202 icl_pdu_free(pdu); 1203 } 1204 1205 KASSERT(TAILQ_EMPTY(&ic->ic_to_send), 1206 ("destroying session with non-empty send queue")); 1207 /* 1208 * XXX 1209 */ 1210#if 0 1211 KASSERT(ic->ic_outstanding_pdus == 0, 1212 ("destroying session with %d outstanding PDUs", 1213 ic->ic_outstanding_pdus)); 1214#endif 1215 ICL_CONN_UNLOCK(ic); 1216} 1217 1218bool 1219icl_conn_connected(struct icl_conn *ic) 1220{ 1221 ICL_CONN_LOCK_ASSERT_NOT(ic); 1222 1223 ICL_CONN_LOCK(ic); 1224 if (ic->ic_socket == NULL) { 1225 ICL_CONN_UNLOCK(ic); 1226 return (false); 1227 } 1228 if (ic->ic_socket->so_error != 0) { 1229 ICL_CONN_UNLOCK(ic); 1230 return (false); 1231 } 1232 ICL_CONN_UNLOCK(ic); 1233 return (true); 1234} 1235 1236#ifdef ICL_KERNEL_PROXY 1237int 1238icl_conn_handoff_sock(struct icl_conn *ic, struct socket *so) 1239{ 1240 int error; 1241 1242 ICL_CONN_LOCK_ASSERT_NOT(ic); 1243 1244 if (so->so_type != SOCK_STREAM) 1245 return (EINVAL); 1246 1247 ICL_CONN_LOCK(ic); 1248 if (ic->ic_socket != NULL) { 1249 ICL_CONN_UNLOCK(ic); 1250 return (EBUSY); 1251 } 1252 ic->ic_socket = so; 1253 ICL_CONN_UNLOCK(ic); 1254 1255 error = 
icl_conn_start(ic); 1256 1257 return (error); 1258} 1259#endif /* ICL_KERNEL_PROXY */ 1260 1261static int 1262icl_unload(void) 1263{ 1264 1265 if (icl_ncons != 0) 1266 return (EBUSY); 1267 1268 uma_zdestroy(icl_conn_zone); 1269 uma_zdestroy(icl_pdu_zone); 1270 1271 return (0); 1272} 1273 1274static void 1275icl_load(void) 1276{ 1277 1278 icl_conn_zone = uma_zcreate("icl_conn", 1279 sizeof(struct icl_conn), NULL, NULL, NULL, NULL, 1280 UMA_ALIGN_PTR, 0); 1281 icl_pdu_zone = uma_zcreate("icl_pdu", 1282 sizeof(struct icl_pdu), NULL, NULL, NULL, NULL, 1283 UMA_ALIGN_PTR, 0); 1284 1285 refcount_init(&icl_ncons, 0); 1286} 1287 1288static int 1289icl_modevent(module_t mod, int what, void *arg) 1290{ 1291 1292 switch (what) { 1293 case MOD_LOAD: 1294 icl_load(); 1295 return (0); 1296 case MOD_UNLOAD: 1297 return (icl_unload()); 1298 default: 1299 return (EINVAL); 1300 } 1301} 1302 1303moduledata_t icl_data = { 1304 "icl", 1305 icl_modevent, 1306 0 1307}; 1308 1309DECLARE_MODULE(icl, icl_data, SI_SUB_DRIVERS, SI_ORDER_FIRST); 1310MODULE_VERSION(icl, 1); 1311