/* icl.c revision 265495 */
1/*- 2 * Copyright (c) 2012 The FreeBSD Foundation 3 * All rights reserved. 4 * 5 * This software was developed by Edward Tomasz Napierala under sponsorship 6 * from the FreeBSD Foundation. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 * 29 * $FreeBSD: stable/10/sys/dev/iscsi/icl.c 265495 2014-05-07 06:29:01Z trasz $ 30 */ 31 32/* 33 * iSCSI Common Layer. It's used by both the initiator and target to send 34 * and receive iSCSI PDUs. 
35 */ 36 37#include <sys/param.h> 38#include <sys/capability.h> 39#include <sys/condvar.h> 40#include <sys/conf.h> 41#include <sys/file.h> 42#include <sys/kernel.h> 43#include <sys/kthread.h> 44#include <sys/lock.h> 45#include <sys/mbuf.h> 46#include <sys/mutex.h> 47#include <sys/module.h> 48#include <sys/socket.h> 49#include <sys/socketvar.h> 50#include <sys/sysctl.h> 51#include <sys/systm.h> 52#include <sys/sx.h> 53#include <sys/uio.h> 54#include <vm/uma.h> 55#include <netinet/in.h> 56#include <netinet/tcp.h> 57 58#include "icl.h" 59#include "iscsi_proto.h" 60 61SYSCTL_NODE(_kern, OID_AUTO, icl, CTLFLAG_RD, 0, "iSCSI Common Layer"); 62static int debug = 1; 63TUNABLE_INT("kern.icl.debug", &debug); 64SYSCTL_INT(_kern_icl, OID_AUTO, debug, CTLFLAG_RW, 65 &debug, 1, "Enable debug messages"); 66static int partial_receive_len = 1 * 1024; /* XXX: More? */ 67TUNABLE_INT("kern.icl.partial_receive_len", &partial_receive_len); 68SYSCTL_INT(_kern_icl, OID_AUTO, partial_receive_len, CTLFLAG_RW, 69 &partial_receive_len, 1 * 1024, "Minimum read size for partially received " 70 "data segment"); 71 72static uma_zone_t icl_conn_zone; 73static uma_zone_t icl_pdu_zone; 74 75static volatile u_int icl_ncons; 76 77#define ICL_DEBUG(X, ...) \ 78 if (debug > 1) { \ 79 printf("%s: " X "\n", __func__, ## __VA_ARGS__);\ 80 } while (0) 81 82#define ICL_WARN(X, ...) 
\ 83 if (debug > 0) { \ 84 printf("WARNING: %s: " X "\n", \ 85 __func__, ## __VA_ARGS__); \ 86 } while (0) 87 88#define ICL_CONN_LOCK(X) mtx_lock(X->ic_lock) 89#define ICL_CONN_UNLOCK(X) mtx_unlock(X->ic_lock) 90#define ICL_CONN_LOCK_ASSERT(X) mtx_assert(X->ic_lock, MA_OWNED) 91#define ICL_CONN_LOCK_ASSERT_NOT(X) mtx_assert(X->ic_lock, MA_NOTOWNED) 92 93static void 94icl_conn_fail(struct icl_conn *ic) 95{ 96 if (ic->ic_socket == NULL) 97 return; 98 99 /* 100 * XXX 101 */ 102 ic->ic_socket->so_error = EDOOFUS; 103 (ic->ic_error)(ic); 104} 105 106static struct mbuf * 107icl_conn_receive(struct icl_conn *ic, size_t len) 108{ 109 struct uio uio; 110 struct socket *so; 111 struct mbuf *m; 112 int error, flags; 113 114 so = ic->ic_socket; 115 116 memset(&uio, 0, sizeof(uio)); 117 uio.uio_resid = len; 118 119 flags = MSG_DONTWAIT; 120 error = soreceive(so, NULL, &uio, &m, NULL, &flags); 121 if (error != 0) { 122 ICL_DEBUG("soreceive error %d", error); 123 return (NULL); 124 } 125 if (uio.uio_resid != 0) { 126 m_freem(m); 127 ICL_DEBUG("short read"); 128 return (NULL); 129 } 130 131 return (m); 132} 133 134static struct icl_pdu * 135icl_pdu_new(struct icl_conn *ic, int flags) 136{ 137 struct icl_pdu *ip; 138 139#ifdef DIAGNOSTIC 140 refcount_acquire(&ic->ic_outstanding_pdus); 141#endif 142 ip = uma_zalloc(icl_pdu_zone, flags | M_ZERO); 143 if (ip == NULL) { 144 ICL_WARN("failed to allocate %zd bytes", sizeof(*ip)); 145#ifdef DIAGNOSTIC 146 refcount_release(&ic->ic_outstanding_pdus); 147#endif 148 return (NULL); 149 } 150 151 ip->ip_conn = ic; 152 153 return (ip); 154} 155 156void 157icl_pdu_free(struct icl_pdu *ip) 158{ 159 struct icl_conn *ic; 160 161 ic = ip->ip_conn; 162 163 m_freem(ip->ip_bhs_mbuf); 164 m_freem(ip->ip_ahs_mbuf); 165 m_freem(ip->ip_data_mbuf); 166 uma_zfree(icl_pdu_zone, ip); 167#ifdef DIAGNOSTIC 168 refcount_release(&ic->ic_outstanding_pdus); 169#endif 170} 171 172/* 173 * Allocate icl_pdu with empty BHS to fill up by the caller. 
174 */ 175struct icl_pdu * 176icl_pdu_new_bhs(struct icl_conn *ic, int flags) 177{ 178 struct icl_pdu *ip; 179 180 ip = icl_pdu_new(ic, flags); 181 if (ip == NULL) 182 return (NULL); 183 184 ip->ip_bhs_mbuf = m_getm2(NULL, sizeof(struct iscsi_bhs), 185 flags, MT_DATA, M_PKTHDR); 186 if (ip->ip_bhs_mbuf == NULL) { 187 ICL_WARN("failed to allocate %zd bytes", sizeof(*ip)); 188 icl_pdu_free(ip); 189 return (NULL); 190 } 191 ip->ip_bhs = mtod(ip->ip_bhs_mbuf, struct iscsi_bhs *); 192 memset(ip->ip_bhs, 0, sizeof(struct iscsi_bhs)); 193 ip->ip_bhs_mbuf->m_len = sizeof(struct iscsi_bhs); 194 195 return (ip); 196} 197 198static int 199icl_pdu_ahs_length(const struct icl_pdu *request) 200{ 201 202 return (request->ip_bhs->bhs_total_ahs_len * 4); 203} 204 205size_t 206icl_pdu_data_segment_length(const struct icl_pdu *request) 207{ 208 uint32_t len = 0; 209 210 len += request->ip_bhs->bhs_data_segment_len[0]; 211 len <<= 8; 212 len += request->ip_bhs->bhs_data_segment_len[1]; 213 len <<= 8; 214 len += request->ip_bhs->bhs_data_segment_len[2]; 215 216 return (len); 217} 218 219static void 220icl_pdu_set_data_segment_length(struct icl_pdu *response, uint32_t len) 221{ 222 223 response->ip_bhs->bhs_data_segment_len[2] = len; 224 response->ip_bhs->bhs_data_segment_len[1] = len >> 8; 225 response->ip_bhs->bhs_data_segment_len[0] = len >> 16; 226} 227 228static size_t 229icl_pdu_padding(const struct icl_pdu *ip) 230{ 231 232 if ((ip->ip_data_len % 4) != 0) 233 return (4 - (ip->ip_data_len % 4)); 234 235 return (0); 236} 237 238static size_t 239icl_pdu_size(const struct icl_pdu *response) 240{ 241 size_t len; 242 243 KASSERT(response->ip_ahs_len == 0, ("responding with AHS")); 244 245 len = sizeof(struct iscsi_bhs) + response->ip_data_len + 246 icl_pdu_padding(response); 247 if (response->ip_conn->ic_header_crc32c) 248 len += ISCSI_HEADER_DIGEST_SIZE; 249 if (response->ip_data_len != 0 && response->ip_conn->ic_data_crc32c) 250 len += ISCSI_DATA_DIGEST_SIZE; 251 252 return (len); 
253} 254 255static int 256icl_pdu_receive_bhs(struct icl_pdu *request, size_t *availablep) 257{ 258 struct mbuf *m; 259 260 m = icl_conn_receive(request->ip_conn, sizeof(struct iscsi_bhs)); 261 if (m == NULL) { 262 ICL_DEBUG("failed to receive BHS"); 263 return (-1); 264 } 265 266 request->ip_bhs_mbuf = m_pullup(m, sizeof(struct iscsi_bhs)); 267 if (request->ip_bhs_mbuf == NULL) { 268 ICL_WARN("m_pullup failed"); 269 return (-1); 270 } 271 request->ip_bhs = mtod(request->ip_bhs_mbuf, struct iscsi_bhs *); 272 273 /* 274 * XXX: For architectures with strict alignment requirements 275 * we may need to allocate ip_bhs and copy the data into it. 276 * For some reason, though, not doing this doesn't seem 277 * to cause problems; tested on sparc64. 278 */ 279 280 *availablep -= sizeof(struct iscsi_bhs); 281 return (0); 282} 283 284static int 285icl_pdu_receive_ahs(struct icl_pdu *request, size_t *availablep) 286{ 287 288 request->ip_ahs_len = icl_pdu_ahs_length(request); 289 if (request->ip_ahs_len == 0) 290 return (0); 291 292 request->ip_ahs_mbuf = icl_conn_receive(request->ip_conn, 293 request->ip_ahs_len); 294 if (request->ip_ahs_mbuf == NULL) { 295 ICL_DEBUG("failed to receive AHS"); 296 return (-1); 297 } 298 299 *availablep -= request->ip_ahs_len; 300 return (0); 301} 302 303static uint32_t 304icl_mbuf_to_crc32c(const struct mbuf *m0) 305{ 306 uint32_t digest = 0xffffffff; 307 const struct mbuf *m; 308 309 for (m = m0; m != NULL; m = m->m_next) 310 digest = calculate_crc32c(digest, 311 mtod(m, const void *), m->m_len); 312 313 digest = digest ^ 0xffffffff; 314 315 return (digest); 316} 317 318static int 319icl_pdu_check_header_digest(struct icl_pdu *request, size_t *availablep) 320{ 321 struct mbuf *m; 322 uint32_t received_digest, valid_digest; 323 324 if (request->ip_conn->ic_header_crc32c == false) 325 return (0); 326 327 m = icl_conn_receive(request->ip_conn, ISCSI_HEADER_DIGEST_SIZE); 328 if (m == NULL) { 329 ICL_DEBUG("failed to receive header digest"); 330 
return (-1); 331 } 332 333 CTASSERT(sizeof(received_digest) == ISCSI_HEADER_DIGEST_SIZE); 334 m_copydata(m, 0, ISCSI_HEADER_DIGEST_SIZE, (void *)&received_digest); 335 m_freem(m); 336 337 *availablep -= ISCSI_HEADER_DIGEST_SIZE; 338 339 /* 340 * XXX: Handle AHS. 341 */ 342 valid_digest = icl_mbuf_to_crc32c(request->ip_bhs_mbuf); 343 if (received_digest != valid_digest) { 344 ICL_WARN("header digest check failed; got 0x%x, " 345 "should be 0x%x", received_digest, valid_digest); 346 return (-1); 347 } 348 349 return (0); 350} 351 352/* 353 * Return the number of bytes that should be waiting in the receive socket 354 * before icl_pdu_receive_data_segment() gets called. 355 */ 356static size_t 357icl_pdu_data_segment_receive_len(const struct icl_pdu *request) 358{ 359 size_t len; 360 361 len = icl_pdu_data_segment_length(request); 362 if (len == 0) 363 return (0); 364 365 /* 366 * Account for the parts of data segment already read from 367 * the socket buffer. 368 */ 369 KASSERT(len > request->ip_data_len, ("len <= request->ip_data_len")); 370 len -= request->ip_data_len; 371 372 /* 373 * Don't always wait for the full data segment to be delivered 374 * to the socket; this might badly affect performance due to 375 * TCP window scaling. 376 */ 377 if (len > partial_receive_len) { 378#if 0 379 ICL_DEBUG("need %zd bytes of data, limiting to %zd", 380 len, partial_receive_len)); 381#endif 382 len = partial_receive_len; 383 384 return (len); 385 } 386 387 /* 388 * Account for padding. Note that due to the way code is written, 389 * the icl_pdu_receive_data_segment() must always receive padding 390 * along with the last part of data segment, because it would be 391 * impossible to tell whether we've already received the full data 392 * segment including padding, or without it. 
393 */ 394 if ((len % 4) != 0) 395 len += 4 - (len % 4); 396 397#if 0 398 ICL_DEBUG("need %zd bytes of data", len)); 399#endif 400 401 return (len); 402} 403 404static int 405icl_pdu_receive_data_segment(struct icl_pdu *request, 406 size_t *availablep, bool *more_neededp) 407{ 408 struct icl_conn *ic; 409 size_t len, padding = 0; 410 struct mbuf *m; 411 412 ic = request->ip_conn; 413 414 *more_neededp = false; 415 ic->ic_receive_len = 0; 416 417 len = icl_pdu_data_segment_length(request); 418 if (len == 0) 419 return (0); 420 421 if ((len % 4) != 0) 422 padding = 4 - (len % 4); 423 424 /* 425 * Account for already received parts of data segment. 426 */ 427 KASSERT(len > request->ip_data_len, ("len <= request->ip_data_len")); 428 len -= request->ip_data_len; 429 430 if (len + padding > *availablep) { 431 /* 432 * Not enough data in the socket buffer. Receive as much 433 * as we can. Don't receive padding, since, obviously, it's 434 * not the end of data segment yet. 435 */ 436#if 0 437 ICL_DEBUG("limited from %zd to %zd", 438 len + padding, *availablep - padding)); 439#endif 440 len = *availablep - padding; 441 *more_neededp = true; 442 padding = 0; 443 } 444 445 /* 446 * Must not try to receive padding without at least one byte 447 * of actual data segment. 
448 */ 449 if (len > 0) { 450 m = icl_conn_receive(request->ip_conn, len + padding); 451 if (m == NULL) { 452 ICL_DEBUG("failed to receive data segment"); 453 return (-1); 454 } 455 456 if (request->ip_data_mbuf == NULL) 457 request->ip_data_mbuf = m; 458 else 459 m_cat(request->ip_data_mbuf, m); 460 461 request->ip_data_len += len; 462 *availablep -= len + padding; 463 } else 464 ICL_DEBUG("len 0"); 465 466 if (*more_neededp) 467 ic->ic_receive_len = 468 icl_pdu_data_segment_receive_len(request); 469 470 return (0); 471} 472 473static int 474icl_pdu_check_data_digest(struct icl_pdu *request, size_t *availablep) 475{ 476 struct mbuf *m; 477 uint32_t received_digest, valid_digest; 478 479 if (request->ip_conn->ic_data_crc32c == false) 480 return (0); 481 482 if (request->ip_data_len == 0) 483 return (0); 484 485 m = icl_conn_receive(request->ip_conn, ISCSI_DATA_DIGEST_SIZE); 486 if (m == NULL) { 487 ICL_DEBUG("failed to receive data digest"); 488 return (-1); 489 } 490 491 CTASSERT(sizeof(received_digest) == ISCSI_DATA_DIGEST_SIZE); 492 m_copydata(m, 0, ISCSI_DATA_DIGEST_SIZE, (void *)&received_digest); 493 m_freem(m); 494 495 *availablep -= ISCSI_DATA_DIGEST_SIZE; 496 497 /* 498 * Note that ip_data_mbuf also contains padding; since digest 499 * calculation is supposed to include that, we iterate over 500 * the entire ip_data_mbuf chain, not just ip_data_len bytes of it. 501 */ 502 valid_digest = icl_mbuf_to_crc32c(request->ip_data_mbuf); 503 if (received_digest != valid_digest) { 504 ICL_WARN("data digest check failed; got 0x%x, " 505 "should be 0x%x", received_digest, valid_digest); 506 return (-1); 507 } 508 509 return (0); 510} 511 512/* 513 * Somewhat contrary to the name, this attempts to receive only one 514 * "part" of PDU at a time; call it repeatedly until it returns non-NULL. 
515 */ 516static struct icl_pdu * 517icl_conn_receive_pdu(struct icl_conn *ic, size_t *availablep) 518{ 519 struct icl_pdu *request; 520 struct socket *so; 521 size_t len; 522 int error; 523 bool more_needed; 524 525 so = ic->ic_socket; 526 527 if (ic->ic_receive_state == ICL_CONN_STATE_BHS) { 528 KASSERT(ic->ic_receive_pdu == NULL, 529 ("ic->ic_receive_pdu != NULL")); 530 request = icl_pdu_new(ic, M_NOWAIT); 531 if (request == NULL) { 532 ICL_DEBUG("failed to allocate PDU; " 533 "dropping connection"); 534 icl_conn_fail(ic); 535 return (NULL); 536 } 537 ic->ic_receive_pdu = request; 538 } else { 539 KASSERT(ic->ic_receive_pdu != NULL, 540 ("ic->ic_receive_pdu == NULL")); 541 request = ic->ic_receive_pdu; 542 } 543 544 if (*availablep < ic->ic_receive_len) { 545#if 0 546 ICL_DEBUG("not enough data; need %zd, " 547 "have %zd", ic->ic_receive_len, *availablep); 548#endif 549 return (NULL); 550 } 551 552 switch (ic->ic_receive_state) { 553 case ICL_CONN_STATE_BHS: 554 //ICL_DEBUG("receiving BHS"); 555 error = icl_pdu_receive_bhs(request, availablep); 556 if (error != 0) { 557 ICL_DEBUG("failed to receive BHS; " 558 "dropping connection"); 559 break; 560 } 561 562 /* 563 * We don't enforce any limit for AHS length; 564 * its length is stored in 8 bit field. 
565 */ 566 567 len = icl_pdu_data_segment_length(request); 568 if (len > ic->ic_max_data_segment_length) { 569 ICL_WARN("received data segment " 570 "length %zd is larger than negotiated " 571 "MaxDataSegmentLength %zd; " 572 "dropping connection", 573 len, ic->ic_max_data_segment_length); 574 error = EINVAL; 575 break; 576 } 577 578 ic->ic_receive_state = ICL_CONN_STATE_AHS; 579 ic->ic_receive_len = icl_pdu_ahs_length(request); 580 break; 581 582 case ICL_CONN_STATE_AHS: 583 //ICL_DEBUG("receiving AHS"); 584 error = icl_pdu_receive_ahs(request, availablep); 585 if (error != 0) { 586 ICL_DEBUG("failed to receive AHS; " 587 "dropping connection"); 588 break; 589 } 590 ic->ic_receive_state = ICL_CONN_STATE_HEADER_DIGEST; 591 if (ic->ic_header_crc32c == false) 592 ic->ic_receive_len = 0; 593 else 594 ic->ic_receive_len = ISCSI_HEADER_DIGEST_SIZE; 595 break; 596 597 case ICL_CONN_STATE_HEADER_DIGEST: 598 //ICL_DEBUG("receiving header digest"); 599 error = icl_pdu_check_header_digest(request, availablep); 600 if (error != 0) { 601 ICL_DEBUG("header digest failed; " 602 "dropping connection"); 603 break; 604 } 605 606 ic->ic_receive_state = ICL_CONN_STATE_DATA; 607 ic->ic_receive_len = 608 icl_pdu_data_segment_receive_len(request); 609 break; 610 611 case ICL_CONN_STATE_DATA: 612 //ICL_DEBUG("receiving data segment"); 613 error = icl_pdu_receive_data_segment(request, availablep, 614 &more_needed); 615 if (error != 0) { 616 ICL_DEBUG("failed to receive data segment;" 617 "dropping connection"); 618 break; 619 } 620 621 if (more_needed) 622 break; 623 624 ic->ic_receive_state = ICL_CONN_STATE_DATA_DIGEST; 625 if (request->ip_data_len == 0 || ic->ic_data_crc32c == false) 626 ic->ic_receive_len = 0; 627 else 628 ic->ic_receive_len = ISCSI_DATA_DIGEST_SIZE; 629 break; 630 631 case ICL_CONN_STATE_DATA_DIGEST: 632 //ICL_DEBUG("receiving data digest"); 633 error = icl_pdu_check_data_digest(request, availablep); 634 if (error != 0) { 635 ICL_DEBUG("data digest failed; " 636 
"dropping connection"); 637 break; 638 } 639 640 /* 641 * We've received complete PDU; reset the receive state machine 642 * and return the PDU. 643 */ 644 ic->ic_receive_state = ICL_CONN_STATE_BHS; 645 ic->ic_receive_len = sizeof(struct iscsi_bhs); 646 ic->ic_receive_pdu = NULL; 647 return (request); 648 649 default: 650 panic("invalid ic_receive_state %d\n", ic->ic_receive_state); 651 } 652 653 if (error != 0) { 654 icl_pdu_free(request); 655 icl_conn_fail(ic); 656 } 657 658 return (NULL); 659} 660 661static void 662icl_conn_receive_pdus(struct icl_conn *ic, size_t available) 663{ 664 struct icl_pdu *response; 665 struct socket *so; 666 667 so = ic->ic_socket; 668 669 /* 670 * This can never happen; we're careful to only mess with ic->ic_socket 671 * pointer when the send/receive threads are not running. 672 */ 673 KASSERT(so != NULL, ("NULL socket")); 674 675 for (;;) { 676 if (ic->ic_disconnecting) 677 return; 678 679 if (so->so_error != 0) { 680 ICL_DEBUG("connection error %d; " 681 "dropping connection", so->so_error); 682 icl_conn_fail(ic); 683 return; 684 } 685 686 /* 687 * Loop until we have a complete PDU or there is not enough 688 * data in the socket buffer. 
689 */ 690 if (available < ic->ic_receive_len) { 691#if 0 692 ICL_DEBUG("not enough data; have %zd, " 693 "need %zd", available, 694 ic->ic_receive_len); 695#endif 696 return; 697 } 698 699 response = icl_conn_receive_pdu(ic, &available); 700 if (response == NULL) 701 continue; 702 703 if (response->ip_ahs_len > 0) { 704 ICL_WARN("received PDU with unsupported " 705 "AHS; opcode 0x%x; dropping connection", 706 response->ip_bhs->bhs_opcode); 707 icl_pdu_free(response); 708 icl_conn_fail(ic); 709 return; 710 } 711 712 (ic->ic_receive)(response); 713 } 714} 715 716static void 717icl_receive_thread(void *arg) 718{ 719 struct icl_conn *ic; 720 size_t available; 721 struct socket *so; 722 723 ic = arg; 724 so = ic->ic_socket; 725 726 ICL_CONN_LOCK(ic); 727 ic->ic_receive_running = true; 728 ICL_CONN_UNLOCK(ic); 729 730 for (;;) { 731 if (ic->ic_disconnecting) { 732 //ICL_DEBUG("terminating"); 733 break; 734 } 735 736 SOCKBUF_LOCK(&so->so_rcv); 737 available = so->so_rcv.sb_cc; 738 if (available < ic->ic_receive_len) { 739 so->so_rcv.sb_lowat = ic->ic_receive_len; 740 cv_wait(&ic->ic_receive_cv, &so->so_rcv.sb_mtx); 741 } 742 SOCKBUF_UNLOCK(&so->so_rcv); 743 744 icl_conn_receive_pdus(ic, available); 745 } 746 747 ICL_CONN_LOCK(ic); 748 ic->ic_receive_running = false; 749 ICL_CONN_UNLOCK(ic); 750 kthread_exit(); 751} 752 753static int 754icl_soupcall_receive(struct socket *so, void *arg, int waitflag) 755{ 756 struct icl_conn *ic; 757 758 ic = arg; 759 cv_signal(&ic->ic_receive_cv); 760 return (SU_OK); 761} 762 763static int 764icl_pdu_send(struct icl_pdu *request) 765{ 766 size_t padding, pdu_len; 767 uint32_t digest, zero = 0; 768 int error, ok; 769 struct socket *so; 770 struct icl_conn *ic; 771 772 ic = request->ip_conn; 773 so = request->ip_conn->ic_socket; 774 775 ICL_CONN_LOCK_ASSERT(ic); 776 777 icl_pdu_set_data_segment_length(request, request->ip_data_len); 778 779 pdu_len = icl_pdu_size(request); 780 781 if (ic->ic_header_crc32c) { 782 digest = 
icl_mbuf_to_crc32c(request->ip_bhs_mbuf); 783 ok = m_append(request->ip_bhs_mbuf, sizeof(digest), 784 (void *)&digest); 785 if (ok != 1) { 786 ICL_WARN("failed to append header digest"); 787 return (1); 788 } 789 } 790 791 if (request->ip_data_len != 0) { 792 padding = icl_pdu_padding(request); 793 if (padding > 0) { 794 ok = m_append(request->ip_data_mbuf, padding, 795 (void *)&zero); 796 if (ok != 1) { 797 ICL_WARN("failed to append padding"); 798 return (1); 799 } 800 } 801 802 if (ic->ic_data_crc32c) { 803 digest = icl_mbuf_to_crc32c(request->ip_data_mbuf); 804 805 ok = m_append(request->ip_data_mbuf, sizeof(digest), 806 (void *)&digest); 807 if (ok != 1) { 808 ICL_WARN("failed to append header digest"); 809 return (1); 810 } 811 } 812 813 m_cat(request->ip_bhs_mbuf, request->ip_data_mbuf); 814 request->ip_data_mbuf = NULL; 815 } 816 817 request->ip_bhs_mbuf->m_pkthdr.len = pdu_len; 818 819 error = sosend(so, NULL, NULL, request->ip_bhs_mbuf, 820 NULL, MSG_DONTWAIT, curthread); 821 request->ip_bhs_mbuf = NULL; /* Sosend consumes the mbuf. */ 822 if (error != 0) { 823 ICL_DEBUG("sosend error %d", error); 824 return (error); 825 } 826 827 return (0); 828} 829 830static void 831icl_conn_send_pdus(struct icl_conn *ic) 832{ 833 struct icl_pdu *request; 834 struct socket *so; 835 size_t available, size; 836 int error; 837 838 ICL_CONN_LOCK_ASSERT(ic); 839 840 so = ic->ic_socket; 841 842 SOCKBUF_LOCK(&so->so_snd); 843 available = sbspace(&so->so_snd); 844 SOCKBUF_UNLOCK(&so->so_snd); 845 846 while (!TAILQ_EMPTY(&ic->ic_to_send)) { 847 if (ic->ic_disconnecting) 848 return; 849 850 request = TAILQ_FIRST(&ic->ic_to_send); 851 size = icl_pdu_size(request); 852 if (available < size) { 853 /* 854 * Set the low watermark on the socket, 855 * to avoid waking up until there is enough 856 * space. 
857 */ 858 SOCKBUF_LOCK(&so->so_snd); 859 so->so_snd.sb_lowat = size; 860 SOCKBUF_UNLOCK(&so->so_snd); 861#if 1 862 ICL_DEBUG("no space to send; " 863 "have %zd, need %zd", 864 available, size); 865#endif 866 return; 867 } 868 available -= size; 869 TAILQ_REMOVE(&ic->ic_to_send, request, ip_next); 870 error = icl_pdu_send(request); 871 if (error != 0) { 872 ICL_DEBUG("failed to send PDU; " 873 "dropping connection"); 874 icl_conn_fail(ic); 875 return; 876 } 877 icl_pdu_free(request); 878 } 879} 880 881static void 882icl_send_thread(void *arg) 883{ 884 struct icl_conn *ic; 885 886 ic = arg; 887 888 ICL_CONN_LOCK(ic); 889 ic->ic_send_running = true; 890 891 for (;;) { 892 if (ic->ic_disconnecting) { 893 //ICL_DEBUG("terminating"); 894 break; 895 } 896 icl_conn_send_pdus(ic); 897 cv_wait(&ic->ic_send_cv, ic->ic_lock); 898 } 899 900 ic->ic_send_running = false; 901 ICL_CONN_UNLOCK(ic); 902 kthread_exit(); 903} 904 905static int 906icl_soupcall_send(struct socket *so, void *arg, int waitflag) 907{ 908 struct icl_conn *ic; 909 910 ic = arg; 911 cv_signal(&ic->ic_send_cv); 912 return (SU_OK); 913} 914 915int 916icl_pdu_append_data(struct icl_pdu *request, const void *addr, size_t len, int flags) 917{ 918 struct mbuf *mb, *newmb; 919 size_t copylen, off = 0; 920 921 KASSERT(len > 0, ("len == 0")); 922 923 newmb = m_getm2(NULL, len, flags, MT_DATA, M_PKTHDR); 924 if (newmb == NULL) { 925 ICL_WARN("failed to allocate mbuf for %zd bytes", len); 926 return (ENOMEM); 927 } 928 929 for (mb = newmb; mb != NULL; mb = mb->m_next) { 930 copylen = min(M_TRAILINGSPACE(mb), len - off); 931 memcpy(mtod(mb, char *), (const char *)addr + off, copylen); 932 mb->m_len = copylen; 933 off += copylen; 934 } 935 KASSERT(off == len, ("%s: off != len", __func__)); 936 937 if (request->ip_data_mbuf == NULL) { 938 request->ip_data_mbuf = newmb; 939 request->ip_data_len = len; 940 } else { 941 m_cat(request->ip_data_mbuf, newmb); 942 request->ip_data_len += len; 943 } 944 945 return (0); 946} 947 
948void 949icl_pdu_get_data(struct icl_pdu *ip, size_t off, void *addr, size_t len) 950{ 951 952 m_copydata(ip->ip_data_mbuf, off, len, addr); 953} 954 955void 956icl_pdu_queue(struct icl_pdu *ip) 957{ 958 struct icl_conn *ic; 959 960 ic = ip->ip_conn; 961 962 ICL_CONN_LOCK_ASSERT(ic); 963 964 if (ic->ic_disconnecting || ic->ic_socket == NULL) { 965 ICL_DEBUG("icl_pdu_queue on closed connection"); 966 icl_pdu_free(ip); 967 return; 968 } 969 TAILQ_INSERT_TAIL(&ic->ic_to_send, ip, ip_next); 970 cv_signal(&ic->ic_send_cv); 971} 972 973struct icl_conn * 974icl_conn_new(struct mtx *lock) 975{ 976 struct icl_conn *ic; 977 978 refcount_acquire(&icl_ncons); 979 980 ic = uma_zalloc(icl_conn_zone, M_WAITOK | M_ZERO); 981 982 TAILQ_INIT(&ic->ic_to_send); 983 ic->ic_lock = lock; 984 cv_init(&ic->ic_send_cv, "icl_tx"); 985 cv_init(&ic->ic_receive_cv, "icl_rx"); 986#ifdef DIAGNOSTIC 987 refcount_init(&ic->ic_outstanding_pdus, 0); 988#endif 989 ic->ic_max_data_segment_length = ICL_MAX_DATA_SEGMENT_LENGTH; 990 991 return (ic); 992} 993 994void 995icl_conn_free(struct icl_conn *ic) 996{ 997 998 cv_destroy(&ic->ic_send_cv); 999 cv_destroy(&ic->ic_receive_cv); 1000 uma_zfree(icl_conn_zone, ic); 1001 refcount_release(&icl_ncons); 1002} 1003 1004static int 1005icl_conn_start(struct icl_conn *ic) 1006{ 1007 size_t bufsize; 1008 struct sockopt opt; 1009 int error, one = 1; 1010 1011 ICL_CONN_LOCK(ic); 1012 1013 /* 1014 * XXX: Ugly hack. 1015 */ 1016 if (ic->ic_socket == NULL) { 1017 ICL_CONN_UNLOCK(ic); 1018 return (EINVAL); 1019 } 1020 1021 ic->ic_receive_state = ICL_CONN_STATE_BHS; 1022 ic->ic_receive_len = sizeof(struct iscsi_bhs); 1023 ic->ic_disconnecting = false; 1024 1025 ICL_CONN_UNLOCK(ic); 1026 1027 /* 1028 * Use max available sockbuf size for sending. Do it manually 1029 * instead of sbreserve(9) to work around resource limits. 1030 * 1031 * XXX: This kind of sucks. 
On one hand, we don't currently support 1032 * sending a part of data segment; we always do it in one piece, 1033 * so we have to make sure it can fit in the socket buffer. 1034 * Once I've implemented partial send, we'll get rid of this 1035 * and use autoscaling. 1036 */ 1037 bufsize = (sizeof(struct iscsi_bhs) + 1038 ic->ic_max_data_segment_length) * 8; 1039 error = soreserve(ic->ic_socket, bufsize, bufsize); 1040 if (error != 0) { 1041 ICL_WARN("soreserve failed with error %d", error); 1042 icl_conn_close(ic); 1043 return (error); 1044 } 1045 1046 /* 1047 * Disable Nagle. 1048 */ 1049 bzero(&opt, sizeof(opt)); 1050 opt.sopt_dir = SOPT_SET; 1051 opt.sopt_level = IPPROTO_TCP; 1052 opt.sopt_name = TCP_NODELAY; 1053 opt.sopt_val = &one; 1054 opt.sopt_valsize = sizeof(one); 1055 error = sosetopt(ic->ic_socket, &opt); 1056 if (error != 0) { 1057 ICL_WARN("disabling TCP_NODELAY failed with error %d", error); 1058 icl_conn_close(ic); 1059 return (error); 1060 } 1061 1062 /* 1063 * Start threads. 1064 */ 1065 error = kthread_add(icl_send_thread, ic, NULL, NULL, 0, 0, "icltx"); 1066 if (error != 0) { 1067 ICL_WARN("kthread_add(9) failed with error %d", error); 1068 icl_conn_close(ic); 1069 return (error); 1070 } 1071 1072 error = kthread_add(icl_receive_thread, ic, NULL, NULL, 0, 0, "iclrx"); 1073 if (error != 0) { 1074 ICL_WARN("kthread_add(9) failed with error %d", error); 1075 icl_conn_close(ic); 1076 return (error); 1077 } 1078 1079 /* 1080 * Register socket upcall, to get notified about incoming PDUs 1081 * and free space to send outgoing ones. 
1082 */ 1083 SOCKBUF_LOCK(&ic->ic_socket->so_snd); 1084 soupcall_set(ic->ic_socket, SO_SND, icl_soupcall_send, ic); 1085 SOCKBUF_UNLOCK(&ic->ic_socket->so_snd); 1086 SOCKBUF_LOCK(&ic->ic_socket->so_rcv); 1087 soupcall_set(ic->ic_socket, SO_RCV, icl_soupcall_receive, ic); 1088 SOCKBUF_UNLOCK(&ic->ic_socket->so_rcv); 1089 1090 return (0); 1091} 1092 1093int 1094icl_conn_handoff(struct icl_conn *ic, int fd) 1095{ 1096 struct file *fp; 1097 struct socket *so; 1098 cap_rights_t rights; 1099 int error; 1100 1101 ICL_CONN_LOCK_ASSERT_NOT(ic); 1102 1103 /* 1104 * Steal the socket from userland. 1105 */ 1106 error = fget(curthread, fd, 1107 cap_rights_init(&rights, CAP_SOCK_CLIENT), &fp); 1108 if (error != 0) 1109 return (error); 1110 if (fp->f_type != DTYPE_SOCKET) { 1111 fdrop(fp, curthread); 1112 return (EINVAL); 1113 } 1114 so = fp->f_data; 1115 if (so->so_type != SOCK_STREAM) { 1116 fdrop(fp, curthread); 1117 return (EINVAL); 1118 } 1119 1120 ICL_CONN_LOCK(ic); 1121 1122 if (ic->ic_socket != NULL) { 1123 ICL_CONN_UNLOCK(ic); 1124 fdrop(fp, curthread); 1125 return (EBUSY); 1126 } 1127 1128 ic->ic_socket = fp->f_data; 1129 fp->f_ops = &badfileops; 1130 fp->f_data = NULL; 1131 fdrop(fp, curthread); 1132 ICL_CONN_UNLOCK(ic); 1133 1134 error = icl_conn_start(ic); 1135 1136 return (error); 1137} 1138 1139void 1140icl_conn_shutdown(struct icl_conn *ic) 1141{ 1142 ICL_CONN_LOCK_ASSERT_NOT(ic); 1143 1144 ICL_CONN_LOCK(ic); 1145 if (ic->ic_socket == NULL) { 1146 ICL_CONN_UNLOCK(ic); 1147 return; 1148 } 1149 ICL_CONN_UNLOCK(ic); 1150 1151 soshutdown(ic->ic_socket, SHUT_RDWR); 1152} 1153 1154void 1155icl_conn_close(struct icl_conn *ic) 1156{ 1157 struct icl_pdu *pdu; 1158 1159 ICL_CONN_LOCK_ASSERT_NOT(ic); 1160 1161 ICL_CONN_LOCK(ic); 1162 if (ic->ic_socket == NULL) { 1163 ICL_CONN_UNLOCK(ic); 1164 return; 1165 } 1166 1167 ic->ic_disconnecting = true; 1168 1169 /* 1170 * Wake up the threads, so they can properly terminate. 
1171 */ 1172 cv_signal(&ic->ic_receive_cv); 1173 cv_signal(&ic->ic_send_cv); 1174 while (ic->ic_receive_running || ic->ic_send_running) { 1175 //ICL_DEBUG("waiting for send/receive threads to terminate"); 1176 ICL_CONN_UNLOCK(ic); 1177 cv_signal(&ic->ic_receive_cv); 1178 cv_signal(&ic->ic_send_cv); 1179 pause("icl_close", 1 * hz); 1180 ICL_CONN_LOCK(ic); 1181 } 1182 //ICL_DEBUG("send/receive threads terminated"); 1183 1184 soclose(ic->ic_socket); 1185 ic->ic_socket = NULL; 1186 1187 if (ic->ic_receive_pdu != NULL) { 1188 //ICL_DEBUG("freeing partially received PDU"); 1189 icl_pdu_free(ic->ic_receive_pdu); 1190 ic->ic_receive_pdu = NULL; 1191 } 1192 1193 /* 1194 * Remove any outstanding PDUs from the send queue. 1195 */ 1196 while (!TAILQ_EMPTY(&ic->ic_to_send)) { 1197 pdu = TAILQ_FIRST(&ic->ic_to_send); 1198 TAILQ_REMOVE(&ic->ic_to_send, pdu, ip_next); 1199 icl_pdu_free(pdu); 1200 } 1201 1202 KASSERT(TAILQ_EMPTY(&ic->ic_to_send), 1203 ("destroying session with non-empty send queue")); 1204 /* 1205 * XXX 1206 */ 1207#if 0 1208 KASSERT(ic->ic_outstanding_pdus == 0, 1209 ("destroying session with %d outstanding PDUs", 1210 ic->ic_outstanding_pdus)); 1211#endif 1212 ICL_CONN_UNLOCK(ic); 1213} 1214 1215bool 1216icl_conn_connected(struct icl_conn *ic) 1217{ 1218 ICL_CONN_LOCK_ASSERT_NOT(ic); 1219 1220 ICL_CONN_LOCK(ic); 1221 if (ic->ic_socket == NULL) { 1222 ICL_CONN_UNLOCK(ic); 1223 return (false); 1224 } 1225 if (ic->ic_socket->so_error != 0) { 1226 ICL_CONN_UNLOCK(ic); 1227 return (false); 1228 } 1229 ICL_CONN_UNLOCK(ic); 1230 return (true); 1231} 1232 1233#ifdef ICL_KERNEL_PROXY 1234int 1235icl_conn_handoff_sock(struct icl_conn *ic, struct socket *so) 1236{ 1237 int error; 1238 1239 ICL_CONN_LOCK_ASSERT_NOT(ic); 1240 1241 if (so->so_type != SOCK_STREAM) 1242 return (EINVAL); 1243 1244 ICL_CONN_LOCK(ic); 1245 if (ic->ic_socket != NULL) { 1246 ICL_CONN_UNLOCK(ic); 1247 return (EBUSY); 1248 } 1249 ic->ic_socket = so; 1250 ICL_CONN_UNLOCK(ic); 1251 1252 error = 
icl_conn_start(ic); 1253 1254 return (error); 1255} 1256#endif /* ICL_KERNEL_PROXY */ 1257 1258static int 1259icl_unload(void) 1260{ 1261 1262 if (icl_ncons != 0) 1263 return (EBUSY); 1264 1265 uma_zdestroy(icl_conn_zone); 1266 uma_zdestroy(icl_pdu_zone); 1267 1268 return (0); 1269} 1270 1271static void 1272icl_load(void) 1273{ 1274 1275 icl_conn_zone = uma_zcreate("icl_conn", 1276 sizeof(struct icl_conn), NULL, NULL, NULL, NULL, 1277 UMA_ALIGN_PTR, 0); 1278 icl_pdu_zone = uma_zcreate("icl_pdu", 1279 sizeof(struct icl_pdu), NULL, NULL, NULL, NULL, 1280 UMA_ALIGN_PTR, 0); 1281 1282 refcount_init(&icl_ncons, 0); 1283} 1284 1285static int 1286icl_modevent(module_t mod, int what, void *arg) 1287{ 1288 1289 switch (what) { 1290 case MOD_LOAD: 1291 icl_load(); 1292 return (0); 1293 case MOD_UNLOAD: 1294 return (icl_unload()); 1295 default: 1296 return (EINVAL); 1297 } 1298} 1299 1300moduledata_t icl_data = { 1301 "icl", 1302 icl_modevent, 1303 0 1304}; 1305 1306DECLARE_MODULE(icl, icl_data, SI_SUB_DRIVERS, SI_ORDER_FIRST); 1307MODULE_VERSION(icl, 1); 1308