icl.c revision 265505
1/*- 2 * Copyright (c) 2012 The FreeBSD Foundation 3 * All rights reserved. 4 * 5 * This software was developed by Edward Tomasz Napierala under sponsorship 6 * from the FreeBSD Foundation. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 * 29 * $FreeBSD: stable/10/sys/dev/iscsi/icl.c 265505 2014-05-07 07:25:47Z trasz $ 30 */ 31 32/* 33 * iSCSI Common Layer. It's used by both the initiator and target to send 34 * and receive iSCSI PDUs. 
35 */ 36 37#include <sys/param.h> 38#include <sys/capability.h> 39#include <sys/condvar.h> 40#include <sys/conf.h> 41#include <sys/file.h> 42#include <sys/kernel.h> 43#include <sys/kthread.h> 44#include <sys/lock.h> 45#include <sys/mbuf.h> 46#include <sys/mutex.h> 47#include <sys/module.h> 48#include <sys/socket.h> 49#include <sys/socketvar.h> 50#include <sys/sysctl.h> 51#include <sys/systm.h> 52#include <sys/sx.h> 53#include <sys/uio.h> 54#include <vm/uma.h> 55#include <netinet/in.h> 56#include <netinet/tcp.h> 57 58#include "icl.h" 59#include "iscsi_proto.h" 60 61SYSCTL_NODE(_kern, OID_AUTO, icl, CTLFLAG_RD, 0, "iSCSI Common Layer"); 62static int debug = 1; 63TUNABLE_INT("kern.icl.debug", &debug); 64SYSCTL_INT(_kern_icl, OID_AUTO, debug, CTLFLAG_RWTUN, 65 &debug, 1, "Enable debug messages"); 66static int coalesce = 1; 67TUNABLE_INT("kern.icl.coalesce", &coalesce); 68SYSCTL_INT(_kern_icl, OID_AUTO, coalesce, CTLFLAG_RWTUN, 69 &coalesce, 1, "Try to coalesce PDUs before sending"); 70static int partial_receive_len = 1 * 1024; /* XXX: More? */ 71TUNABLE_INT("kern.icl.partial_receive_len", &partial_receive_len); 72SYSCTL_INT(_kern_icl, OID_AUTO, partial_receive_len, CTLFLAG_RWTUN, 73 &partial_receive_len, 1 * 1024, "Minimum read size for partially received " 74 "data segment"); 75static int sendspace = 1048576; 76TUNABLE_INT("kern.icl.sendspace", &sendspace); 77SYSCTL_INT(_kern_icl, OID_AUTO, sendspace, CTLFLAG_RWTUN, 78 &sendspace, 1048576, "Default send socket buffer size"); 79static int recvspace = 1048576; 80TUNABLE_INT("kern.icl.recvspace", &recvspace); 81SYSCTL_INT(_kern_icl, OID_AUTO, recvspace, CTLFLAG_RWTUN, 82 &recvspace, 1048576, "Default receive socket buffer size"); 83 84static uma_zone_t icl_conn_zone; 85static uma_zone_t icl_pdu_zone; 86 87static volatile u_int icl_ncons; 88 89#define ICL_DEBUG(X, ...) \ 90 do { \ 91 if (debug > 1) \ 92 printf("%s: " X "\n", __func__, ## __VA_ARGS__);\ 93 } while (0) 94 95#define ICL_WARN(X, ...) 
\ 96 do { \ 97 if (debug > 0) { \ 98 printf("WARNING: %s: " X "\n", \ 99 __func__, ## __VA_ARGS__); \ 100 } \ 101 } while (0) 102 103#define ICL_CONN_LOCK(X) mtx_lock(X->ic_lock) 104#define ICL_CONN_UNLOCK(X) mtx_unlock(X->ic_lock) 105#define ICL_CONN_LOCK_ASSERT(X) mtx_assert(X->ic_lock, MA_OWNED) 106#define ICL_CONN_LOCK_ASSERT_NOT(X) mtx_assert(X->ic_lock, MA_NOTOWNED) 107 108STAILQ_HEAD(icl_pdu_stailq, icl_pdu); 109 110static void 111icl_conn_fail(struct icl_conn *ic) 112{ 113 if (ic->ic_socket == NULL) 114 return; 115 116 /* 117 * XXX 118 */ 119 ic->ic_socket->so_error = EDOOFUS; 120 (ic->ic_error)(ic); 121} 122 123static struct mbuf * 124icl_conn_receive(struct icl_conn *ic, size_t len) 125{ 126 struct uio uio; 127 struct socket *so; 128 struct mbuf *m; 129 int error, flags; 130 131 so = ic->ic_socket; 132 133 memset(&uio, 0, sizeof(uio)); 134 uio.uio_resid = len; 135 136 flags = MSG_DONTWAIT; 137 error = soreceive(so, NULL, &uio, &m, NULL, &flags); 138 if (error != 0) { 139 ICL_DEBUG("soreceive error %d", error); 140 return (NULL); 141 } 142 if (uio.uio_resid != 0) { 143 m_freem(m); 144 ICL_DEBUG("short read"); 145 return (NULL); 146 } 147 148 return (m); 149} 150 151static struct icl_pdu * 152icl_pdu_new(struct icl_conn *ic, int flags) 153{ 154 struct icl_pdu *ip; 155 156#ifdef DIAGNOSTIC 157 refcount_acquire(&ic->ic_outstanding_pdus); 158#endif 159 ip = uma_zalloc(icl_pdu_zone, flags | M_ZERO); 160 if (ip == NULL) { 161 ICL_WARN("failed to allocate %zd bytes", sizeof(*ip)); 162#ifdef DIAGNOSTIC 163 refcount_release(&ic->ic_outstanding_pdus); 164#endif 165 return (NULL); 166 } 167 168 ip->ip_conn = ic; 169 170 return (ip); 171} 172 173void 174icl_pdu_free(struct icl_pdu *ip) 175{ 176 struct icl_conn *ic; 177 178 ic = ip->ip_conn; 179 180 m_freem(ip->ip_bhs_mbuf); 181 m_freem(ip->ip_ahs_mbuf); 182 m_freem(ip->ip_data_mbuf); 183 uma_zfree(icl_pdu_zone, ip); 184#ifdef DIAGNOSTIC 185 refcount_release(&ic->ic_outstanding_pdus); 186#endif 187} 188 189/* 190 * 
Allocate icl_pdu with empty BHS to fill up by the caller. 191 */ 192struct icl_pdu * 193icl_pdu_new_bhs(struct icl_conn *ic, int flags) 194{ 195 struct icl_pdu *ip; 196 197 ip = icl_pdu_new(ic, flags); 198 if (ip == NULL) 199 return (NULL); 200 201 ip->ip_bhs_mbuf = m_getm2(NULL, sizeof(struct iscsi_bhs), 202 flags, MT_DATA, M_PKTHDR); 203 if (ip->ip_bhs_mbuf == NULL) { 204 ICL_WARN("failed to allocate %zd bytes", sizeof(*ip)); 205 icl_pdu_free(ip); 206 return (NULL); 207 } 208 ip->ip_bhs = mtod(ip->ip_bhs_mbuf, struct iscsi_bhs *); 209 memset(ip->ip_bhs, 0, sizeof(struct iscsi_bhs)); 210 ip->ip_bhs_mbuf->m_len = sizeof(struct iscsi_bhs); 211 212 return (ip); 213} 214 215static int 216icl_pdu_ahs_length(const struct icl_pdu *request) 217{ 218 219 return (request->ip_bhs->bhs_total_ahs_len * 4); 220} 221 222size_t 223icl_pdu_data_segment_length(const struct icl_pdu *request) 224{ 225 uint32_t len = 0; 226 227 len += request->ip_bhs->bhs_data_segment_len[0]; 228 len <<= 8; 229 len += request->ip_bhs->bhs_data_segment_len[1]; 230 len <<= 8; 231 len += request->ip_bhs->bhs_data_segment_len[2]; 232 233 return (len); 234} 235 236static void 237icl_pdu_set_data_segment_length(struct icl_pdu *response, uint32_t len) 238{ 239 240 response->ip_bhs->bhs_data_segment_len[2] = len; 241 response->ip_bhs->bhs_data_segment_len[1] = len >> 8; 242 response->ip_bhs->bhs_data_segment_len[0] = len >> 16; 243} 244 245static size_t 246icl_pdu_padding(const struct icl_pdu *ip) 247{ 248 249 if ((ip->ip_data_len % 4) != 0) 250 return (4 - (ip->ip_data_len % 4)); 251 252 return (0); 253} 254 255static size_t 256icl_pdu_size(const struct icl_pdu *response) 257{ 258 size_t len; 259 260 KASSERT(response->ip_ahs_len == 0, ("responding with AHS")); 261 262 len = sizeof(struct iscsi_bhs) + response->ip_data_len + 263 icl_pdu_padding(response); 264 if (response->ip_conn->ic_header_crc32c) 265 len += ISCSI_HEADER_DIGEST_SIZE; 266 if (response->ip_data_len != 0 && response->ip_conn->ic_data_crc32c) 
267 len += ISCSI_DATA_DIGEST_SIZE; 268 269 return (len); 270} 271 272static int 273icl_pdu_receive_bhs(struct icl_pdu *request, size_t *availablep) 274{ 275 struct mbuf *m; 276 277 m = icl_conn_receive(request->ip_conn, sizeof(struct iscsi_bhs)); 278 if (m == NULL) { 279 ICL_DEBUG("failed to receive BHS"); 280 return (-1); 281 } 282 283 request->ip_bhs_mbuf = m_pullup(m, sizeof(struct iscsi_bhs)); 284 if (request->ip_bhs_mbuf == NULL) { 285 ICL_WARN("m_pullup failed"); 286 return (-1); 287 } 288 request->ip_bhs = mtod(request->ip_bhs_mbuf, struct iscsi_bhs *); 289 290 /* 291 * XXX: For architectures with strict alignment requirements 292 * we may need to allocate ip_bhs and copy the data into it. 293 * For some reason, though, not doing this doesn't seem 294 * to cause problems; tested on sparc64. 295 */ 296 297 *availablep -= sizeof(struct iscsi_bhs); 298 return (0); 299} 300 301static int 302icl_pdu_receive_ahs(struct icl_pdu *request, size_t *availablep) 303{ 304 305 request->ip_ahs_len = icl_pdu_ahs_length(request); 306 if (request->ip_ahs_len == 0) 307 return (0); 308 309 request->ip_ahs_mbuf = icl_conn_receive(request->ip_conn, 310 request->ip_ahs_len); 311 if (request->ip_ahs_mbuf == NULL) { 312 ICL_DEBUG("failed to receive AHS"); 313 return (-1); 314 } 315 316 *availablep -= request->ip_ahs_len; 317 return (0); 318} 319 320static uint32_t 321icl_mbuf_to_crc32c(const struct mbuf *m0) 322{ 323 uint32_t digest = 0xffffffff; 324 const struct mbuf *m; 325 326 for (m = m0; m != NULL; m = m->m_next) 327 digest = calculate_crc32c(digest, 328 mtod(m, const void *), m->m_len); 329 330 digest = digest ^ 0xffffffff; 331 332 return (digest); 333} 334 335static int 336icl_pdu_check_header_digest(struct icl_pdu *request, size_t *availablep) 337{ 338 struct mbuf *m; 339 uint32_t received_digest, valid_digest; 340 341 if (request->ip_conn->ic_header_crc32c == false) 342 return (0); 343 344 m = icl_conn_receive(request->ip_conn, ISCSI_HEADER_DIGEST_SIZE); 345 if (m == NULL) 
{ 346 ICL_DEBUG("failed to receive header digest"); 347 return (-1); 348 } 349 350 CTASSERT(sizeof(received_digest) == ISCSI_HEADER_DIGEST_SIZE); 351 m_copydata(m, 0, ISCSI_HEADER_DIGEST_SIZE, (void *)&received_digest); 352 m_freem(m); 353 354 *availablep -= ISCSI_HEADER_DIGEST_SIZE; 355 356 /* 357 * XXX: Handle AHS. 358 */ 359 valid_digest = icl_mbuf_to_crc32c(request->ip_bhs_mbuf); 360 if (received_digest != valid_digest) { 361 ICL_WARN("header digest check failed; got 0x%x, " 362 "should be 0x%x", received_digest, valid_digest); 363 return (-1); 364 } 365 366 return (0); 367} 368 369/* 370 * Return the number of bytes that should be waiting in the receive socket 371 * before icl_pdu_receive_data_segment() gets called. 372 */ 373static size_t 374icl_pdu_data_segment_receive_len(const struct icl_pdu *request) 375{ 376 size_t len; 377 378 len = icl_pdu_data_segment_length(request); 379 if (len == 0) 380 return (0); 381 382 /* 383 * Account for the parts of data segment already read from 384 * the socket buffer. 385 */ 386 KASSERT(len > request->ip_data_len, ("len <= request->ip_data_len")); 387 len -= request->ip_data_len; 388 389 /* 390 * Don't always wait for the full data segment to be delivered 391 * to the socket; this might badly affect performance due to 392 * TCP window scaling. 393 */ 394 if (len > partial_receive_len) { 395#if 0 396 ICL_DEBUG("need %zd bytes of data, limiting to %zd", 397 len, partial_receive_len)); 398#endif 399 len = partial_receive_len; 400 401 return (len); 402 } 403 404 /* 405 * Account for padding. Note that due to the way code is written, 406 * the icl_pdu_receive_data_segment() must always receive padding 407 * along with the last part of data segment, because it would be 408 * impossible to tell whether we've already received the full data 409 * segment including padding, or without it. 
410 */ 411 if ((len % 4) != 0) 412 len += 4 - (len % 4); 413 414#if 0 415 ICL_DEBUG("need %zd bytes of data", len)); 416#endif 417 418 return (len); 419} 420 421static int 422icl_pdu_receive_data_segment(struct icl_pdu *request, 423 size_t *availablep, bool *more_neededp) 424{ 425 struct icl_conn *ic; 426 size_t len, padding = 0; 427 struct mbuf *m; 428 429 ic = request->ip_conn; 430 431 *more_neededp = false; 432 ic->ic_receive_len = 0; 433 434 len = icl_pdu_data_segment_length(request); 435 if (len == 0) 436 return (0); 437 438 if ((len % 4) != 0) 439 padding = 4 - (len % 4); 440 441 /* 442 * Account for already received parts of data segment. 443 */ 444 KASSERT(len > request->ip_data_len, ("len <= request->ip_data_len")); 445 len -= request->ip_data_len; 446 447 if (len + padding > *availablep) { 448 /* 449 * Not enough data in the socket buffer. Receive as much 450 * as we can. Don't receive padding, since, obviously, it's 451 * not the end of data segment yet. 452 */ 453#if 0 454 ICL_DEBUG("limited from %zd to %zd", 455 len + padding, *availablep - padding)); 456#endif 457 len = *availablep - padding; 458 *more_neededp = true; 459 padding = 0; 460 } 461 462 /* 463 * Must not try to receive padding without at least one byte 464 * of actual data segment. 
465 */ 466 if (len > 0) { 467 m = icl_conn_receive(request->ip_conn, len + padding); 468 if (m == NULL) { 469 ICL_DEBUG("failed to receive data segment"); 470 return (-1); 471 } 472 473 if (request->ip_data_mbuf == NULL) 474 request->ip_data_mbuf = m; 475 else 476 m_cat(request->ip_data_mbuf, m); 477 478 request->ip_data_len += len; 479 *availablep -= len + padding; 480 } else 481 ICL_DEBUG("len 0"); 482 483 if (*more_neededp) 484 ic->ic_receive_len = 485 icl_pdu_data_segment_receive_len(request); 486 487 return (0); 488} 489 490static int 491icl_pdu_check_data_digest(struct icl_pdu *request, size_t *availablep) 492{ 493 struct mbuf *m; 494 uint32_t received_digest, valid_digest; 495 496 if (request->ip_conn->ic_data_crc32c == false) 497 return (0); 498 499 if (request->ip_data_len == 0) 500 return (0); 501 502 m = icl_conn_receive(request->ip_conn, ISCSI_DATA_DIGEST_SIZE); 503 if (m == NULL) { 504 ICL_DEBUG("failed to receive data digest"); 505 return (-1); 506 } 507 508 CTASSERT(sizeof(received_digest) == ISCSI_DATA_DIGEST_SIZE); 509 m_copydata(m, 0, ISCSI_DATA_DIGEST_SIZE, (void *)&received_digest); 510 m_freem(m); 511 512 *availablep -= ISCSI_DATA_DIGEST_SIZE; 513 514 /* 515 * Note that ip_data_mbuf also contains padding; since digest 516 * calculation is supposed to include that, we iterate over 517 * the entire ip_data_mbuf chain, not just ip_data_len bytes of it. 518 */ 519 valid_digest = icl_mbuf_to_crc32c(request->ip_data_mbuf); 520 if (received_digest != valid_digest) { 521 ICL_WARN("data digest check failed; got 0x%x, " 522 "should be 0x%x", received_digest, valid_digest); 523 return (-1); 524 } 525 526 return (0); 527} 528 529/* 530 * Somewhat contrary to the name, this attempts to receive only one 531 * "part" of PDU at a time; call it repeatedly until it returns non-NULL. 
532 */ 533static struct icl_pdu * 534icl_conn_receive_pdu(struct icl_conn *ic, size_t *availablep) 535{ 536 struct icl_pdu *request; 537 struct socket *so; 538 size_t len; 539 int error; 540 bool more_needed; 541 542 so = ic->ic_socket; 543 544 if (ic->ic_receive_state == ICL_CONN_STATE_BHS) { 545 KASSERT(ic->ic_receive_pdu == NULL, 546 ("ic->ic_receive_pdu != NULL")); 547 request = icl_pdu_new(ic, M_NOWAIT); 548 if (request == NULL) { 549 ICL_DEBUG("failed to allocate PDU; " 550 "dropping connection"); 551 icl_conn_fail(ic); 552 return (NULL); 553 } 554 ic->ic_receive_pdu = request; 555 } else { 556 KASSERT(ic->ic_receive_pdu != NULL, 557 ("ic->ic_receive_pdu == NULL")); 558 request = ic->ic_receive_pdu; 559 } 560 561 if (*availablep < ic->ic_receive_len) { 562#if 0 563 ICL_DEBUG("not enough data; need %zd, " 564 "have %zd", ic->ic_receive_len, *availablep); 565#endif 566 return (NULL); 567 } 568 569 switch (ic->ic_receive_state) { 570 case ICL_CONN_STATE_BHS: 571 //ICL_DEBUG("receiving BHS"); 572 error = icl_pdu_receive_bhs(request, availablep); 573 if (error != 0) { 574 ICL_DEBUG("failed to receive BHS; " 575 "dropping connection"); 576 break; 577 } 578 579 /* 580 * We don't enforce any limit for AHS length; 581 * its length is stored in 8 bit field. 
582 */ 583 584 len = icl_pdu_data_segment_length(request); 585 if (len > ic->ic_max_data_segment_length) { 586 ICL_WARN("received data segment " 587 "length %zd is larger than negotiated " 588 "MaxDataSegmentLength %zd; " 589 "dropping connection", 590 len, ic->ic_max_data_segment_length); 591 error = EINVAL; 592 break; 593 } 594 595 ic->ic_receive_state = ICL_CONN_STATE_AHS; 596 ic->ic_receive_len = icl_pdu_ahs_length(request); 597 break; 598 599 case ICL_CONN_STATE_AHS: 600 //ICL_DEBUG("receiving AHS"); 601 error = icl_pdu_receive_ahs(request, availablep); 602 if (error != 0) { 603 ICL_DEBUG("failed to receive AHS; " 604 "dropping connection"); 605 break; 606 } 607 ic->ic_receive_state = ICL_CONN_STATE_HEADER_DIGEST; 608 if (ic->ic_header_crc32c == false) 609 ic->ic_receive_len = 0; 610 else 611 ic->ic_receive_len = ISCSI_HEADER_DIGEST_SIZE; 612 break; 613 614 case ICL_CONN_STATE_HEADER_DIGEST: 615 //ICL_DEBUG("receiving header digest"); 616 error = icl_pdu_check_header_digest(request, availablep); 617 if (error != 0) { 618 ICL_DEBUG("header digest failed; " 619 "dropping connection"); 620 break; 621 } 622 623 ic->ic_receive_state = ICL_CONN_STATE_DATA; 624 ic->ic_receive_len = 625 icl_pdu_data_segment_receive_len(request); 626 break; 627 628 case ICL_CONN_STATE_DATA: 629 //ICL_DEBUG("receiving data segment"); 630 error = icl_pdu_receive_data_segment(request, availablep, 631 &more_needed); 632 if (error != 0) { 633 ICL_DEBUG("failed to receive data segment;" 634 "dropping connection"); 635 break; 636 } 637 638 if (more_needed) 639 break; 640 641 ic->ic_receive_state = ICL_CONN_STATE_DATA_DIGEST; 642 if (request->ip_data_len == 0 || ic->ic_data_crc32c == false) 643 ic->ic_receive_len = 0; 644 else 645 ic->ic_receive_len = ISCSI_DATA_DIGEST_SIZE; 646 break; 647 648 case ICL_CONN_STATE_DATA_DIGEST: 649 //ICL_DEBUG("receiving data digest"); 650 error = icl_pdu_check_data_digest(request, availablep); 651 if (error != 0) { 652 ICL_DEBUG("data digest failed; " 653 
"dropping connection"); 654 break; 655 } 656 657 /* 658 * We've received complete PDU; reset the receive state machine 659 * and return the PDU. 660 */ 661 ic->ic_receive_state = ICL_CONN_STATE_BHS; 662 ic->ic_receive_len = sizeof(struct iscsi_bhs); 663 ic->ic_receive_pdu = NULL; 664 return (request); 665 666 default: 667 panic("invalid ic_receive_state %d\n", ic->ic_receive_state); 668 } 669 670 if (error != 0) { 671 icl_pdu_free(request); 672 icl_conn_fail(ic); 673 } 674 675 return (NULL); 676} 677 678static void 679icl_conn_receive_pdus(struct icl_conn *ic, size_t available) 680{ 681 struct icl_pdu *response; 682 struct socket *so; 683 684 so = ic->ic_socket; 685 686 /* 687 * This can never happen; we're careful to only mess with ic->ic_socket 688 * pointer when the send/receive threads are not running. 689 */ 690 KASSERT(so != NULL, ("NULL socket")); 691 692 for (;;) { 693 if (ic->ic_disconnecting) 694 return; 695 696 if (so->so_error != 0) { 697 ICL_DEBUG("connection error %d; " 698 "dropping connection", so->so_error); 699 icl_conn_fail(ic); 700 return; 701 } 702 703 /* 704 * Loop until we have a complete PDU or there is not enough 705 * data in the socket buffer. 
706 */ 707 if (available < ic->ic_receive_len) { 708#if 0 709 ICL_DEBUG("not enough data; have %zd, " 710 "need %zd", available, 711 ic->ic_receive_len); 712#endif 713 return; 714 } 715 716 response = icl_conn_receive_pdu(ic, &available); 717 if (response == NULL) 718 continue; 719 720 if (response->ip_ahs_len > 0) { 721 ICL_WARN("received PDU with unsupported " 722 "AHS; opcode 0x%x; dropping connection", 723 response->ip_bhs->bhs_opcode); 724 icl_pdu_free(response); 725 icl_conn_fail(ic); 726 return; 727 } 728 729 (ic->ic_receive)(response); 730 } 731} 732 733static void 734icl_receive_thread(void *arg) 735{ 736 struct icl_conn *ic; 737 size_t available; 738 struct socket *so; 739 740 ic = arg; 741 so = ic->ic_socket; 742 743 ICL_CONN_LOCK(ic); 744 ic->ic_receive_running = true; 745 ICL_CONN_UNLOCK(ic); 746 747 for (;;) { 748 if (ic->ic_disconnecting) { 749 //ICL_DEBUG("terminating"); 750 break; 751 } 752 753 SOCKBUF_LOCK(&so->so_rcv); 754 available = so->so_rcv.sb_cc; 755 if (available < ic->ic_receive_len) { 756 so->so_rcv.sb_lowat = ic->ic_receive_len; 757 cv_wait(&ic->ic_receive_cv, &so->so_rcv.sb_mtx); 758 } 759 SOCKBUF_UNLOCK(&so->so_rcv); 760 761 icl_conn_receive_pdus(ic, available); 762 } 763 764 ICL_CONN_LOCK(ic); 765 ic->ic_receive_running = false; 766 ICL_CONN_UNLOCK(ic); 767 kthread_exit(); 768} 769 770static int 771icl_soupcall_receive(struct socket *so, void *arg, int waitflag) 772{ 773 struct icl_conn *ic; 774 775 ic = arg; 776 cv_signal(&ic->ic_receive_cv); 777 return (SU_OK); 778} 779 780static int 781icl_pdu_finalize(struct icl_pdu *request) 782{ 783 size_t padding, pdu_len; 784 uint32_t digest, zero = 0; 785 int ok; 786 struct icl_conn *ic; 787 788 ic = request->ip_conn; 789 790 icl_pdu_set_data_segment_length(request, request->ip_data_len); 791 792 pdu_len = icl_pdu_size(request); 793 794 if (ic->ic_header_crc32c) { 795 digest = icl_mbuf_to_crc32c(request->ip_bhs_mbuf); 796 ok = m_append(request->ip_bhs_mbuf, sizeof(digest), 797 (void 
*)&digest); 798 if (ok != 1) { 799 ICL_WARN("failed to append header digest"); 800 return (1); 801 } 802 } 803 804 if (request->ip_data_len != 0) { 805 padding = icl_pdu_padding(request); 806 if (padding > 0) { 807 ok = m_append(request->ip_data_mbuf, padding, 808 (void *)&zero); 809 if (ok != 1) { 810 ICL_WARN("failed to append padding"); 811 return (1); 812 } 813 } 814 815 if (ic->ic_data_crc32c) { 816 digest = icl_mbuf_to_crc32c(request->ip_data_mbuf); 817 818 ok = m_append(request->ip_data_mbuf, sizeof(digest), 819 (void *)&digest); 820 if (ok != 1) { 821 ICL_WARN("failed to append data digest"); 822 return (1); 823 } 824 } 825 826 m_cat(request->ip_bhs_mbuf, request->ip_data_mbuf); 827 request->ip_data_mbuf = NULL; 828 } 829 830 request->ip_bhs_mbuf->m_pkthdr.len = pdu_len; 831 832 return (0); 833} 834 835static void 836icl_conn_send_pdus(struct icl_conn *ic, struct icl_pdu_stailq *queue) 837{ 838 struct icl_pdu *request, *request2; 839 struct socket *so; 840 size_t available, size, size2; 841 int coalesced, error; 842 843 ICL_CONN_LOCK_ASSERT_NOT(ic); 844 845 so = ic->ic_socket; 846 847 SOCKBUF_LOCK(&so->so_snd); 848 /* 849 * Check how much space do we have for transmit. We can't just 850 * call sosend() and retry when we get EWOULDBLOCK or EMSGSIZE, 851 * as it always frees the mbuf chain passed to it, even in case 852 * of error. 853 */ 854 available = sbspace(&so->so_snd); 855 856 /* 857 * Notify the socket layer that it doesn't need to call 858 * send socket upcall for the time being. 
859 */ 860 so->so_snd.sb_lowat = so->so_snd.sb_hiwat; 861 SOCKBUF_UNLOCK(&so->so_snd); 862 863 while (!STAILQ_EMPTY(queue)) { 864 if (ic->ic_disconnecting) 865 return; 866 request = STAILQ_FIRST(queue); 867 size = icl_pdu_size(request); 868 if (available < size) { 869#if 1 870 ICL_DEBUG("no space to send; " 871 "have %zd, need %zd", 872 available, size); 873#endif 874 875 /* 876 * Set the low watermark on the socket, 877 * to avoid unneccessary wakeups until there 878 * is enough space for the PDU to fit. 879 */ 880 SOCKBUF_LOCK(&so->so_snd); 881 so->so_snd.sb_lowat = size; 882 SOCKBUF_UNLOCK(&so->so_snd); 883 return; 884 } 885 STAILQ_REMOVE_HEAD(queue, ip_next); 886 error = icl_pdu_finalize(request); 887 if (error != 0) { 888 ICL_DEBUG("failed to finalize PDU; " 889 "dropping connection"); 890 icl_conn_fail(ic); 891 icl_pdu_free(request); 892 return; 893 } 894 if (coalesce) { 895 coalesced = 1; 896 for (;;) { 897 request2 = STAILQ_FIRST(queue); 898 if (request2 == NULL) 899 break; 900 size2 = icl_pdu_size(request2); 901 if (available < size + size2) 902 break; 903 STAILQ_REMOVE_HEAD(queue, ip_next); 904 error = icl_pdu_finalize(request2); 905 if (error != 0) { 906 ICL_DEBUG("failed to finalize PDU; " 907 "dropping connection"); 908 icl_conn_fail(ic); 909 icl_pdu_free(request); 910 icl_pdu_free(request2); 911 return; 912 } 913 m_cat(request->ip_bhs_mbuf, request2->ip_bhs_mbuf); 914 request2->ip_bhs_mbuf = NULL; 915 request->ip_bhs_mbuf->m_pkthdr.len += size2; 916 size += size2; 917 STAILQ_REMOVE_AFTER(queue, request, ip_next); 918 icl_pdu_free(request2); 919 coalesced++; 920 } 921#if 0 922 if (coalesced > 1) { 923 ICL_DEBUG("coalesced %d PDUs into %zd bytes", 924 coalesced, size); 925 } 926#endif 927 } 928 available -= size; 929 error = sosend(so, NULL, NULL, request->ip_bhs_mbuf, 930 NULL, MSG_DONTWAIT, curthread); 931 request->ip_bhs_mbuf = NULL; /* Sosend consumes the mbuf. 
*/ 932 if (error != 0) { 933 ICL_DEBUG("failed to send PDU, error %d; " 934 "dropping connection", error); 935 icl_conn_fail(ic); 936 icl_pdu_free(request); 937 return; 938 } 939 icl_pdu_free(request); 940 } 941} 942 943static void 944icl_send_thread(void *arg) 945{ 946 struct icl_conn *ic; 947 struct icl_pdu_stailq queue; 948 949 ic = arg; 950 951 STAILQ_INIT(&queue); 952 953 ICL_CONN_LOCK(ic); 954 ic->ic_send_running = true; 955 956 for (;;) { 957 if (ic->ic_disconnecting) { 958 //ICL_DEBUG("terminating"); 959 break; 960 } 961 962 for (;;) { 963 /* 964 * If the local queue is empty, populate it from 965 * the main one. This way the icl_conn_send_pdus() 966 * can go through all the queued PDUs without holding 967 * any locks. 968 */ 969 if (STAILQ_EMPTY(&queue)) 970 STAILQ_SWAP(&ic->ic_to_send, &queue, icl_pdu); 971 972 ic->ic_check_send_space = false; 973 ICL_CONN_UNLOCK(ic); 974 icl_conn_send_pdus(ic, &queue); 975 ICL_CONN_LOCK(ic); 976 977 /* 978 * The icl_soupcall_send() was called since the last 979 * call to sbspace(); go around; 980 */ 981 if (ic->ic_check_send_space) 982 continue; 983 984 /* 985 * Local queue is empty, but we still have PDUs 986 * in the main one; go around. 987 */ 988 if (STAILQ_EMPTY(&queue) && 989 !STAILQ_EMPTY(&ic->ic_to_send)) 990 continue; 991 992 /* 993 * There might be some stuff in the local queue, 994 * which didn't get sent due to not having enough send 995 * space. Wait for socket upcall. 996 */ 997 break; 998 } 999 1000 cv_wait(&ic->ic_send_cv, ic->ic_lock); 1001 } 1002 1003 /* 1004 * We're exiting; move PDUs back to the main queue, so they can 1005 * get freed properly. At this point ordering doesn't matter. 
1006 */ 1007 STAILQ_CONCAT(&ic->ic_to_send, &queue); 1008 1009 ic->ic_send_running = false; 1010 ICL_CONN_UNLOCK(ic); 1011 kthread_exit(); 1012} 1013 1014static int 1015icl_soupcall_send(struct socket *so, void *arg, int waitflag) 1016{ 1017 struct icl_conn *ic; 1018 1019 ic = arg; 1020 1021 ICL_CONN_LOCK(ic); 1022 ic->ic_check_send_space = true; 1023 ICL_CONN_UNLOCK(ic); 1024 1025 cv_signal(&ic->ic_send_cv); 1026 1027 return (SU_OK); 1028} 1029 1030int 1031icl_pdu_append_data(struct icl_pdu *request, const void *addr, size_t len, 1032 int flags) 1033{ 1034 struct mbuf *mb, *newmb; 1035 size_t copylen, off = 0; 1036 1037 KASSERT(len > 0, ("len == 0")); 1038 1039 newmb = m_getm2(NULL, len, flags, MT_DATA, M_PKTHDR); 1040 if (newmb == NULL) { 1041 ICL_WARN("failed to allocate mbuf for %zd bytes", len); 1042 return (ENOMEM); 1043 } 1044 1045 for (mb = newmb; mb != NULL; mb = mb->m_next) { 1046 copylen = min(M_TRAILINGSPACE(mb), len - off); 1047 memcpy(mtod(mb, char *), (const char *)addr + off, copylen); 1048 mb->m_len = copylen; 1049 off += copylen; 1050 } 1051 KASSERT(off == len, ("%s: off != len", __func__)); 1052 1053 if (request->ip_data_mbuf == NULL) { 1054 request->ip_data_mbuf = newmb; 1055 request->ip_data_len = len; 1056 } else { 1057 m_cat(request->ip_data_mbuf, newmb); 1058 request->ip_data_len += len; 1059 } 1060 1061 return (0); 1062} 1063 1064void 1065icl_pdu_get_data(struct icl_pdu *ip, size_t off, void *addr, size_t len) 1066{ 1067 1068 m_copydata(ip->ip_data_mbuf, off, len, addr); 1069} 1070 1071void 1072icl_pdu_queue(struct icl_pdu *ip) 1073{ 1074 struct icl_conn *ic; 1075 1076 ic = ip->ip_conn; 1077 1078 ICL_CONN_LOCK_ASSERT(ic); 1079 1080 if (ic->ic_disconnecting || ic->ic_socket == NULL) { 1081 ICL_DEBUG("icl_pdu_queue on closed connection"); 1082 icl_pdu_free(ip); 1083 return; 1084 } 1085 1086 if (!STAILQ_EMPTY(&ic->ic_to_send)) { 1087 STAILQ_INSERT_TAIL(&ic->ic_to_send, ip, ip_next); 1088 /* 1089 * If the queue is not empty, someone else had 
already 1090 * signaled the send thread; no need to do that again, 1091 * just return. 1092 */ 1093 return; 1094 } 1095 1096 STAILQ_INSERT_TAIL(&ic->ic_to_send, ip, ip_next); 1097 cv_signal(&ic->ic_send_cv); 1098} 1099 1100struct icl_conn * 1101icl_conn_new(const char *name, struct mtx *lock) 1102{ 1103 struct icl_conn *ic; 1104 1105 refcount_acquire(&icl_ncons); 1106 1107 ic = uma_zalloc(icl_conn_zone, M_WAITOK | M_ZERO); 1108 1109 STAILQ_INIT(&ic->ic_to_send); 1110 ic->ic_lock = lock; 1111 cv_init(&ic->ic_send_cv, "icl_tx"); 1112 cv_init(&ic->ic_receive_cv, "icl_rx"); 1113#ifdef DIAGNOSTIC 1114 refcount_init(&ic->ic_outstanding_pdus, 0); 1115#endif 1116 ic->ic_max_data_segment_length = ICL_MAX_DATA_SEGMENT_LENGTH; 1117 ic->ic_name = name; 1118 1119 return (ic); 1120} 1121 1122void 1123icl_conn_free(struct icl_conn *ic) 1124{ 1125 1126 cv_destroy(&ic->ic_send_cv); 1127 cv_destroy(&ic->ic_receive_cv); 1128 uma_zfree(icl_conn_zone, ic); 1129 refcount_release(&icl_ncons); 1130} 1131 1132static int 1133icl_conn_start(struct icl_conn *ic) 1134{ 1135 size_t minspace; 1136 struct sockopt opt; 1137 int error, one = 1; 1138 1139 ICL_CONN_LOCK(ic); 1140 1141 /* 1142 * XXX: Ugly hack. 1143 */ 1144 if (ic->ic_socket == NULL) { 1145 ICL_CONN_UNLOCK(ic); 1146 return (EINVAL); 1147 } 1148 1149 ic->ic_receive_state = ICL_CONN_STATE_BHS; 1150 ic->ic_receive_len = sizeof(struct iscsi_bhs); 1151 ic->ic_disconnecting = false; 1152 1153 ICL_CONN_UNLOCK(ic); 1154 1155 /* 1156 * For sendspace, this is required because the current code cannot 1157 * send a PDU in pieces; thus, the minimum buffer size is equal 1158 * to the maximum PDU size. "+4" is to account for possible padding. 1159 * 1160 * What we should actually do here is to use autoscaling, but set 1161 * some minimal buffer size to "minspace". I don't know a way to do 1162 * that, though. 
1163 */ 1164 minspace = sizeof(struct iscsi_bhs) + ic->ic_max_data_segment_length + 1165 ISCSI_HEADER_DIGEST_SIZE + ISCSI_DATA_DIGEST_SIZE + 4; 1166 if (sendspace < minspace) { 1167 ICL_WARN("kern.icl.sendspace too low; must be at least %zd", 1168 minspace); 1169 sendspace = minspace; 1170 } 1171 if (recvspace < minspace) { 1172 ICL_WARN("kern.icl.recvspace too low; must be at least %zd", 1173 minspace); 1174 recvspace = minspace; 1175 } 1176 1177 error = soreserve(ic->ic_socket, sendspace, recvspace); 1178 if (error != 0) { 1179 ICL_WARN("soreserve failed with error %d", error); 1180 icl_conn_close(ic); 1181 return (error); 1182 } 1183 1184 /* 1185 * Disable Nagle. 1186 */ 1187 bzero(&opt, sizeof(opt)); 1188 opt.sopt_dir = SOPT_SET; 1189 opt.sopt_level = IPPROTO_TCP; 1190 opt.sopt_name = TCP_NODELAY; 1191 opt.sopt_val = &one; 1192 opt.sopt_valsize = sizeof(one); 1193 error = sosetopt(ic->ic_socket, &opt); 1194 if (error != 0) { 1195 ICL_WARN("disabling TCP_NODELAY failed with error %d", error); 1196 icl_conn_close(ic); 1197 return (error); 1198 } 1199 1200 /* 1201 * Start threads. 1202 */ 1203 error = kthread_add(icl_send_thread, ic, NULL, NULL, 0, 0, "%stx", 1204 ic->ic_name); 1205 if (error != 0) { 1206 ICL_WARN("kthread_add(9) failed with error %d", error); 1207 icl_conn_close(ic); 1208 return (error); 1209 } 1210 1211 error = kthread_add(icl_receive_thread, ic, NULL, NULL, 0, 0, "%srx", 1212 ic->ic_name); 1213 if (error != 0) { 1214 ICL_WARN("kthread_add(9) failed with error %d", error); 1215 icl_conn_close(ic); 1216 return (error); 1217 } 1218 1219 /* 1220 * Register socket upcall, to get notified about incoming PDUs 1221 * and free space to send outgoing ones. 
1222 */ 1223 SOCKBUF_LOCK(&ic->ic_socket->so_snd); 1224 soupcall_set(ic->ic_socket, SO_SND, icl_soupcall_send, ic); 1225 SOCKBUF_UNLOCK(&ic->ic_socket->so_snd); 1226 SOCKBUF_LOCK(&ic->ic_socket->so_rcv); 1227 soupcall_set(ic->ic_socket, SO_RCV, icl_soupcall_receive, ic); 1228 SOCKBUF_UNLOCK(&ic->ic_socket->so_rcv); 1229 1230 return (0); 1231} 1232 1233int 1234icl_conn_handoff(struct icl_conn *ic, int fd) 1235{ 1236 struct file *fp; 1237 struct socket *so; 1238 cap_rights_t rights; 1239 int error; 1240 1241 ICL_CONN_LOCK_ASSERT_NOT(ic); 1242 1243 /* 1244 * Steal the socket from userland. 1245 */ 1246 error = fget(curthread, fd, 1247 cap_rights_init(&rights, CAP_SOCK_CLIENT), &fp); 1248 if (error != 0) 1249 return (error); 1250 if (fp->f_type != DTYPE_SOCKET) { 1251 fdrop(fp, curthread); 1252 return (EINVAL); 1253 } 1254 so = fp->f_data; 1255 if (so->so_type != SOCK_STREAM) { 1256 fdrop(fp, curthread); 1257 return (EINVAL); 1258 } 1259 1260 ICL_CONN_LOCK(ic); 1261 1262 if (ic->ic_socket != NULL) { 1263 ICL_CONN_UNLOCK(ic); 1264 fdrop(fp, curthread); 1265 return (EBUSY); 1266 } 1267 1268 ic->ic_socket = fp->f_data; 1269 fp->f_ops = &badfileops; 1270 fp->f_data = NULL; 1271 fdrop(fp, curthread); 1272 ICL_CONN_UNLOCK(ic); 1273 1274 error = icl_conn_start(ic); 1275 1276 return (error); 1277} 1278 1279void 1280icl_conn_shutdown(struct icl_conn *ic) 1281{ 1282 ICL_CONN_LOCK_ASSERT_NOT(ic); 1283 1284 ICL_CONN_LOCK(ic); 1285 if (ic->ic_socket == NULL) { 1286 ICL_CONN_UNLOCK(ic); 1287 return; 1288 } 1289 ICL_CONN_UNLOCK(ic); 1290 1291 soshutdown(ic->ic_socket, SHUT_RDWR); 1292} 1293 1294void 1295icl_conn_close(struct icl_conn *ic) 1296{ 1297 struct icl_pdu *pdu; 1298 1299 ICL_CONN_LOCK_ASSERT_NOT(ic); 1300 1301 ICL_CONN_LOCK(ic); 1302 if (ic->ic_socket == NULL) { 1303 ICL_CONN_UNLOCK(ic); 1304 return; 1305 } 1306 1307 /* 1308 * Deregister socket upcalls. 
 */
	/*
	 * NOTE(review): the connection lock is dropped before taking the
	 * socket-buffer locks, presumably to respect lock ordering --
	 * confirm against the upcall paths which run under sockbuf locks.
	 */
	ICL_CONN_UNLOCK(ic);
	SOCKBUF_LOCK(&ic->ic_socket->so_snd);
	if (ic->ic_socket->so_snd.sb_upcall != NULL)
		soupcall_clear(ic->ic_socket, SO_SND);
	SOCKBUF_UNLOCK(&ic->ic_socket->so_snd);
	SOCKBUF_LOCK(&ic->ic_socket->so_rcv);
	if (ic->ic_socket->so_rcv.sb_upcall != NULL)
		soupcall_clear(ic->ic_socket, SO_RCV);
	SOCKBUF_UNLOCK(&ic->ic_socket->so_rcv);
	ICL_CONN_LOCK(ic);

	/* Tell the worker threads to exit. */
	ic->ic_disconnecting = true;

	/*
	 * Wake up the threads, so they can properly terminate.
	 * Keep re-signalling and sleeping for a second at a time until
	 * both threads have cleared their "running" flags.
	 */
	cv_signal(&ic->ic_receive_cv);
	cv_signal(&ic->ic_send_cv);
	while (ic->ic_receive_running || ic->ic_send_running) {
		//ICL_DEBUG("waiting for send/receive threads to terminate");
		ICL_CONN_UNLOCK(ic);
		cv_signal(&ic->ic_receive_cv);
		cv_signal(&ic->ic_send_cv);
		pause("icl_close", 1 * hz);
		ICL_CONN_LOCK(ic);
	}
	//ICL_DEBUG("send/receive threads terminated");

	/*
	 * NOTE(review): the lock is dropped around soclose(), presumably
	 * because it may sleep -- confirm.
	 */
	ICL_CONN_UNLOCK(ic);
	soclose(ic->ic_socket);
	ICL_CONN_LOCK(ic);
	ic->ic_socket = NULL;

	/* Discard a PDU that was only partially received when we closed. */
	if (ic->ic_receive_pdu != NULL) {
		//ICL_DEBUG("freeing partially received PDU");
		icl_pdu_free(ic->ic_receive_pdu);
		ic->ic_receive_pdu = NULL;
	}

	/*
	 * Remove any outstanding PDUs from the send queue.
 */
	while (!STAILQ_EMPTY(&ic->ic_to_send)) {
		pdu = STAILQ_FIRST(&ic->ic_to_send);
		STAILQ_REMOVE_HEAD(&ic->ic_to_send, ip_next);
		icl_pdu_free(pdu);
	}

	KASSERT(STAILQ_EMPTY(&ic->ic_to_send),
	    ("destroying session with non-empty send queue"));
#ifdef DIAGNOSTIC
	KASSERT(ic->ic_outstanding_pdus == 0,
	    ("destroying session with %d outstanding PDUs",
	    ic->ic_outstanding_pdus));
#endif
	ICL_CONN_UNLOCK(ic);
}

/*
 * Return true if the connection has a socket attached and the socket
 * has no pending error; false otherwise.
 */
bool
icl_conn_connected(struct icl_conn *ic)
{
	ICL_CONN_LOCK_ASSERT_NOT(ic);

	ICL_CONN_LOCK(ic);
	if (ic->ic_socket == NULL) {
		ICL_CONN_UNLOCK(ic);
		return (false);
	}
	if (ic->ic_socket->so_error != 0) {
		ICL_CONN_UNLOCK(ic);
		return (false);
	}
	ICL_CONN_UNLOCK(ic);
	return (true);
}

#ifdef ICL_KERNEL_PROXY
/*
 * Bind an already-open kernel socket to the connection and start it.
 * Kernel-proxy counterpart of icl_conn_handoff(); the caller passes
 * the socket directly instead of a userland file descriptor.
 */
int
icl_conn_handoff_sock(struct icl_conn *ic, struct socket *so)
{
	int error;

	ICL_CONN_LOCK_ASSERT_NOT(ic);

	/* Only stream (TCP) sockets are acceptable for iSCSI. */
	if (so->so_type != SOCK_STREAM)
		return (EINVAL);

	ICL_CONN_LOCK(ic);
	if (ic->ic_socket != NULL) {
		ICL_CONN_UNLOCK(ic);
		return (EBUSY);
	}
	ic->ic_socket = so;
	ICL_CONN_UNLOCK(ic);

	error = icl_conn_start(ic);

	return (error);
}
#endif /* ICL_KERNEL_PROXY */

/*
 * Module unload handler: refuse to unload while any connections exist,
 * otherwise destroy the UMA zones.
 */
static int
icl_unload(void)
{

	if (icl_ncons != 0)
		return (EBUSY);

	uma_zdestroy(icl_conn_zone);
	uma_zdestroy(icl_pdu_zone);

	return (0);
}

/*
 * Module load handler: create the UMA zones for connection and PDU
 * allocations and zero the connection refcount.
 */
static void
icl_load(void)
{

	icl_conn_zone = uma_zcreate("icl_conn",
	    sizeof(struct icl_conn), NULL, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, 0);
	icl_pdu_zone = uma_zcreate("icl_pdu",
	    sizeof(struct icl_pdu), NULL, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, 0);

	refcount_init(&icl_ncons, 0);
}

/*
 * Module event dispatcher for the "icl" module.
 */
static int
icl_modevent(module_t mod, int what, void *arg)
{

	switch (what) {
	case
 MOD_LOAD:
		icl_load();
		return (0);
	case MOD_UNLOAD:
		return (icl_unload());
	default:
		/* Unsupported events (e.g. quiesce/shutdown). */
		return (EINVAL);
	}
}

/* Module registration: name, event handler, no private data. */
moduledata_t icl_data = {
	"icl",
	icl_modevent,
	0
};

DECLARE_MODULE(icl, icl_data, SI_SUB_DRIVERS, SI_ORDER_FIRST);
MODULE_VERSION(icl, 1);