icl.c revision 265495
1/*-
2 * Copyright (c) 2012 The FreeBSD Foundation
3 * All rights reserved.
4 *
5 * This software was developed by Edward Tomasz Napierala under sponsorship
6 * from the FreeBSD Foundation.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 * $FreeBSD: stable/10/sys/dev/iscsi/icl.c 265495 2014-05-07 06:29:01Z trasz $
30 */
31
32/*
33 * iSCSI Common Layer.  It's used by both the initiator and target to send
34 * and receive iSCSI PDUs.
35 */
36
37#include <sys/param.h>
38#include <sys/capability.h>
39#include <sys/condvar.h>
40#include <sys/conf.h>
41#include <sys/file.h>
42#include <sys/kernel.h>
43#include <sys/kthread.h>
44#include <sys/lock.h>
45#include <sys/mbuf.h>
46#include <sys/mutex.h>
47#include <sys/module.h>
48#include <sys/socket.h>
49#include <sys/socketvar.h>
50#include <sys/sysctl.h>
51#include <sys/systm.h>
52#include <sys/sx.h>
53#include <sys/uio.h>
54#include <vm/uma.h>
55#include <netinet/in.h>
56#include <netinet/tcp.h>
57
58#include "icl.h"
59#include "iscsi_proto.h"
60
61SYSCTL_NODE(_kern, OID_AUTO, icl, CTLFLAG_RD, 0, "iSCSI Common Layer");
62static int debug = 1;
63TUNABLE_INT("kern.icl.debug", &debug);
64SYSCTL_INT(_kern_icl, OID_AUTO, debug, CTLFLAG_RW,
65    &debug, 1, "Enable debug messages");
66static int partial_receive_len = 1 * 1024; /* XXX: More? */
67TUNABLE_INT("kern.icl.partial_receive_len", &partial_receive_len);
68SYSCTL_INT(_kern_icl, OID_AUTO, partial_receive_len, CTLFLAG_RW,
69    &partial_receive_len, 1 * 1024, "Minimum read size for partially received "
70    "data segment");
71
72static uma_zone_t icl_conn_zone;
73static uma_zone_t icl_pdu_zone;
74
75static volatile u_int	icl_ncons;
76
/*
 * Print a debug message, prefixed with the calling function's name, when
 * the "debug" knob is greater than 1.
 *
 * Wrapped in do { } while (0) so that the expansion is exactly one
 * statement and can be used safely as the body of an unbraced if/else.
 */
#define	ICL_DEBUG(X, ...)						\
	do {								\
		if (debug > 1) {					\
			printf("%s: " X "\n", __func__, ## __VA_ARGS__);\
		}							\
	} while (0)
81
/*
 * Print a warning, prefixed with "WARNING: " and the calling function's
 * name, when the "debug" knob is greater than 0.
 *
 * Wrapped in do { } while (0) so that the expansion is exactly one
 * statement and can be used safely as the body of an unbraced if/else.
 */
#define	ICL_WARN(X, ...)						\
	do {								\
		if (debug > 0) {					\
			printf("WARNING: %s: " X "\n",			\
			    __func__, ## __VA_ARGS__);			\
		}							\
	} while (0)
87
/*
 * Wrappers around the mutex referenced by a connection's ic_lock field.
 * The argument is parenthesized so that any pointer expression, not just
 * a plain identifier, expands correctly.
 */
#define ICL_CONN_LOCK(X)		mtx_lock((X)->ic_lock)
#define ICL_CONN_UNLOCK(X)		mtx_unlock((X)->ic_lock)
#define ICL_CONN_LOCK_ASSERT(X)		mtx_assert((X)->ic_lock, MA_OWNED)
#define ICL_CONN_LOCK_ASSERT_NOT(X)	mtx_assert((X)->ic_lock, MA_NOTOWNED)
92
93static void
94icl_conn_fail(struct icl_conn *ic)
95{
96	if (ic->ic_socket == NULL)
97		return;
98
99	/*
100	 * XXX
101	 */
102	ic->ic_socket->so_error = EDOOFUS;
103	(ic->ic_error)(ic);
104}
105
106static struct mbuf *
107icl_conn_receive(struct icl_conn *ic, size_t len)
108{
109	struct uio uio;
110	struct socket *so;
111	struct mbuf *m;
112	int error, flags;
113
114	so = ic->ic_socket;
115
116	memset(&uio, 0, sizeof(uio));
117	uio.uio_resid = len;
118
119	flags = MSG_DONTWAIT;
120	error = soreceive(so, NULL, &uio, &m, NULL, &flags);
121	if (error != 0) {
122		ICL_DEBUG("soreceive error %d", error);
123		return (NULL);
124	}
125	if (uio.uio_resid != 0) {
126		m_freem(m);
127		ICL_DEBUG("short read");
128		return (NULL);
129	}
130
131	return (m);
132}
133
134static struct icl_pdu *
135icl_pdu_new(struct icl_conn *ic, int flags)
136{
137	struct icl_pdu *ip;
138
139#ifdef DIAGNOSTIC
140	refcount_acquire(&ic->ic_outstanding_pdus);
141#endif
142	ip = uma_zalloc(icl_pdu_zone, flags | M_ZERO);
143	if (ip == NULL) {
144		ICL_WARN("failed to allocate %zd bytes", sizeof(*ip));
145#ifdef DIAGNOSTIC
146		refcount_release(&ic->ic_outstanding_pdus);
147#endif
148		return (NULL);
149	}
150
151	ip->ip_conn = ic;
152
153	return (ip);
154}
155
156void
157icl_pdu_free(struct icl_pdu *ip)
158{
159	struct icl_conn *ic;
160
161	ic = ip->ip_conn;
162
163	m_freem(ip->ip_bhs_mbuf);
164	m_freem(ip->ip_ahs_mbuf);
165	m_freem(ip->ip_data_mbuf);
166	uma_zfree(icl_pdu_zone, ip);
167#ifdef DIAGNOSTIC
168	refcount_release(&ic->ic_outstanding_pdus);
169#endif
170}
171
172/*
173 * Allocate icl_pdu with empty BHS to fill up by the caller.
174 */
175struct icl_pdu *
176icl_pdu_new_bhs(struct icl_conn *ic, int flags)
177{
178	struct icl_pdu *ip;
179
180	ip = icl_pdu_new(ic, flags);
181	if (ip == NULL)
182		return (NULL);
183
184	ip->ip_bhs_mbuf = m_getm2(NULL, sizeof(struct iscsi_bhs),
185	    flags, MT_DATA, M_PKTHDR);
186	if (ip->ip_bhs_mbuf == NULL) {
187		ICL_WARN("failed to allocate %zd bytes", sizeof(*ip));
188		icl_pdu_free(ip);
189		return (NULL);
190	}
191	ip->ip_bhs = mtod(ip->ip_bhs_mbuf, struct iscsi_bhs *);
192	memset(ip->ip_bhs, 0, sizeof(struct iscsi_bhs));
193	ip->ip_bhs_mbuf->m_len = sizeof(struct iscsi_bhs);
194
195	return (ip);
196}
197
198static int
199icl_pdu_ahs_length(const struct icl_pdu *request)
200{
201
202	return (request->ip_bhs->bhs_total_ahs_len * 4);
203}
204
205size_t
206icl_pdu_data_segment_length(const struct icl_pdu *request)
207{
208	uint32_t len = 0;
209
210	len += request->ip_bhs->bhs_data_segment_len[0];
211	len <<= 8;
212	len += request->ip_bhs->bhs_data_segment_len[1];
213	len <<= 8;
214	len += request->ip_bhs->bhs_data_segment_len[2];
215
216	return (len);
217}
218
219static void
220icl_pdu_set_data_segment_length(struct icl_pdu *response, uint32_t len)
221{
222
223	response->ip_bhs->bhs_data_segment_len[2] = len;
224	response->ip_bhs->bhs_data_segment_len[1] = len >> 8;
225	response->ip_bhs->bhs_data_segment_len[0] = len >> 16;
226}
227
228static size_t
229icl_pdu_padding(const struct icl_pdu *ip)
230{
231
232	if ((ip->ip_data_len % 4) != 0)
233		return (4 - (ip->ip_data_len % 4));
234
235	return (0);
236}
237
238static size_t
239icl_pdu_size(const struct icl_pdu *response)
240{
241	size_t len;
242
243	KASSERT(response->ip_ahs_len == 0, ("responding with AHS"));
244
245	len = sizeof(struct iscsi_bhs) + response->ip_data_len +
246	    icl_pdu_padding(response);
247	if (response->ip_conn->ic_header_crc32c)
248		len += ISCSI_HEADER_DIGEST_SIZE;
249	if (response->ip_data_len != 0 && response->ip_conn->ic_data_crc32c)
250		len += ISCSI_DATA_DIGEST_SIZE;
251
252	return (len);
253}
254
255static int
256icl_pdu_receive_bhs(struct icl_pdu *request, size_t *availablep)
257{
258	struct mbuf *m;
259
260	m = icl_conn_receive(request->ip_conn, sizeof(struct iscsi_bhs));
261	if (m == NULL) {
262		ICL_DEBUG("failed to receive BHS");
263		return (-1);
264	}
265
266	request->ip_bhs_mbuf = m_pullup(m, sizeof(struct iscsi_bhs));
267	if (request->ip_bhs_mbuf == NULL) {
268		ICL_WARN("m_pullup failed");
269		return (-1);
270	}
271	request->ip_bhs = mtod(request->ip_bhs_mbuf, struct iscsi_bhs *);
272
273	/*
274	 * XXX: For architectures with strict alignment requirements
275	 * 	we may need to allocate ip_bhs and copy the data into it.
276	 * 	For some reason, though, not doing this doesn't seem
277	 * 	to cause problems; tested on sparc64.
278	 */
279
280	*availablep -= sizeof(struct iscsi_bhs);
281	return (0);
282}
283
284static int
285icl_pdu_receive_ahs(struct icl_pdu *request, size_t *availablep)
286{
287
288	request->ip_ahs_len = icl_pdu_ahs_length(request);
289	if (request->ip_ahs_len == 0)
290		return (0);
291
292	request->ip_ahs_mbuf = icl_conn_receive(request->ip_conn,
293	    request->ip_ahs_len);
294	if (request->ip_ahs_mbuf == NULL) {
295		ICL_DEBUG("failed to receive AHS");
296		return (-1);
297	}
298
299	*availablep -= request->ip_ahs_len;
300	return (0);
301}
302
303static uint32_t
304icl_mbuf_to_crc32c(const struct mbuf *m0)
305{
306	uint32_t digest = 0xffffffff;
307	const struct mbuf *m;
308
309	for (m = m0; m != NULL; m = m->m_next)
310		digest = calculate_crc32c(digest,
311		    mtod(m, const void *), m->m_len);
312
313	digest = digest ^ 0xffffffff;
314
315	return (digest);
316}
317
318static int
319icl_pdu_check_header_digest(struct icl_pdu *request, size_t *availablep)
320{
321	struct mbuf *m;
322	uint32_t received_digest, valid_digest;
323
324	if (request->ip_conn->ic_header_crc32c == false)
325		return (0);
326
327	m = icl_conn_receive(request->ip_conn, ISCSI_HEADER_DIGEST_SIZE);
328	if (m == NULL) {
329		ICL_DEBUG("failed to receive header digest");
330		return (-1);
331	}
332
333	CTASSERT(sizeof(received_digest) == ISCSI_HEADER_DIGEST_SIZE);
334	m_copydata(m, 0, ISCSI_HEADER_DIGEST_SIZE, (void *)&received_digest);
335	m_freem(m);
336
337	*availablep -= ISCSI_HEADER_DIGEST_SIZE;
338
339	/*
340	 * XXX: Handle AHS.
341	 */
342	valid_digest = icl_mbuf_to_crc32c(request->ip_bhs_mbuf);
343	if (received_digest != valid_digest) {
344		ICL_WARN("header digest check failed; got 0x%x, "
345		    "should be 0x%x", received_digest, valid_digest);
346		return (-1);
347	}
348
349	return (0);
350}
351
352/*
353 * Return the number of bytes that should be waiting in the receive socket
354 * before icl_pdu_receive_data_segment() gets called.
355 */
356static size_t
357icl_pdu_data_segment_receive_len(const struct icl_pdu *request)
358{
359	size_t len;
360
361	len = icl_pdu_data_segment_length(request);
362	if (len == 0)
363		return (0);
364
365	/*
366	 * Account for the parts of data segment already read from
367	 * the socket buffer.
368	 */
369	KASSERT(len > request->ip_data_len, ("len <= request->ip_data_len"));
370	len -= request->ip_data_len;
371
372	/*
373	 * Don't always wait for the full data segment to be delivered
374	 * to the socket; this might badly affect performance due to
375	 * TCP window scaling.
376	 */
377	if (len > partial_receive_len) {
378#if 0
379		ICL_DEBUG("need %zd bytes of data, limiting to %zd",
380		    len, partial_receive_len));
381#endif
382		len = partial_receive_len;
383
384		return (len);
385	}
386
387	/*
388	 * Account for padding.  Note that due to the way code is written,
389	 * the icl_pdu_receive_data_segment() must always receive padding
390	 * along with the last part of data segment, because it would be
391	 * impossible to tell whether we've already received the full data
392	 * segment including padding, or without it.
393	 */
394	if ((len % 4) != 0)
395		len += 4 - (len % 4);
396
397#if 0
398	ICL_DEBUG("need %zd bytes of data", len));
399#endif
400
401	return (len);
402}
403
404static int
405icl_pdu_receive_data_segment(struct icl_pdu *request,
406    size_t *availablep, bool *more_neededp)
407{
408	struct icl_conn *ic;
409	size_t len, padding = 0;
410	struct mbuf *m;
411
412	ic = request->ip_conn;
413
414	*more_neededp = false;
415	ic->ic_receive_len = 0;
416
417	len = icl_pdu_data_segment_length(request);
418	if (len == 0)
419		return (0);
420
421	if ((len % 4) != 0)
422		padding = 4 - (len % 4);
423
424	/*
425	 * Account for already received parts of data segment.
426	 */
427	KASSERT(len > request->ip_data_len, ("len <= request->ip_data_len"));
428	len -= request->ip_data_len;
429
430	if (len + padding > *availablep) {
431		/*
432		 * Not enough data in the socket buffer.  Receive as much
433		 * as we can.  Don't receive padding, since, obviously, it's
434		 * not the end of data segment yet.
435		 */
436#if 0
437		ICL_DEBUG("limited from %zd to %zd",
438		    len + padding, *availablep - padding));
439#endif
440		len = *availablep - padding;
441		*more_neededp = true;
442		padding = 0;
443	}
444
445	/*
446	 * Must not try to receive padding without at least one byte
447	 * of actual data segment.
448	 */
449	if (len > 0) {
450		m = icl_conn_receive(request->ip_conn, len + padding);
451		if (m == NULL) {
452			ICL_DEBUG("failed to receive data segment");
453			return (-1);
454		}
455
456		if (request->ip_data_mbuf == NULL)
457			request->ip_data_mbuf = m;
458		else
459			m_cat(request->ip_data_mbuf, m);
460
461		request->ip_data_len += len;
462		*availablep -= len + padding;
463	} else
464		ICL_DEBUG("len 0");
465
466	if (*more_neededp)
467		ic->ic_receive_len =
468		    icl_pdu_data_segment_receive_len(request);
469
470	return (0);
471}
472
473static int
474icl_pdu_check_data_digest(struct icl_pdu *request, size_t *availablep)
475{
476	struct mbuf *m;
477	uint32_t received_digest, valid_digest;
478
479	if (request->ip_conn->ic_data_crc32c == false)
480		return (0);
481
482	if (request->ip_data_len == 0)
483		return (0);
484
485	m = icl_conn_receive(request->ip_conn, ISCSI_DATA_DIGEST_SIZE);
486	if (m == NULL) {
487		ICL_DEBUG("failed to receive data digest");
488		return (-1);
489	}
490
491	CTASSERT(sizeof(received_digest) == ISCSI_DATA_DIGEST_SIZE);
492	m_copydata(m, 0, ISCSI_DATA_DIGEST_SIZE, (void *)&received_digest);
493	m_freem(m);
494
495	*availablep -= ISCSI_DATA_DIGEST_SIZE;
496
497	/*
498	 * Note that ip_data_mbuf also contains padding; since digest
499	 * calculation is supposed to include that, we iterate over
500	 * the entire ip_data_mbuf chain, not just ip_data_len bytes of it.
501	 */
502	valid_digest = icl_mbuf_to_crc32c(request->ip_data_mbuf);
503	if (received_digest != valid_digest) {
504		ICL_WARN("data digest check failed; got 0x%x, "
505		    "should be 0x%x", received_digest, valid_digest);
506		return (-1);
507	}
508
509	return (0);
510}
511
512/*
513 * Somewhat contrary to the name, this attempts to receive only one
514 * "part" of PDU at a time; call it repeatedly until it returns non-NULL.
515 */
516static struct icl_pdu *
517icl_conn_receive_pdu(struct icl_conn *ic, size_t *availablep)
518{
519	struct icl_pdu *request;
520	struct socket *so;
521	size_t len;
522	int error;
523	bool more_needed;
524
525	so = ic->ic_socket;
526
527	if (ic->ic_receive_state == ICL_CONN_STATE_BHS) {
528		KASSERT(ic->ic_receive_pdu == NULL,
529		    ("ic->ic_receive_pdu != NULL"));
530		request = icl_pdu_new(ic, M_NOWAIT);
531		if (request == NULL) {
532			ICL_DEBUG("failed to allocate PDU; "
533			    "dropping connection");
534			icl_conn_fail(ic);
535			return (NULL);
536		}
537		ic->ic_receive_pdu = request;
538	} else {
539		KASSERT(ic->ic_receive_pdu != NULL,
540		    ("ic->ic_receive_pdu == NULL"));
541		request = ic->ic_receive_pdu;
542	}
543
544	if (*availablep < ic->ic_receive_len) {
545#if 0
546		ICL_DEBUG("not enough data; need %zd, "
547		    "have %zd", ic->ic_receive_len, *availablep);
548#endif
549		return (NULL);
550	}
551
552	switch (ic->ic_receive_state) {
553	case ICL_CONN_STATE_BHS:
554		//ICL_DEBUG("receiving BHS");
555		error = icl_pdu_receive_bhs(request, availablep);
556		if (error != 0) {
557			ICL_DEBUG("failed to receive BHS; "
558			    "dropping connection");
559			break;
560		}
561
562		/*
563		 * We don't enforce any limit for AHS length;
564		 * its length is stored in 8 bit field.
565		 */
566
567		len = icl_pdu_data_segment_length(request);
568		if (len > ic->ic_max_data_segment_length) {
569			ICL_WARN("received data segment "
570			    "length %zd is larger than negotiated "
571			    "MaxDataSegmentLength %zd; "
572			    "dropping connection",
573			    len, ic->ic_max_data_segment_length);
574			error = EINVAL;
575			break;
576		}
577
578		ic->ic_receive_state = ICL_CONN_STATE_AHS;
579		ic->ic_receive_len = icl_pdu_ahs_length(request);
580		break;
581
582	case ICL_CONN_STATE_AHS:
583		//ICL_DEBUG("receiving AHS");
584		error = icl_pdu_receive_ahs(request, availablep);
585		if (error != 0) {
586			ICL_DEBUG("failed to receive AHS; "
587			    "dropping connection");
588			break;
589		}
590		ic->ic_receive_state = ICL_CONN_STATE_HEADER_DIGEST;
591		if (ic->ic_header_crc32c == false)
592			ic->ic_receive_len = 0;
593		else
594			ic->ic_receive_len = ISCSI_HEADER_DIGEST_SIZE;
595		break;
596
597	case ICL_CONN_STATE_HEADER_DIGEST:
598		//ICL_DEBUG("receiving header digest");
599		error = icl_pdu_check_header_digest(request, availablep);
600		if (error != 0) {
601			ICL_DEBUG("header digest failed; "
602			    "dropping connection");
603			break;
604		}
605
606		ic->ic_receive_state = ICL_CONN_STATE_DATA;
607		ic->ic_receive_len =
608		    icl_pdu_data_segment_receive_len(request);
609		break;
610
611	case ICL_CONN_STATE_DATA:
612		//ICL_DEBUG("receiving data segment");
613		error = icl_pdu_receive_data_segment(request, availablep,
614		    &more_needed);
615		if (error != 0) {
616			ICL_DEBUG("failed to receive data segment;"
617			    "dropping connection");
618			break;
619		}
620
621		if (more_needed)
622			break;
623
624		ic->ic_receive_state = ICL_CONN_STATE_DATA_DIGEST;
625		if (request->ip_data_len == 0 || ic->ic_data_crc32c == false)
626			ic->ic_receive_len = 0;
627		else
628			ic->ic_receive_len = ISCSI_DATA_DIGEST_SIZE;
629		break;
630
631	case ICL_CONN_STATE_DATA_DIGEST:
632		//ICL_DEBUG("receiving data digest");
633		error = icl_pdu_check_data_digest(request, availablep);
634		if (error != 0) {
635			ICL_DEBUG("data digest failed; "
636			    "dropping connection");
637			break;
638		}
639
640		/*
641		 * We've received complete PDU; reset the receive state machine
642		 * and return the PDU.
643		 */
644		ic->ic_receive_state = ICL_CONN_STATE_BHS;
645		ic->ic_receive_len = sizeof(struct iscsi_bhs);
646		ic->ic_receive_pdu = NULL;
647		return (request);
648
649	default:
650		panic("invalid ic_receive_state %d\n", ic->ic_receive_state);
651	}
652
653	if (error != 0) {
654		icl_pdu_free(request);
655		icl_conn_fail(ic);
656	}
657
658	return (NULL);
659}
660
661static void
662icl_conn_receive_pdus(struct icl_conn *ic, size_t available)
663{
664	struct icl_pdu *response;
665	struct socket *so;
666
667	so = ic->ic_socket;
668
669	/*
670	 * This can never happen; we're careful to only mess with ic->ic_socket
671	 * pointer when the send/receive threads are not running.
672	 */
673	KASSERT(so != NULL, ("NULL socket"));
674
675	for (;;) {
676		if (ic->ic_disconnecting)
677			return;
678
679		if (so->so_error != 0) {
680			ICL_DEBUG("connection error %d; "
681			    "dropping connection", so->so_error);
682			icl_conn_fail(ic);
683			return;
684		}
685
686		/*
687		 * Loop until we have a complete PDU or there is not enough
688		 * data in the socket buffer.
689		 */
690		if (available < ic->ic_receive_len) {
691#if 0
692			ICL_DEBUG("not enough data; have %zd, "
693			    "need %zd", available,
694			    ic->ic_receive_len);
695#endif
696			return;
697		}
698
699		response = icl_conn_receive_pdu(ic, &available);
700		if (response == NULL)
701			continue;
702
703		if (response->ip_ahs_len > 0) {
704			ICL_WARN("received PDU with unsupported "
705			    "AHS; opcode 0x%x; dropping connection",
706			    response->ip_bhs->bhs_opcode);
707			icl_pdu_free(response);
708			icl_conn_fail(ic);
709			return;
710		}
711
712		(ic->ic_receive)(response);
713	}
714}
715
716static void
717icl_receive_thread(void *arg)
718{
719	struct icl_conn *ic;
720	size_t available;
721	struct socket *so;
722
723	ic = arg;
724	so = ic->ic_socket;
725
726	ICL_CONN_LOCK(ic);
727	ic->ic_receive_running = true;
728	ICL_CONN_UNLOCK(ic);
729
730	for (;;) {
731		if (ic->ic_disconnecting) {
732			//ICL_DEBUG("terminating");
733			break;
734		}
735
736		SOCKBUF_LOCK(&so->so_rcv);
737		available = so->so_rcv.sb_cc;
738		if (available < ic->ic_receive_len) {
739			so->so_rcv.sb_lowat = ic->ic_receive_len;
740			cv_wait(&ic->ic_receive_cv, &so->so_rcv.sb_mtx);
741		}
742		SOCKBUF_UNLOCK(&so->so_rcv);
743
744		icl_conn_receive_pdus(ic, available);
745	}
746
747	ICL_CONN_LOCK(ic);
748	ic->ic_receive_running = false;
749	ICL_CONN_UNLOCK(ic);
750	kthread_exit();
751}
752
753static int
754icl_soupcall_receive(struct socket *so, void *arg, int waitflag)
755{
756	struct icl_conn *ic;
757
758	ic = arg;
759	cv_signal(&ic->ic_receive_cv);
760	return (SU_OK);
761}
762
763static int
764icl_pdu_send(struct icl_pdu *request)
765{
766	size_t padding, pdu_len;
767	uint32_t digest, zero = 0;
768	int error, ok;
769	struct socket *so;
770	struct icl_conn *ic;
771
772	ic = request->ip_conn;
773	so = request->ip_conn->ic_socket;
774
775	ICL_CONN_LOCK_ASSERT(ic);
776
777	icl_pdu_set_data_segment_length(request, request->ip_data_len);
778
779	pdu_len = icl_pdu_size(request);
780
781	if (ic->ic_header_crc32c) {
782		digest = icl_mbuf_to_crc32c(request->ip_bhs_mbuf);
783		ok = m_append(request->ip_bhs_mbuf, sizeof(digest),
784		    (void *)&digest);
785		if (ok != 1) {
786			ICL_WARN("failed to append header digest");
787			return (1);
788		}
789	}
790
791	if (request->ip_data_len != 0) {
792		padding = icl_pdu_padding(request);
793		if (padding > 0) {
794			ok = m_append(request->ip_data_mbuf, padding,
795			    (void *)&zero);
796			if (ok != 1) {
797				ICL_WARN("failed to append padding");
798				return (1);
799			}
800		}
801
802		if (ic->ic_data_crc32c) {
803			digest = icl_mbuf_to_crc32c(request->ip_data_mbuf);
804
805			ok = m_append(request->ip_data_mbuf, sizeof(digest),
806			    (void *)&digest);
807			if (ok != 1) {
808				ICL_WARN("failed to append header digest");
809				return (1);
810			}
811		}
812
813		m_cat(request->ip_bhs_mbuf, request->ip_data_mbuf);
814		request->ip_data_mbuf = NULL;
815	}
816
817	request->ip_bhs_mbuf->m_pkthdr.len = pdu_len;
818
819	error = sosend(so, NULL, NULL, request->ip_bhs_mbuf,
820	    NULL, MSG_DONTWAIT, curthread);
821	request->ip_bhs_mbuf = NULL; /* Sosend consumes the mbuf. */
822	if (error != 0) {
823		ICL_DEBUG("sosend error %d", error);
824		return (error);
825	}
826
827	return (0);
828}
829
830static void
831icl_conn_send_pdus(struct icl_conn *ic)
832{
833	struct icl_pdu *request;
834	struct socket *so;
835	size_t available, size;
836	int error;
837
838	ICL_CONN_LOCK_ASSERT(ic);
839
840	so = ic->ic_socket;
841
842	SOCKBUF_LOCK(&so->so_snd);
843	available = sbspace(&so->so_snd);
844	SOCKBUF_UNLOCK(&so->so_snd);
845
846	while (!TAILQ_EMPTY(&ic->ic_to_send)) {
847		if (ic->ic_disconnecting)
848			return;
849
850		request = TAILQ_FIRST(&ic->ic_to_send);
851		size = icl_pdu_size(request);
852		if (available < size) {
853			/*
854			 * Set the low watermark on the socket,
855			 * to avoid waking up until there is enough
856			 * space.
857			 */
858			SOCKBUF_LOCK(&so->so_snd);
859			so->so_snd.sb_lowat = size;
860			SOCKBUF_UNLOCK(&so->so_snd);
861#if 1
862			ICL_DEBUG("no space to send; "
863			    "have %zd, need %zd",
864			    available, size);
865#endif
866			return;
867		}
868		available -= size;
869		TAILQ_REMOVE(&ic->ic_to_send, request, ip_next);
870		error = icl_pdu_send(request);
871		if (error != 0) {
872			ICL_DEBUG("failed to send PDU; "
873			    "dropping connection");
874			icl_conn_fail(ic);
875			return;
876		}
877		icl_pdu_free(request);
878	}
879}
880
881static void
882icl_send_thread(void *arg)
883{
884	struct icl_conn *ic;
885
886	ic = arg;
887
888	ICL_CONN_LOCK(ic);
889	ic->ic_send_running = true;
890
891	for (;;) {
892		if (ic->ic_disconnecting) {
893			//ICL_DEBUG("terminating");
894			break;
895		}
896		icl_conn_send_pdus(ic);
897		cv_wait(&ic->ic_send_cv, ic->ic_lock);
898	}
899
900	ic->ic_send_running = false;
901	ICL_CONN_UNLOCK(ic);
902	kthread_exit();
903}
904
905static int
906icl_soupcall_send(struct socket *so, void *arg, int waitflag)
907{
908	struct icl_conn *ic;
909
910	ic = arg;
911	cv_signal(&ic->ic_send_cv);
912	return (SU_OK);
913}
914
915int
916icl_pdu_append_data(struct icl_pdu *request, const void *addr, size_t len, int flags)
917{
918	struct mbuf *mb, *newmb;
919	size_t copylen, off = 0;
920
921	KASSERT(len > 0, ("len == 0"));
922
923	newmb = m_getm2(NULL, len, flags, MT_DATA, M_PKTHDR);
924	if (newmb == NULL) {
925		ICL_WARN("failed to allocate mbuf for %zd bytes", len);
926		return (ENOMEM);
927	}
928
929	for (mb = newmb; mb != NULL; mb = mb->m_next) {
930		copylen = min(M_TRAILINGSPACE(mb), len - off);
931		memcpy(mtod(mb, char *), (const char *)addr + off, copylen);
932		mb->m_len = copylen;
933		off += copylen;
934	}
935	KASSERT(off == len, ("%s: off != len", __func__));
936
937	if (request->ip_data_mbuf == NULL) {
938		request->ip_data_mbuf = newmb;
939		request->ip_data_len = len;
940	} else {
941		m_cat(request->ip_data_mbuf, newmb);
942		request->ip_data_len += len;
943	}
944
945	return (0);
946}
947
948void
949icl_pdu_get_data(struct icl_pdu *ip, size_t off, void *addr, size_t len)
950{
951
952	m_copydata(ip->ip_data_mbuf, off, len, addr);
953}
954
955void
956icl_pdu_queue(struct icl_pdu *ip)
957{
958	struct icl_conn *ic;
959
960	ic = ip->ip_conn;
961
962	ICL_CONN_LOCK_ASSERT(ic);
963
964	if (ic->ic_disconnecting || ic->ic_socket == NULL) {
965		ICL_DEBUG("icl_pdu_queue on closed connection");
966		icl_pdu_free(ip);
967		return;
968	}
969	TAILQ_INSERT_TAIL(&ic->ic_to_send, ip, ip_next);
970	cv_signal(&ic->ic_send_cv);
971}
972
973struct icl_conn *
974icl_conn_new(struct mtx *lock)
975{
976	struct icl_conn *ic;
977
978	refcount_acquire(&icl_ncons);
979
980	ic = uma_zalloc(icl_conn_zone, M_WAITOK | M_ZERO);
981
982	TAILQ_INIT(&ic->ic_to_send);
983	ic->ic_lock = lock;
984	cv_init(&ic->ic_send_cv, "icl_tx");
985	cv_init(&ic->ic_receive_cv, "icl_rx");
986#ifdef DIAGNOSTIC
987	refcount_init(&ic->ic_outstanding_pdus, 0);
988#endif
989	ic->ic_max_data_segment_length = ICL_MAX_DATA_SEGMENT_LENGTH;
990
991	return (ic);
992}
993
994void
995icl_conn_free(struct icl_conn *ic)
996{
997
998	cv_destroy(&ic->ic_send_cv);
999	cv_destroy(&ic->ic_receive_cv);
1000	uma_zfree(icl_conn_zone, ic);
1001	refcount_release(&icl_ncons);
1002}
1003
1004static int
1005icl_conn_start(struct icl_conn *ic)
1006{
1007	size_t bufsize;
1008	struct sockopt opt;
1009	int error, one = 1;
1010
1011	ICL_CONN_LOCK(ic);
1012
1013	/*
1014	 * XXX: Ugly hack.
1015	 */
1016	if (ic->ic_socket == NULL) {
1017		ICL_CONN_UNLOCK(ic);
1018		return (EINVAL);
1019	}
1020
1021	ic->ic_receive_state = ICL_CONN_STATE_BHS;
1022	ic->ic_receive_len = sizeof(struct iscsi_bhs);
1023	ic->ic_disconnecting = false;
1024
1025	ICL_CONN_UNLOCK(ic);
1026
1027	/*
1028	 * Use max available sockbuf size for sending.  Do it manually
1029	 * instead of sbreserve(9) to work around resource limits.
1030	 *
1031	 * XXX: This kind of sucks.  On one hand, we don't currently support
1032	 *	sending a part of data segment; we always do it in one piece,
1033	 *	so we have to make sure it can fit in the socket buffer.
1034	 *	Once I've implemented partial send, we'll get rid of this
1035	 *	and use autoscaling.
1036	 */
1037        bufsize = (sizeof(struct iscsi_bhs) +
1038            ic->ic_max_data_segment_length) * 8;
1039	error = soreserve(ic->ic_socket, bufsize, bufsize);
1040	if (error != 0) {
1041		ICL_WARN("soreserve failed with error %d", error);
1042		icl_conn_close(ic);
1043		return (error);
1044	}
1045
1046	/*
1047	 * Disable Nagle.
1048	 */
1049	bzero(&opt, sizeof(opt));
1050	opt.sopt_dir = SOPT_SET;
1051	opt.sopt_level = IPPROTO_TCP;
1052	opt.sopt_name = TCP_NODELAY;
1053	opt.sopt_val = &one;
1054	opt.sopt_valsize = sizeof(one);
1055	error = sosetopt(ic->ic_socket, &opt);
1056	if (error != 0) {
1057		ICL_WARN("disabling TCP_NODELAY failed with error %d", error);
1058		icl_conn_close(ic);
1059		return (error);
1060	}
1061
1062	/*
1063	 * Start threads.
1064	 */
1065	error = kthread_add(icl_send_thread, ic, NULL, NULL, 0, 0, "icltx");
1066	if (error != 0) {
1067		ICL_WARN("kthread_add(9) failed with error %d", error);
1068		icl_conn_close(ic);
1069		return (error);
1070	}
1071
1072	error = kthread_add(icl_receive_thread, ic, NULL, NULL, 0, 0, "iclrx");
1073	if (error != 0) {
1074		ICL_WARN("kthread_add(9) failed with error %d", error);
1075		icl_conn_close(ic);
1076		return (error);
1077	}
1078
1079	/*
1080	 * Register socket upcall, to get notified about incoming PDUs
1081	 * and free space to send outgoing ones.
1082	 */
1083	SOCKBUF_LOCK(&ic->ic_socket->so_snd);
1084	soupcall_set(ic->ic_socket, SO_SND, icl_soupcall_send, ic);
1085	SOCKBUF_UNLOCK(&ic->ic_socket->so_snd);
1086	SOCKBUF_LOCK(&ic->ic_socket->so_rcv);
1087	soupcall_set(ic->ic_socket, SO_RCV, icl_soupcall_receive, ic);
1088	SOCKBUF_UNLOCK(&ic->ic_socket->so_rcv);
1089
1090	return (0);
1091}
1092
1093int
1094icl_conn_handoff(struct icl_conn *ic, int fd)
1095{
1096	struct file *fp;
1097	struct socket *so;
1098	cap_rights_t rights;
1099	int error;
1100
1101	ICL_CONN_LOCK_ASSERT_NOT(ic);
1102
1103	/*
1104	 * Steal the socket from userland.
1105	 */
1106	error = fget(curthread, fd,
1107	    cap_rights_init(&rights, CAP_SOCK_CLIENT), &fp);
1108	if (error != 0)
1109		return (error);
1110	if (fp->f_type != DTYPE_SOCKET) {
1111		fdrop(fp, curthread);
1112		return (EINVAL);
1113	}
1114	so = fp->f_data;
1115	if (so->so_type != SOCK_STREAM) {
1116		fdrop(fp, curthread);
1117		return (EINVAL);
1118	}
1119
1120	ICL_CONN_LOCK(ic);
1121
1122	if (ic->ic_socket != NULL) {
1123		ICL_CONN_UNLOCK(ic);
1124		fdrop(fp, curthread);
1125		return (EBUSY);
1126	}
1127
1128	ic->ic_socket = fp->f_data;
1129	fp->f_ops = &badfileops;
1130	fp->f_data = NULL;
1131	fdrop(fp, curthread);
1132	ICL_CONN_UNLOCK(ic);
1133
1134	error = icl_conn_start(ic);
1135
1136	return (error);
1137}
1138
1139void
1140icl_conn_shutdown(struct icl_conn *ic)
1141{
1142	ICL_CONN_LOCK_ASSERT_NOT(ic);
1143
1144	ICL_CONN_LOCK(ic);
1145	if (ic->ic_socket == NULL) {
1146		ICL_CONN_UNLOCK(ic);
1147		return;
1148	}
1149	ICL_CONN_UNLOCK(ic);
1150
1151	soshutdown(ic->ic_socket, SHUT_RDWR);
1152}
1153
1154void
1155icl_conn_close(struct icl_conn *ic)
1156{
1157	struct icl_pdu *pdu;
1158
1159	ICL_CONN_LOCK_ASSERT_NOT(ic);
1160
1161	ICL_CONN_LOCK(ic);
1162	if (ic->ic_socket == NULL) {
1163		ICL_CONN_UNLOCK(ic);
1164		return;
1165	}
1166
1167	ic->ic_disconnecting = true;
1168
1169	/*
1170	 * Wake up the threads, so they can properly terminate.
1171	 */
1172	cv_signal(&ic->ic_receive_cv);
1173	cv_signal(&ic->ic_send_cv);
1174	while (ic->ic_receive_running || ic->ic_send_running) {
1175		//ICL_DEBUG("waiting for send/receive threads to terminate");
1176		ICL_CONN_UNLOCK(ic);
1177		cv_signal(&ic->ic_receive_cv);
1178		cv_signal(&ic->ic_send_cv);
1179		pause("icl_close", 1 * hz);
1180		ICL_CONN_LOCK(ic);
1181	}
1182	//ICL_DEBUG("send/receive threads terminated");
1183
1184	soclose(ic->ic_socket);
1185	ic->ic_socket = NULL;
1186
1187	if (ic->ic_receive_pdu != NULL) {
1188		//ICL_DEBUG("freeing partially received PDU");
1189		icl_pdu_free(ic->ic_receive_pdu);
1190		ic->ic_receive_pdu = NULL;
1191	}
1192
1193	/*
1194	 * Remove any outstanding PDUs from the send queue.
1195	 */
1196	while (!TAILQ_EMPTY(&ic->ic_to_send)) {
1197		pdu = TAILQ_FIRST(&ic->ic_to_send);
1198		TAILQ_REMOVE(&ic->ic_to_send, pdu, ip_next);
1199		icl_pdu_free(pdu);
1200	}
1201
1202	KASSERT(TAILQ_EMPTY(&ic->ic_to_send),
1203	    ("destroying session with non-empty send queue"));
1204	/*
1205	 * XXX
1206	 */
1207#if 0
1208	KASSERT(ic->ic_outstanding_pdus == 0,
1209	    ("destroying session with %d outstanding PDUs",
1210	     ic->ic_outstanding_pdus));
1211#endif
1212	ICL_CONN_UNLOCK(ic);
1213}
1214
1215bool
1216icl_conn_connected(struct icl_conn *ic)
1217{
1218	ICL_CONN_LOCK_ASSERT_NOT(ic);
1219
1220	ICL_CONN_LOCK(ic);
1221	if (ic->ic_socket == NULL) {
1222		ICL_CONN_UNLOCK(ic);
1223		return (false);
1224	}
1225	if (ic->ic_socket->so_error != 0) {
1226		ICL_CONN_UNLOCK(ic);
1227		return (false);
1228	}
1229	ICL_CONN_UNLOCK(ic);
1230	return (true);
1231}
1232
#ifdef ICL_KERNEL_PROXY
/*
 * Attach an in-kernel socket to the connection and start it.
 *
 * Must be called without the connection lock held.  Returns EINVAL if
 * "so" is not a stream socket, EBUSY if the connection already has a
 * socket, or the result of icl_conn_start().
 */
int
icl_conn_handoff_sock(struct icl_conn *ic, struct socket *so)
{
	bool busy;

	ICL_CONN_LOCK_ASSERT_NOT(ic);

	if (so->so_type != SOCK_STREAM)
		return (EINVAL);

	ICL_CONN_LOCK(ic);
	busy = (ic->ic_socket != NULL);
	if (!busy)
		ic->ic_socket = so;
	ICL_CONN_UNLOCK(ic);

	if (busy)
		return (EBUSY);

	return (icl_conn_start(ic));
}
#endif /* ICL_KERNEL_PROXY */
1257
1258static int
1259icl_unload(void)
1260{
1261
1262	if (icl_ncons != 0)
1263		return (EBUSY);
1264
1265	uma_zdestroy(icl_conn_zone);
1266	uma_zdestroy(icl_pdu_zone);
1267
1268	return (0);
1269}
1270
1271static void
1272icl_load(void)
1273{
1274
1275	icl_conn_zone = uma_zcreate("icl_conn",
1276	    sizeof(struct icl_conn), NULL, NULL, NULL, NULL,
1277	    UMA_ALIGN_PTR, 0);
1278	icl_pdu_zone = uma_zcreate("icl_pdu",
1279	    sizeof(struct icl_pdu), NULL, NULL, NULL, NULL,
1280	    UMA_ALIGN_PTR, 0);
1281
1282	refcount_init(&icl_ncons, 0);
1283}
1284
1285static int
1286icl_modevent(module_t mod, int what, void *arg)
1287{
1288
1289	switch (what) {
1290	case MOD_LOAD:
1291		icl_load();
1292		return (0);
1293	case MOD_UNLOAD:
1294		return (icl_unload());
1295	default:
1296		return (EINVAL);
1297	}
1298}
1299
1300moduledata_t icl_data = {
1301	"icl",
1302	icl_modevent,
1303	0
1304};
1305
1306DECLARE_MODULE(icl, icl_data, SI_SUB_DRIVERS, SI_ORDER_FIRST);
1307MODULE_VERSION(icl, 1);
1308