/* ctl_backend_block.c revision 313365 */
1/*-
2 * Copyright (c) 2003 Silicon Graphics International Corp.
3 * Copyright (c) 2009-2011 Spectra Logic Corporation
4 * Copyright (c) 2012 The FreeBSD Foundation
5 * Copyright (c) 2014-2015 Alexander Motin <mav@FreeBSD.org>
6 * All rights reserved.
7 *
8 * Portions of this software were developed by Edward Tomasz Napierala
9 * under sponsorship from the FreeBSD Foundation.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 *    notice, this list of conditions, and the following disclaimer,
16 *    without modification.
17 * 2. Redistributions in binary form must reproduce at minimum a disclaimer
18 *    substantially similar to the "NO WARRANTY" disclaimer below
19 *    ("Disclaimer") and any redistribution must be conditioned upon
20 *    including a substantially similar Disclaimer requirement for further
21 *    binary redistribution.
22 *
23 * NO WARRANTY
24 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
25 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
26 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
27 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
28 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
32 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
33 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34 * POSSIBILITY OF SUCH DAMAGES.
35 *
36 * $Id: //depot/users/kenm/FreeBSD-test2/sys/cam/ctl/ctl_backend_block.c#5 $
37 */
38/*
39 * CAM Target Layer driver backend for block devices.
40 *
41 * Author: Ken Merry <ken@FreeBSD.org>
42 */
43#include <sys/cdefs.h>
44__FBSDID("$FreeBSD: stable/10/sys/cam/ctl/ctl_backend_block.c 313365 2017-02-07 01:42:53Z mav $");
45
46#include <opt_kdtrace.h>
47
48#include <sys/param.h>
49#include <sys/systm.h>
50#include <sys/kernel.h>
51#include <sys/types.h>
52#include <sys/kthread.h>
53#include <sys/bio.h>
54#include <sys/fcntl.h>
55#include <sys/limits.h>
56#include <sys/lock.h>
57#include <sys/mutex.h>
58#include <sys/condvar.h>
59#include <sys/malloc.h>
60#include <sys/conf.h>
61#include <sys/ioccom.h>
62#include <sys/queue.h>
63#include <sys/sbuf.h>
64#include <sys/endian.h>
65#include <sys/uio.h>
66#include <sys/buf.h>
67#include <sys/taskqueue.h>
68#include <sys/vnode.h>
69#include <sys/namei.h>
70#include <sys/mount.h>
71#include <sys/disk.h>
72#include <sys/fcntl.h>
73#include <sys/filedesc.h>
74#include <sys/filio.h>
75#include <sys/proc.h>
76#include <sys/pcpu.h>
77#include <sys/module.h>
78#include <sys/sdt.h>
79#include <sys/devicestat.h>
80#include <sys/sysctl.h>
81
82#include <geom/geom.h>
83
84#include <cam/cam.h>
85#include <cam/scsi/scsi_all.h>
86#include <cam/scsi/scsi_da.h>
87#include <cam/ctl/ctl_io.h>
88#include <cam/ctl/ctl.h>
89#include <cam/ctl/ctl_backend.h>
90#include <cam/ctl/ctl_ioctl.h>
91#include <cam/ctl/ctl_ha.h>
92#include <cam/ctl/ctl_scsi_all.h>
93#include <cam/ctl/ctl_private.h>
94#include <cam/ctl/ctl_error.h>
95
96/*
97 * The idea here is that we'll allocate enough S/G space to hold a 1MB
98 * I/O.  If we get an I/O larger than that, we'll split it.
99 */
100#define	CTLBLK_HALF_IO_SIZE	(512 * 1024)
101#define	CTLBLK_MAX_IO_SIZE	(CTLBLK_HALF_IO_SIZE * 2)
102#define	CTLBLK_MAX_SEG		MAXPHYS
103#define	CTLBLK_HALF_SEGS	MAX(CTLBLK_HALF_IO_SIZE / CTLBLK_MAX_SEG, 1)
104#define	CTLBLK_MAX_SEGS		(CTLBLK_HALF_SEGS * 2)
105
106#ifdef CTLBLK_DEBUG
107#define DPRINTF(fmt, args...) \
108    printf("cbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args)
109#else
110#define DPRINTF(fmt, args...) do {} while(0)
111#endif
112
113#define PRIV(io)	\
114    ((struct ctl_ptr_len_flags *)&(io)->io_hdr.ctl_private[CTL_PRIV_BACKEND])
115#define ARGS(io)	\
116    ((struct ctl_lba_len_flags *)&(io)->io_hdr.ctl_private[CTL_PRIV_LBA_LEN])
117
118SDT_PROVIDER_DEFINE(cbb);
119
/* Per-LUN state flags (see struct ctl_be_block_lun below). */
typedef enum {
	CTL_BE_BLOCK_LUN_UNCONFIGURED	= 0x01,	/* LUN not (yet) configured in CTL */
	CTL_BE_BLOCK_LUN_CONFIG_ERR	= 0x02,	/* LUN configuration failed */
	CTL_BE_BLOCK_LUN_WAITING	= 0x04,	/* a thread is waiting on this LUN */
} ctl_be_block_lun_flags;

/* Kind of backing store behind a LUN. */
typedef enum {
	CTL_BE_BLOCK_NONE,	/* no backing store attached */
	CTL_BE_BLOCK_DEV,	/* character/block device (geom, zvol) */
	CTL_BE_BLOCK_FILE	/* regular file on a filesystem */
} ctl_be_block_type;
131
/* Backend-type-specific data for file-backed LUNs. */
struct ctl_be_block_filedata {
	struct ucred *cred;	/* credentials used for VOP_READ/VOP_WRITE */
};

union ctl_be_block_bedata {
	struct ctl_be_block_filedata file;
};

struct ctl_be_block_io;
struct ctl_be_block_lun;

/* Dispatch hook: run one backend I/O (read/write/flush/unmap) for a LUN. */
typedef void (*cbb_dispatch_t)(struct ctl_be_block_lun *be_lun,
			       struct ctl_be_block_io *beio);
/* Attribute hook: return a named LUN attribute, or UINT64_MAX if unknown. */
typedef uint64_t (*cbb_getattr_t)(struct ctl_be_block_lun *be_lun,
				  const char *attrname);
147
148/*
149 * Backend LUN structure.  There is a 1:1 mapping between a block device
150 * and a backend block LUN, and between a backend block LUN and a CTL LUN.
151 */
struct ctl_be_block_lun {
	struct ctl_lun_create_params params;	/* creation parameters from userland */
	char lunname[32];
	char *dev_path;				/* path to backing file/device */
	ctl_be_block_type dev_type;		/* file vs. device backing */
	struct vnode *vn;			/* vnode of the backing store */
	union ctl_be_block_bedata backend;	/* type-specific state */
	/* Per-backend-type operation vectors, set at open time. */
	cbb_dispatch_t dispatch;
	cbb_dispatch_t lun_flush;
	cbb_dispatch_t unmap;
	cbb_dispatch_t get_lba_status;
	cbb_getattr_t getattr;
	uma_zone_t lun_zone;			/* zone for S/G data buffers */
	uint64_t size_blocks;			/* LUN size in blocks */
	uint64_t size_bytes;			/* LUN size in bytes */
	struct ctl_be_block_softc *softc;	/* back pointer to module softc */
	struct devstat *disk_stats;		/* devstat(9) statistics */
	ctl_be_block_lun_flags flags;
	STAILQ_ENTRY(ctl_be_block_lun) links;	/* linkage on softc lun_list */
	struct ctl_be_lun cbe_lun;		/* embedded generic CTL LUN */
	struct taskqueue *io_taskqueue;		/* worker taskqueue for this LUN */
	struct task io_task;			/* task run by ctl_be_block_worker */
	int num_threads;			/* worker threads on io_taskqueue */
	/* I/O queues drained by the worker; protected by queue_lock. */
	STAILQ_HEAD(, ctl_io_hdr) input_queue;
	STAILQ_HEAD(, ctl_io_hdr) config_read_queue;
	STAILQ_HEAD(, ctl_io_hdr) config_write_queue;
	STAILQ_HEAD(, ctl_io_hdr) datamove_queue;
	struct mtx_padalign io_lock;		/* protects devstat and bio counts */
	struct mtx_padalign queue_lock;		/* protects the queues above */
};
182
183/*
184 * Overall softc structure for the block backend module.
185 */
struct ctl_be_block_softc {
	struct mtx			 lock;		/* protects num_luns and lun_list */
	int				 num_luns;	/* count of LUNs on lun_list */
	STAILQ_HEAD(, ctl_be_block_lun)	 lun_list;	/* all LUNs of this backend */
};
191
192static struct ctl_be_block_softc backend_block_softc;
193
194/*
195 * Per-I/O information.
196 */
struct ctl_be_block_io {
	union ctl_io			*io;	/* the CTL I/O this beio serves */
	struct ctl_sg_entry		sg_segs[CTLBLK_MAX_SEGS];	/* data buffers */
	struct iovec			xiovecs[CTLBLK_MAX_SEGS];	/* iovecs mirroring sg_segs */
	int				bio_cmd;	/* BIO_READ/WRITE/FLUSH/DELETE */
	int				num_segs;	/* valid entries in sg_segs */
	/* bio completion accounting; protected by the LUN's io_lock. */
	int				num_bios_sent;
	int				num_bios_done;
	int				send_complete;	/* all bios have been issued */
	/* First (lowest-offset) error reported by any child bio. */
	int				first_error;
	uint64_t			first_error_offset;
	struct bintime			ds_t0;		/* devstat transaction start time */
	devstat_tag_type		ds_tag_type;
	devstat_trans_flags		ds_trans_type;
	uint64_t			io_len;		/* total length in bytes */
	uint64_t			io_offset;	/* starting byte offset */
	int				io_arg;		/* misc argument (e.g. flush mode) */
	struct ctl_be_block_softc	*softc;
	struct ctl_be_block_lun		*lun;
	void (*beio_cont)(struct ctl_be_block_io *beio); /* to continue processing */
};
218
219extern struct ctl_softc *control_softc;
220
221static int cbb_num_threads = 14;
222TUNABLE_INT("kern.cam.ctl.block.num_threads", &cbb_num_threads);
223SYSCTL_NODE(_kern_cam_ctl, OID_AUTO, block, CTLFLAG_RD, 0,
224	    "CAM Target Layer Block Backend");
225SYSCTL_INT(_kern_cam_ctl_block, OID_AUTO, num_threads, CTLFLAG_RW,
226           &cbb_num_threads, 0, "Number of threads per backing file");
227
228static struct ctl_be_block_io *ctl_alloc_beio(struct ctl_be_block_softc *softc);
229static void ctl_free_beio(struct ctl_be_block_io *beio);
230static void ctl_complete_beio(struct ctl_be_block_io *beio);
231static int ctl_be_block_move_done(union ctl_io *io);
232static void ctl_be_block_biodone(struct bio *bio);
233static void ctl_be_block_flush_file(struct ctl_be_block_lun *be_lun,
234				    struct ctl_be_block_io *beio);
235static void ctl_be_block_dispatch_file(struct ctl_be_block_lun *be_lun,
236				       struct ctl_be_block_io *beio);
237static void ctl_be_block_gls_file(struct ctl_be_block_lun *be_lun,
238				  struct ctl_be_block_io *beio);
239static uint64_t ctl_be_block_getattr_file(struct ctl_be_block_lun *be_lun,
240					 const char *attrname);
241static void ctl_be_block_flush_dev(struct ctl_be_block_lun *be_lun,
242				   struct ctl_be_block_io *beio);
243static void ctl_be_block_unmap_dev(struct ctl_be_block_lun *be_lun,
244				   struct ctl_be_block_io *beio);
245static void ctl_be_block_dispatch_dev(struct ctl_be_block_lun *be_lun,
246				      struct ctl_be_block_io *beio);
247static uint64_t ctl_be_block_getattr_dev(struct ctl_be_block_lun *be_lun,
248					 const char *attrname);
249static void ctl_be_block_cr_dispatch(struct ctl_be_block_lun *be_lun,
250				    union ctl_io *io);
251static void ctl_be_block_cw_dispatch(struct ctl_be_block_lun *be_lun,
252				    union ctl_io *io);
253static void ctl_be_block_dispatch(struct ctl_be_block_lun *be_lun,
254				  union ctl_io *io);
255static void ctl_be_block_worker(void *context, int pending);
256static int ctl_be_block_submit(union ctl_io *io);
257static int ctl_be_block_ioctl(struct cdev *dev, u_long cmd, caddr_t addr,
258				   int flag, struct thread *td);
259static int ctl_be_block_open_file(struct ctl_be_block_lun *be_lun,
260				  struct ctl_lun_req *req);
261static int ctl_be_block_open_dev(struct ctl_be_block_lun *be_lun,
262				 struct ctl_lun_req *req);
263static int ctl_be_block_close(struct ctl_be_block_lun *be_lun);
264static int ctl_be_block_open(struct ctl_be_block_lun *be_lun,
265			     struct ctl_lun_req *req);
266static int ctl_be_block_create(struct ctl_be_block_softc *softc,
267			       struct ctl_lun_req *req);
268static int ctl_be_block_rm(struct ctl_be_block_softc *softc,
269			   struct ctl_lun_req *req);
270static int ctl_be_block_modify(struct ctl_be_block_softc *softc,
271			   struct ctl_lun_req *req);
272static void ctl_be_block_lun_shutdown(void *be_lun);
273static void ctl_be_block_lun_config_status(void *be_lun,
274					   ctl_lun_config_status status);
275static int ctl_be_block_config_write(union ctl_io *io);
276static int ctl_be_block_config_read(union ctl_io *io);
277static int ctl_be_block_lun_info(void *be_lun, struct sbuf *sb);
278static uint64_t ctl_be_block_lun_attr(void *be_lun, const char *attrname);
279int ctl_be_block_init(void);
280
281static struct ctl_backend_driver ctl_be_block_driver =
282{
283	.name = "block",
284	.flags = CTL_BE_FLAG_HAS_CONFIG,
285	.init = ctl_be_block_init,
286	.data_submit = ctl_be_block_submit,
287	.data_move_done = ctl_be_block_move_done,
288	.config_read = ctl_be_block_config_read,
289	.config_write = ctl_be_block_config_write,
290	.ioctl = ctl_be_block_ioctl,
291	.lun_info = ctl_be_block_lun_info,
292	.lun_attr = ctl_be_block_lun_attr
293};
294
295MALLOC_DEFINE(M_CTLBLK, "ctlblk", "Memory used for CTL block backend");
296CTL_BACKEND_DECLARE(cbb, ctl_be_block_driver);
297
298static uma_zone_t beio_zone;
299
300static struct ctl_be_block_io *
301ctl_alloc_beio(struct ctl_be_block_softc *softc)
302{
303	struct ctl_be_block_io *beio;
304
305	beio = uma_zalloc(beio_zone, M_WAITOK | M_ZERO);
306	beio->softc = softc;
307	return (beio);
308}
309
/*
 * Free a beio and return all of its S/G data buffers to the LUN's buffer
 * zone.  For COMPARE commands the S/G list holds two parallel halves
 * (initiator data and read-back data), so the mirror segment at
 * i + CTLBLK_HALF_SEGS is freed alongside each primary segment.
 */
static void
ctl_free_beio(struct ctl_be_block_io *beio)
{
	int duplicate_free;
	int i;

	duplicate_free = 0;

	for (i = 0; i < beio->num_segs; i++) {
		/*
		 * A NULL address here means the segment was already freed;
		 * count it for the diagnostic below (uma_zfree(NULL) itself
		 * is a no-op).
		 */
		if (beio->sg_segs[i].addr == NULL)
			duplicate_free++;

		uma_zfree(beio->lun->lun_zone, beio->sg_segs[i].addr);
		beio->sg_segs[i].addr = NULL;

		/* For compare we had two equal S/G lists. */
		if (ARGS(beio->io)->flags & CTL_LLF_COMPARE) {
			uma_zfree(beio->lun->lun_zone,
			    beio->sg_segs[i + CTLBLK_HALF_SEGS].addr);
			beio->sg_segs[i + CTLBLK_HALF_SEGS].addr = NULL;
		}
	}

	if (duplicate_free > 0) {
		printf("%s: %d duplicate frees out of %d segments\n", __func__,
		       duplicate_free, beio->num_segs);
	}

	uma_zfree(beio_zone, beio);
}
340
341static void
342ctl_complete_beio(struct ctl_be_block_io *beio)
343{
344	union ctl_io *io = beio->io;
345
346	if (beio->beio_cont != NULL) {
347		beio->beio_cont(beio);
348	} else {
349		ctl_free_beio(beio);
350		ctl_data_submit_done(io);
351	}
352}
353
/*
 * Compare two buffers byte by byte.  Returns the count of leading bytes
 * that match — i.e. the index of the first mismatching byte, or `size'
 * when the buffers are identical over the full range.
 */
static size_t
cmp(uint8_t *a, uint8_t *b, size_t size)
{
	size_t pos;

	pos = 0;
	while (pos < size && a[pos] == b[pos])
		pos++;

	return (pos);
}
365
/*
 * Complete a COMPARE command once both halves of the S/G list are filled:
 * the first CTLBLK_HALF_SEGS segments hold the initiator's data and the
 * mirror segments at i + CTLBLK_HALF_SEGS hold the data read from the
 * backing store.  On a mismatch, set MISCOMPARE sense with the byte
 * offset of the first differing byte in the INFORMATION field.
 */
static void
ctl_be_block_compare(union ctl_io *io)
{
	struct ctl_be_block_io *beio;
	uint64_t off, res;
	int i;
	uint8_t info[8];

	beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
	off = 0;
	for (i = 0; i < beio->num_segs; i++) {
		res = cmp(beio->sg_segs[i].addr,
		    beio->sg_segs[i + CTLBLK_HALF_SEGS].addr,
		    beio->sg_segs[i].len);
		off += res;
		/* First mismatch found; `off' is now its byte offset. */
		if (res < beio->sg_segs[i].len)
			break;
	}
	if (i < beio->num_segs) {
		scsi_u64to8b(off, info);
		ctl_set_sense(&io->scsiio, /*current_error*/ 1,
		    /*sense_key*/ SSD_KEY_MISCOMPARE,
		    /*asc*/ 0x1D, /*ascq*/ 0x00,
		    /*type*/ SSD_ELEM_INFO,
		    /*size*/ sizeof(info), /*data*/ &info,
		    /*type*/ SSD_ELEM_NONE);
	} else
		ctl_set_success(&io->scsiio);
}
395
/*
 * DMA-completion callback for this backend.  Accounts DMA time/stats,
 * sets final status for reads (and for writes that failed during DMA),
 * and for successful writes queues the I/O to the LUN's worker taskqueue
 * to perform the actual backend write — we may be called from interrupt
 * context and must not block here.
 */
static int
ctl_be_block_move_done(union ctl_io *io)
{
	struct ctl_be_block_io *beio;
	struct ctl_be_block_lun *be_lun;
	struct ctl_lba_len_flags *lbalen;
#ifdef CTL_TIME_IO
	struct bintime cur_bt;
#endif

	beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
	be_lun = beio->lun;

	DPRINTF("entered\n");

#ifdef CTL_TIME_IO
	/* Accumulate the time spent in this DMA into the I/O's totals. */
	getbinuptime(&cur_bt);
	bintime_sub(&cur_bt, &io->io_hdr.dma_start_bt);
	bintime_add(&io->io_hdr.dma_bt, &cur_bt);
#endif
	io->io_hdr.num_dmas++;
	io->scsiio.kern_rel_offset += io->scsiio.kern_data_len;

	/*
	 * We set status at this point for read commands, and write
	 * commands with errors.
	 */
	if (io->io_hdr.flags & CTL_FLAG_ABORT) {
		;
	} else if ((io->io_hdr.port_status != 0) &&
	    ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE ||
	     (io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS)) {
		/* Frontend port reported a transfer error. */
		ctl_set_internal_failure(&io->scsiio, /*sks_valid*/ 1,
		    /*retry_count*/ io->io_hdr.port_status);
	} else if (io->scsiio.kern_data_resid != 0 &&
	    (io->io_hdr.flags & CTL_FLAG_DATA_MASK) == CTL_FLAG_DATA_OUT &&
	    ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE ||
	     (io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS)) {
		/* Write transferred less data than the command requires. */
		ctl_set_invalid_field_ciu(&io->scsiio);
	} else if ((io->io_hdr.port_status == 0) &&
	    ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE)) {
		lbalen = ARGS(beio->io);
		if (lbalen->flags & CTL_LLF_READ) {
			ctl_set_success(&io->scsiio);
		} else if (lbalen->flags & CTL_LLF_COMPARE) {
			/* We have two data blocks ready for comparison. */
			ctl_be_block_compare(io);
		}
	}

	/*
	 * If this is a read, or a write with errors, it is done.
	 */
	if ((beio->bio_cmd == BIO_READ)
	 || ((io->io_hdr.flags & CTL_FLAG_ABORT) != 0)
	 || ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_STATUS_NONE)) {
		ctl_complete_beio(beio);
		return (0);
	}

	/*
	 * At this point, we have a write and the DMA completed
	 * successfully.  We now have to queue it to the task queue to
	 * execute the backend I/O.  That is because we do blocking
	 * memory allocations, and in the file backing case, blocking I/O.
	 * This move done routine is generally called in the SIM's
	 * interrupt context, and therefore we cannot block.
	 */
	mtx_lock(&be_lun->queue_lock);
	STAILQ_INSERT_TAIL(&be_lun->datamove_queue, &io->io_hdr, links);
	mtx_unlock(&be_lun->queue_lock);
	taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task);

	return (0);
}
471
/*
 * Completion callback for every bio issued by the device-backed paths.
 * Records the first (lowest-offset) error, and when the last outstanding
 * bio of the beio finishes, closes the devstat transaction, maps any
 * error to SCSI sense, and either completes the beio (writes, flushes,
 * deletes, verifies, errors) or starts the datamove to return read data.
 */
static void
ctl_be_block_biodone(struct bio *bio)
{
	struct ctl_be_block_io *beio;
	struct ctl_be_block_lun *be_lun;
	union ctl_io *io;
	int error;

	beio = bio->bio_caller1;
	be_lun = beio->lun;
	io = beio->io;

	DPRINTF("entered\n");

	error = bio->bio_error;
	mtx_lock(&be_lun->io_lock);
	/* Keep the error with the lowest offset as the reported one. */
	if (error != 0 &&
	    (beio->first_error == 0 ||
	     bio->bio_offset < beio->first_error_offset)) {
		beio->first_error = error;
		beio->first_error_offset = bio->bio_offset;
	}

	beio->num_bios_done++;

	/*
	 * XXX KDM will this cause WITNESS to complain?  Holding a lock
	 * during the free might cause it to complain.
	 */
	g_destroy_bio(bio);

	/*
	 * If the send complete bit isn't set, or we aren't the last I/O to
	 * complete, then we're done.
	 */
	if ((beio->send_complete == 0)
	 || (beio->num_bios_done < beio->num_bios_sent)) {
		mtx_unlock(&be_lun->io_lock);
		return;
	}

	/*
	 * At this point, we've verified that we are the last I/O to
	 * complete, so it's safe to drop the lock.
	 */
	devstat_end_transaction(beio->lun->disk_stats, beio->io_len,
	    beio->ds_tag_type, beio->ds_trans_type,
	    /*now*/ NULL, /*then*/&beio->ds_t0);
	mtx_unlock(&be_lun->io_lock);

	/*
	 * If there are any errors from the backing device, we fail the
	 * entire I/O with a medium error.
	 */
	error = beio->first_error;
	if (error != 0) {
		if (error == EOPNOTSUPP) {
			ctl_set_invalid_opcode(&io->scsiio);
		} else if (error == ENOSPC || error == EDQUOT) {
			ctl_set_space_alloc_fail(&io->scsiio);
		} else if (error == EROFS || error == EACCES) {
			ctl_set_hw_write_protected(&io->scsiio);
		} else if (beio->bio_cmd == BIO_FLUSH) {
			/* XXX KDM is there is a better error here? */
			ctl_set_internal_failure(&io->scsiio,
						 /*sks_valid*/ 1,
						 /*retry_count*/ 0xbad2);
		} else {
			ctl_set_medium_error(&io->scsiio,
			    beio->bio_cmd == BIO_READ);
		}
		ctl_complete_beio(beio);
		return;
	}

	/*
	 * If this is a write, a flush, a delete or verify, we're all done.
	 * If this is a read, we can now send the data to the user.
	 */
	if ((beio->bio_cmd == BIO_WRITE)
	 || (beio->bio_cmd == BIO_FLUSH)
	 || (beio->bio_cmd == BIO_DELETE)
	 || (ARGS(io)->flags & CTL_LLF_VERIFY)) {
		ctl_set_success(&io->scsiio);
		ctl_complete_beio(beio);
	} else {
		if ((ARGS(io)->flags & CTL_LLF_READ) &&
		    beio->beio_cont == NULL) {
			ctl_set_success(&io->scsiio);
			ctl_serseq_done(io);
		}
#ifdef CTL_TIME_IO
		getbinuptime(&io->io_hdr.dma_start_bt);
#endif
		ctl_datamove(io);
	}
}
569
/*
 * SYNCHRONIZE CACHE for file-backed LUNs: fsync the backing vnode.
 * beio->io_arg selects MNT_NOWAIT (start the writes) vs. MNT_WAIT
 * (wait for them).  Takes a shared vnode lock when the filesystem
 * supports shared writes, exclusive otherwise.
 */
static void
ctl_be_block_flush_file(struct ctl_be_block_lun *be_lun,
			struct ctl_be_block_io *beio)
{
	union ctl_io *io = beio->io;
	struct mount *mountpoint;
	int error, lock_flags;

	DPRINTF("entered\n");

	binuptime(&beio->ds_t0);
	mtx_lock(&be_lun->io_lock);
	devstat_start_transaction(beio->lun->disk_stats, &beio->ds_t0);
	mtx_unlock(&be_lun->io_lock);

	(void) vn_start_write(be_lun->vn, &mountpoint, V_WAIT);

	if (MNT_SHARED_WRITES(mountpoint) ||
	    ((mountpoint == NULL) && MNT_SHARED_WRITES(be_lun->vn->v_mount)))
		lock_flags = LK_SHARED;
	else
		lock_flags = LK_EXCLUSIVE;
	vn_lock(be_lun->vn, lock_flags | LK_RETRY);
	error = VOP_FSYNC(be_lun->vn, beio->io_arg ? MNT_NOWAIT : MNT_WAIT,
	    curthread);
	VOP_UNLOCK(be_lun->vn, 0);

	vn_finished_write(mountpoint);

	mtx_lock(&be_lun->io_lock);
	devstat_end_transaction(beio->lun->disk_stats, beio->io_len,
	    beio->ds_tag_type, beio->ds_trans_type,
	    /*now*/ NULL, /*then*/&beio->ds_t0);
	mtx_unlock(&be_lun->io_lock);

	if (error == 0)
		ctl_set_success(&io->scsiio);
	else {
		/* XXX KDM is there is a better error here? */
		ctl_set_internal_failure(&io->scsiio,
					 /*sks_valid*/ 1,
					 /*retry_count*/ 0xbad1);
	}

	ctl_complete_beio(beio);
}
616
617SDT_PROBE_DEFINE1(cbb, , read, file_start, "uint64_t");
618SDT_PROBE_DEFINE1(cbb, , write, file_start, "uint64_t");
619SDT_PROBE_DEFINE1(cbb, , read, file_done,"uint64_t");
620SDT_PROBE_DEFINE1(cbb, , write, file_done, "uint64_t");
621
/*
 * Execute a read or write against a file-backed LUN using VOP_READ /
 * VOP_WRITE with a kernel uio built from the beio's S/G list.  DPO maps
 * to IO_DIRECT and FUA (writes only) to IO_SYNC.  On success, writes and
 * verifies complete here; reads proceed to ctl_datamove() to return the
 * data to the initiator.
 */
static void
ctl_be_block_dispatch_file(struct ctl_be_block_lun *be_lun,
			   struct ctl_be_block_io *beio)
{
	struct ctl_be_block_filedata *file_data;
	union ctl_io *io;
	struct uio xuio;
	struct iovec *xiovec;
	size_t s;
	int error, flags, i;

	DPRINTF("entered\n");

	file_data = &be_lun->backend.file;
	io = beio->io;
	flags = 0;
	if (ARGS(io)->flags & CTL_LLF_DPO)
		flags |= IO_DIRECT;
	if (beio->bio_cmd == BIO_WRITE && ARGS(io)->flags & CTL_LLF_FUA)
		flags |= IO_SYNC;

	bzero(&xuio, sizeof(xuio));
	if (beio->bio_cmd == BIO_READ) {
		SDT_PROBE0(cbb, , read, file_start);
		xuio.uio_rw = UIO_READ;
	} else {
		SDT_PROBE0(cbb, , write, file_start);
		xuio.uio_rw = UIO_WRITE;
	}
	xuio.uio_offset = beio->io_offset;
	xuio.uio_resid = beio->io_len;
	xuio.uio_segflg = UIO_SYSSPACE;
	xuio.uio_iov = beio->xiovecs;
	xuio.uio_iovcnt = beio->num_segs;
	xuio.uio_td = curthread;

	/* Mirror the S/G list into the iovec array for the uio. */
	for (i = 0, xiovec = xuio.uio_iov; i < xuio.uio_iovcnt; i++, xiovec++) {
		xiovec->iov_base = beio->sg_segs[i].addr;
		xiovec->iov_len = beio->sg_segs[i].len;
	}

	binuptime(&beio->ds_t0);
	mtx_lock(&be_lun->io_lock);
	devstat_start_transaction(beio->lun->disk_stats, &beio->ds_t0);
	mtx_unlock(&be_lun->io_lock);

	if (beio->bio_cmd == BIO_READ) {
		vn_lock(be_lun->vn, LK_SHARED | LK_RETRY);

		/*
		 * UFS pays attention to IO_DIRECT for reads.  If the
		 * DIRECTIO option is configured into the kernel, it calls
		 * ffs_rawread().  But that only works for single-segment
		 * uios with user space addresses.  In our case, with a
		 * kernel uio, it still reads into the buffer cache, but it
		 * will just try to release the buffer from the cache later
		 * on in ffs_read().
		 *
		 * ZFS does not pay attention to IO_DIRECT for reads.
		 *
		 * UFS does not pay attention to IO_SYNC for reads.
		 *
		 * ZFS pays attention to IO_SYNC (which translates into the
		 * Solaris define FRSYNC for zfs_read()) for reads.  It
		 * attempts to sync the file before reading.
		 */
		error = VOP_READ(be_lun->vn, &xuio, flags, file_data->cred);

		VOP_UNLOCK(be_lun->vn, 0);
		SDT_PROBE0(cbb, , read, file_done);
		if (error == 0 && xuio.uio_resid > 0) {
			/*
			 * If we read less than requested (EOF), then
			 * we should clean the rest of the buffer.
			 */
			s = beio->io_len - xuio.uio_resid;
			for (i = 0; i < beio->num_segs; i++) {
				if (s >= beio->sg_segs[i].len) {
					s -= beio->sg_segs[i].len;
					continue;
				}
				bzero((uint8_t *)beio->sg_segs[i].addr + s,
				    beio->sg_segs[i].len - s);
				s = 0;
			}
		}
	} else {
		struct mount *mountpoint;
		int lock_flags;

		(void)vn_start_write(be_lun->vn, &mountpoint, V_WAIT);

		/* Shared vnode lock if the filesystem allows shared writes. */
		if (MNT_SHARED_WRITES(mountpoint) || ((mountpoint == NULL)
		  && MNT_SHARED_WRITES(be_lun->vn->v_mount)))
			lock_flags = LK_SHARED;
		else
			lock_flags = LK_EXCLUSIVE;
		vn_lock(be_lun->vn, lock_flags | LK_RETRY);

		/*
		 * UFS pays attention to IO_DIRECT for writes.  The write
		 * is done asynchronously.  (Normally the write would just
		 * get put into cache.
		 *
		 * UFS pays attention to IO_SYNC for writes.  It will
		 * attempt to write the buffer out synchronously if that
		 * flag is set.
		 *
		 * ZFS does not pay attention to IO_DIRECT for writes.
		 *
		 * ZFS pays attention to IO_SYNC (a.k.a. FSYNC or FRSYNC)
		 * for writes.  It will flush the transaction from the
		 * cache before returning.
		 */
		error = VOP_WRITE(be_lun->vn, &xuio, flags, file_data->cred);
		VOP_UNLOCK(be_lun->vn, 0);

		vn_finished_write(mountpoint);
		SDT_PROBE0(cbb, , write, file_done);
	}

	mtx_lock(&be_lun->io_lock);
	devstat_end_transaction(beio->lun->disk_stats, beio->io_len,
	    beio->ds_tag_type, beio->ds_trans_type,
	    /*now*/ NULL, /*then*/&beio->ds_t0);
	mtx_unlock(&be_lun->io_lock);

	/*
	 * If we got an error, set the sense data to "MEDIUM ERROR" and
	 * return the I/O to the user.
	 */
	if (error != 0) {
		if (error == ENOSPC || error == EDQUOT) {
			ctl_set_space_alloc_fail(&io->scsiio);
		} else if (error == EROFS || error == EACCES) {
			ctl_set_hw_write_protected(&io->scsiio);
		} else {
			ctl_set_medium_error(&io->scsiio,
			    beio->bio_cmd == BIO_READ);
		}
		ctl_complete_beio(beio);
		return;
	}

	/*
	 * If this is a write or a verify, we're all done.
	 * If this is a read, we can now send the data to the user.
	 */
	if ((beio->bio_cmd == BIO_WRITE) ||
	    (ARGS(io)->flags & CTL_LLF_VERIFY)) {
		ctl_set_success(&io->scsiio);
		ctl_complete_beio(beio);
	} else {
		if ((ARGS(io)->flags & CTL_LLF_READ) &&
		    beio->beio_cont == NULL) {
			ctl_set_success(&io->scsiio);
			ctl_serseq_done(io);
		}
#ifdef CTL_TIME_IO
		getbinuptime(&io->io_hdr.dma_start_bt);
#endif
		ctl_datamove(io);
	}
}
786
/*
 * GET LBA STATUS for file-backed LUNs.  Uses FIOSEEKHOLE/FIOSEEKDATA
 * ioctls on the backing vnode to determine whether the range starting
 * at the requested LBA is mapped (status 0) or deallocated (status 1),
 * and fills in the first descriptor of the response accordingly.  When
 * neither probe advances the offset, the rest of the LUN is reported
 * as mapped/unknown.
 */
static void
ctl_be_block_gls_file(struct ctl_be_block_lun *be_lun,
			struct ctl_be_block_io *beio)
{
	union ctl_io *io = beio->io;
	struct ctl_lba_len_flags *lbalen = ARGS(io);
	struct scsi_get_lba_status_data *data;
	off_t roff, off;
	int error, status;

	DPRINTF("entered\n");

	off = roff = ((off_t)lbalen->lba) * be_lun->cbe_lun.blocksize;
	vn_lock(be_lun->vn, LK_SHARED | LK_RETRY);
	error = VOP_IOCTL(be_lun->vn, FIOSEEKHOLE, &off,
	    0, curthread->td_ucred, curthread);
	if (error == 0 && off > roff)
		status = 0;	/* mapped up to off */
	else {
		error = VOP_IOCTL(be_lun->vn, FIOSEEKDATA, &off,
		    0, curthread->td_ucred, curthread);
		if (error == 0 && off > roff)
			status = 1;	/* deallocated up to off */
		else {
			status = 0;	/* unknown up to the end */
			off = be_lun->size_bytes;
		}
	}
	VOP_UNLOCK(be_lun->vn, 0);

	data = (struct scsi_get_lba_status_data *)io->scsiio.kern_data_ptr;
	scsi_u64to8b(lbalen->lba, data->descr[0].addr);
	/* Length is clamped to 32 bits per the descriptor format. */
	scsi_ulto4b(MIN(UINT32_MAX, off / be_lun->cbe_lun.blocksize -
	    lbalen->lba), data->descr[0].length);
	data->descr[0].status = status;

	ctl_complete_beio(beio);
}
825
/*
 * Return a named attribute for a file-backed LUN, in units of LUN blocks:
 * "blocksused" from the vnode's va_bytes, "blocksavail" from the backing
 * filesystem's free space.  Returns UINT64_MAX when the attribute is
 * unknown or cannot be obtained.
 */
static uint64_t
ctl_be_block_getattr_file(struct ctl_be_block_lun *be_lun, const char *attrname)
{
	struct vattr		vattr;
	struct statfs		statfs;
	uint64_t		val;
	int			error;

	val = UINT64_MAX;
	if (be_lun->vn == NULL)
		return (val);
	vn_lock(be_lun->vn, LK_SHARED | LK_RETRY);
	if (strcmp(attrname, "blocksused") == 0) {
		error = VOP_GETATTR(be_lun->vn, &vattr, curthread->td_ucred);
		if (error == 0)
			val = vattr.va_bytes / be_lun->cbe_lun.blocksize;
	}
	/* Skip statfs on a doomed (forcibly unmounted) vnode. */
	if (strcmp(attrname, "blocksavail") == 0 &&
	    (be_lun->vn->v_iflag & VI_DOOMED) == 0) {
		error = VFS_STATFS(be_lun->vn->v_mount, &statfs);
		if (error == 0)
			val = statfs.f_bavail * statfs.f_bsize /
			    be_lun->cbe_lun.blocksize;
	}
	VOP_UNLOCK(be_lun->vn, 0);
	return (val);
}
853
/*
 * Execute a read or write against a zvol-backed (cdev) LUN by calling
 * the device's d_read/d_write entry points directly with a kernel uio
 * built from the beio's S/G list.  DPO maps to IO_DIRECT and FUA
 * (writes only) to IO_SYNC.  If the device has gone away (no cdevsw),
 * the I/O fails with ENXIO, which is reported as a medium error below.
 */
static void
ctl_be_block_dispatch_zvol(struct ctl_be_block_lun *be_lun,
			   struct ctl_be_block_io *beio)
{
	union ctl_io *io;
	struct cdevsw *csw;
	struct cdev *dev;
	struct uio xuio;
	struct iovec *xiovec;
	int error, flags, i, ref;

	DPRINTF("entered\n");

	io = beio->io;
	flags = 0;
	if (ARGS(io)->flags & CTL_LLF_DPO)
		flags |= IO_DIRECT;
	if (beio->bio_cmd == BIO_WRITE && ARGS(io)->flags & CTL_LLF_FUA)
		flags |= IO_SYNC;

	bzero(&xuio, sizeof(xuio));
	if (beio->bio_cmd == BIO_READ) {
		SDT_PROBE0(cbb, , read, file_start);
		xuio.uio_rw = UIO_READ;
	} else {
		SDT_PROBE0(cbb, , write, file_start);
		xuio.uio_rw = UIO_WRITE;
	}
	xuio.uio_offset = beio->io_offset;
	xuio.uio_resid = beio->io_len;
	xuio.uio_segflg = UIO_SYSSPACE;
	xuio.uio_iov = beio->xiovecs;
	xuio.uio_iovcnt = beio->num_segs;
	xuio.uio_td = curthread;

	/* Mirror the S/G list into the iovec array for the uio. */
	for (i = 0, xiovec = xuio.uio_iov; i < xuio.uio_iovcnt; i++, xiovec++) {
		xiovec->iov_base = beio->sg_segs[i].addr;
		xiovec->iov_len = beio->sg_segs[i].len;
	}

	binuptime(&beio->ds_t0);
	mtx_lock(&be_lun->io_lock);
	devstat_start_transaction(beio->lun->disk_stats, &beio->ds_t0);
	mtx_unlock(&be_lun->io_lock);

	/* Hold a threaded reference on the cdev across the call. */
	csw = devvn_refthread(be_lun->vn, &dev, &ref);
	if (csw) {
		if (beio->bio_cmd == BIO_READ)
			error = csw->d_read(dev, &xuio, flags);
		else
			error = csw->d_write(dev, &xuio, flags);
		dev_relthread(dev, ref);
	} else
		error = ENXIO;

	if (beio->bio_cmd == BIO_READ)
		SDT_PROBE0(cbb, , read, file_done);
	else
		SDT_PROBE0(cbb, , write, file_done);

	mtx_lock(&be_lun->io_lock);
	devstat_end_transaction(beio->lun->disk_stats, beio->io_len,
	    beio->ds_tag_type, beio->ds_trans_type,
	    /*now*/ NULL, /*then*/&beio->ds_t0);
	mtx_unlock(&be_lun->io_lock);

	/*
	 * If we got an error, set the sense data to "MEDIUM ERROR" and
	 * return the I/O to the user.
	 */
	if (error != 0) {
		if (error == ENOSPC || error == EDQUOT) {
			ctl_set_space_alloc_fail(&io->scsiio);
		} else if (error == EROFS || error == EACCES) {
			ctl_set_hw_write_protected(&io->scsiio);
		} else {
			ctl_set_medium_error(&io->scsiio,
			    beio->bio_cmd == BIO_READ);
		}
		ctl_complete_beio(beio);
		return;
	}

	/*
	 * If this is a write or a verify, we're all done.
	 * If this is a read, we can now send the data to the user.
	 */
	if ((beio->bio_cmd == BIO_WRITE) ||
	    (ARGS(io)->flags & CTL_LLF_VERIFY)) {
		ctl_set_success(&io->scsiio);
		ctl_complete_beio(beio);
	} else {
		if ((ARGS(io)->flags & CTL_LLF_READ) &&
		    beio->beio_cont == NULL) {
			ctl_set_success(&io->scsiio);
			ctl_serseq_done(io);
		}
#ifdef CTL_TIME_IO
		getbinuptime(&io->io_hdr.dma_start_bt);
#endif
		ctl_datamove(io);
	}
}
957
/*
 * GET LBA STATUS for zvol-backed LUNs.  Same logic as the file variant,
 * but probes via the cdev's d_ioctl (FIOSEEKHOLE/FIOSEEKDATA) instead of
 * VOP_IOCTL.  If the device is gone, the whole remaining range is
 * reported with status 0 (mapped/unknown).
 */
static void
ctl_be_block_gls_zvol(struct ctl_be_block_lun *be_lun,
			struct ctl_be_block_io *beio)
{
	union ctl_io *io = beio->io;
	struct cdevsw *csw;
	struct cdev *dev;
	struct ctl_lba_len_flags *lbalen = ARGS(io);
	struct scsi_get_lba_status_data *data;
	off_t roff, off;
	int error, ref, status;

	DPRINTF("entered\n");

	csw = devvn_refthread(be_lun->vn, &dev, &ref);
	if (csw == NULL) {
		status = 0;	/* unknown up to the end */
		off = be_lun->size_bytes;
		goto done;
	}
	off = roff = ((off_t)lbalen->lba) * be_lun->cbe_lun.blocksize;
	error = csw->d_ioctl(dev, FIOSEEKHOLE, (caddr_t)&off, FREAD,
	    curthread);
	if (error == 0 && off > roff)
		status = 0;	/* mapped up to off */
	else {
		error = csw->d_ioctl(dev, FIOSEEKDATA, (caddr_t)&off, FREAD,
		    curthread);
		if (error == 0 && off > roff)
			status = 1;	/* deallocated up to off */
		else {
			status = 0;	/* unknown up to the end */
			off = be_lun->size_bytes;
		}
	}
	dev_relthread(dev, ref);

done:
	data = (struct scsi_get_lba_status_data *)io->scsiio.kern_data_ptr;
	scsi_u64to8b(lbalen->lba, data->descr[0].addr);
	/* Length is clamped to 32 bits per the descriptor format. */
	scsi_ulto4b(MIN(UINT32_MAX, off / be_lun->cbe_lun.blocksize -
	    lbalen->lba), data->descr[0].length);
	data->descr[0].status = status;

	ctl_complete_beio(beio);
}
1004
/*
 * SYNCHRONIZE CACHE for device-backed LUNs: issue a single BIO_FLUSH to
 * the backing cdev.  Completion (including the ENXIO case when the
 * device is gone) is handled in ctl_be_block_biodone().
 */
static void
ctl_be_block_flush_dev(struct ctl_be_block_lun *be_lun,
		       struct ctl_be_block_io *beio)
{
	struct bio *bio;
	struct cdevsw *csw;
	struct cdev *dev;
	int ref;

	DPRINTF("entered\n");

	/* This can't fail, it's a blocking allocation. */
	bio = g_alloc_bio();

	bio->bio_cmd	    = BIO_FLUSH;
	bio->bio_offset	    = 0;
	bio->bio_data	    = 0;
	bio->bio_done	    = ctl_be_block_biodone;
	bio->bio_caller1    = beio;
	bio->bio_pblkno	    = 0;

	/*
	 * We don't need to acquire the LUN lock here, because we are only
	 * sending one bio, and so there is no other context to synchronize
	 * with.
	 */
	beio->num_bios_sent = 1;
	beio->send_complete = 1;

	binuptime(&beio->ds_t0);
	mtx_lock(&be_lun->io_lock);
	devstat_start_transaction(be_lun->disk_stats, &beio->ds_t0);
	mtx_unlock(&be_lun->io_lock);

	csw = devvn_refthread(be_lun->vn, &dev, &ref);
	if (csw) {
		bio->bio_dev = dev;
		csw->d_strategy(bio);
		dev_relthread(dev, ref);
	} else {
		/* Device is gone; fail the bio through the normal path. */
		bio->bio_error = ENXIO;
		ctl_be_block_biodone(bio);
	}
}
1049
/*
 * Issue BIO_DELETE bios covering the byte range [off, off + len),
 * splitting it into chunks of at most LONG_MAX bytes rounded down to a
 * blocksize multiple.  'last' indicates this is the final range of the
 * request, so send_complete is set together with the final bio.  If the
 * device reference cannot be obtained, each bio is failed with ENXIO
 * through ctl_be_block_biodone().
 */
static void
ctl_be_block_unmap_dev_range(struct ctl_be_block_lun *be_lun,
		       struct ctl_be_block_io *beio,
		       uint64_t off, uint64_t len, int last)
{
	struct bio *bio;
	uint64_t maxlen;
	struct cdevsw *csw;
	struct cdev *dev;
	int ref;

	csw = devvn_refthread(be_lun->vn, &dev, &ref);
	/* Largest per-bio length that is a whole number of blocks. */
	maxlen = LONG_MAX - (LONG_MAX % be_lun->cbe_lun.blocksize);
	while (len > 0) {
		bio = g_alloc_bio();
		bio->bio_cmd	    = BIO_DELETE;
		bio->bio_dev	    = dev;
		bio->bio_offset	    = off;
		bio->bio_length	    = MIN(len, maxlen);
		bio->bio_data	    = 0;
		bio->bio_done	    = ctl_be_block_biodone;
		bio->bio_caller1    = beio;
		bio->bio_pblkno     = off / be_lun->cbe_lun.blocksize;

		off += bio->bio_length;
		len -= bio->bio_length;

		/*
		 * Count the bio (and possibly mark the batch complete)
		 * under the LUN I/O lock before it can be completed.
		 */
		mtx_lock(&be_lun->io_lock);
		beio->num_bios_sent++;
		if (last && len == 0)
			beio->send_complete = 1;
		mtx_unlock(&be_lun->io_lock);

		if (csw) {
			csw->d_strategy(bio);
		} else {
			bio->bio_error = ENXIO;
			ctl_be_block_biodone(bio);
		}
	}
	if (csw)
		dev_relthread(dev, ref);
}
1093
/*
 * Unmap method for cdev-backed LUNs.  io_offset == -1 means an UNMAP
 * command whose descriptor list (in ptrlen) must be walked, deleting
 * each LBA/length extent; otherwise a single precomputed byte range
 * (e.g. from WRITE SAME with UNMAP) is deleted.
 */
static void
ctl_be_block_unmap_dev(struct ctl_be_block_lun *be_lun,
		       struct ctl_be_block_io *beio)
{
	union ctl_io *io;
	struct ctl_ptr_len_flags *ptrlen;
	struct scsi_unmap_desc *buf, *end;
	uint64_t len;

	io = beio->io;

	DPRINTF("entered\n");

	/* Start devstat accounting before any bios are issued. */
	binuptime(&beio->ds_t0);
	mtx_lock(&be_lun->io_lock);
	devstat_start_transaction(be_lun->disk_stats, &beio->ds_t0);
	mtx_unlock(&be_lun->io_lock);

	if (beio->io_offset == -1) {
		/* UNMAP: accumulate io_len while walking the descriptors. */
		beio->io_len = 0;
		ptrlen = (struct ctl_ptr_len_flags *)&io->io_hdr.ctl_private[CTL_PRIV_LBA_LEN];
		buf = (struct scsi_unmap_desc *)ptrlen->ptr;
		end = buf + ptrlen->len / sizeof(*buf);
		for (; buf < end; buf++) {
			len = (uint64_t)scsi_4btoul(buf->length) *
			    be_lun->cbe_lun.blocksize;
			beio->io_len += len;
			/* 'last' is TRUE only for the final descriptor. */
			ctl_be_block_unmap_dev_range(be_lun, beio,
			    scsi_8btou64(buf->lba) * be_lun->cbe_lun.blocksize,
			    len, (end - buf < 2) ? TRUE : FALSE);
		}
	} else
		ctl_be_block_unmap_dev_range(be_lun, beio,
		    beio->io_offset, beio->io_len, TRUE);
}
1129
/*
 * Read/write dispatch method for cdev-backed LUNs: build one bio per
 * max_iosize-sized piece of every S/G segment, queue them locally, then
 * fire them all at the device.  Completions are delivered to
 * ctl_be_block_biodone(); if the device reference failed, each bio is
 * completed with ENXIO instead.
 */
static void
ctl_be_block_dispatch_dev(struct ctl_be_block_lun *be_lun,
			  struct ctl_be_block_io *beio)
{
	TAILQ_HEAD(, bio) queue = TAILQ_HEAD_INITIALIZER(queue);
	struct bio *bio;
	struct cdevsw *csw;
	struct cdev *dev;
	off_t cur_offset;
	int i, max_iosize, ref;

	DPRINTF("entered\n");
	csw = devvn_refthread(be_lun->vn, &dev, &ref);

	/*
	 * We have to limit our I/O size to the maximum supported by the
	 * backend device.  Hopefully it is MAXPHYS.  If the driver doesn't
	 * set it properly, use DFLTPHYS.
	 */
	if (csw) {
		max_iosize = dev->si_iosize_max;
		if (max_iosize < PAGE_SIZE)
			max_iosize = DFLTPHYS;
	} else
		max_iosize = DFLTPHYS;

	/* Carve each S/G segment into bios of at most max_iosize bytes. */
	cur_offset = beio->io_offset;
	for (i = 0; i < beio->num_segs; i++) {
		size_t cur_size;
		uint8_t *cur_ptr;

		cur_size = beio->sg_segs[i].len;
		cur_ptr = beio->sg_segs[i].addr;

		while (cur_size > 0) {
			/* This can't fail, it's a blocking allocation. */
			bio = g_alloc_bio();

			KASSERT(bio != NULL, ("g_alloc_bio() failed!\n"));

			bio->bio_cmd = beio->bio_cmd;
			bio->bio_dev = dev;
			bio->bio_caller1 = beio;
			bio->bio_length = min(cur_size, max_iosize);
			bio->bio_offset = cur_offset;
			bio->bio_data = cur_ptr;
			bio->bio_done = ctl_be_block_biodone;
			bio->bio_pblkno = cur_offset / be_lun->cbe_lun.blocksize;

			cur_offset += bio->bio_length;
			cur_ptr += bio->bio_length;
			cur_size -= bio->bio_length;

			TAILQ_INSERT_TAIL(&queue, bio, bio_queue);
			beio->num_bios_sent++;
		}
	}
	/*
	 * All bios are counted before any can complete, so send_complete
	 * can be set up front under the LUN I/O lock.
	 */
	binuptime(&beio->ds_t0);
	mtx_lock(&be_lun->io_lock);
	devstat_start_transaction(be_lun->disk_stats, &beio->ds_t0);
	beio->send_complete = 1;
	mtx_unlock(&be_lun->io_lock);

	/*
	 * Fire off all allocated requests!
	 */
	while ((bio = TAILQ_FIRST(&queue)) != NULL) {
		TAILQ_REMOVE(&queue, bio, bio_queue);
		if (csw)
			csw->d_strategy(bio);
		else {
			bio->bio_error = ENXIO;
			ctl_be_block_biodone(bio);
		}
	}
	if (csw)
		dev_relthread(dev, ref);
}
1208
/*
 * Getattr method for cdev-backed LUNs: fetch a named attribute via the
 * DIOCGATTR ioctl.  Returns UINT64_MAX if the device is gone, has no
 * d_ioctl, or the ioctl fails.
 */
static uint64_t
ctl_be_block_getattr_dev(struct ctl_be_block_lun *be_lun, const char *attrname)
{
	struct diocgattr_arg	arg;
	struct cdevsw *csw;
	struct cdev *dev;
	int error, ref;

	csw = devvn_refthread(be_lun->vn, &dev, &ref);
	if (csw == NULL)
		return (UINT64_MAX);
	strlcpy(arg.name, attrname, sizeof(arg.name));
	arg.len = sizeof(arg.value.off);
	if (csw->d_ioctl) {
		error = csw->d_ioctl(dev, DIOCGATTR, (caddr_t)&arg, FREAD,
		    curthread);
	} else
		error = ENODEV;
	dev_relthread(dev, ref);
	if (error != 0)
		return (UINT64_MAX);
	return (arg.value.off);
}
1232
/*
 * Config-write handler for SYNCHRONIZE CACHE (10/16): translate the
 * request's LBA range into byte offsets and hand a BIO_FLUSH to the
 * backend flush method.  io_arg records whether the IMMED bit was set.
 */
static void
ctl_be_block_cw_dispatch_sync(struct ctl_be_block_lun *be_lun,
			    union ctl_io *io)
{
	struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
	struct ctl_be_block_io *beio;
	struct ctl_lba_len_flags *lbalen;

	DPRINTF("entered\n");
	beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
	lbalen = (struct ctl_lba_len_flags *)&io->io_hdr.ctl_private[CTL_PRIV_LBA_LEN];

	beio->io_len = lbalen->len * cbe_lun->blocksize;
	beio->io_offset = lbalen->lba * cbe_lun->blocksize;
	beio->io_arg = (lbalen->flags & SSC_IMMED) != 0;
	beio->bio_cmd = BIO_FLUSH;
	beio->ds_trans_type = DEVSTAT_NO_DATA;
	DPRINTF("SYNC\n");
	be_lun->lun_flush(be_lun, beio);
}
1253
/*
 * beio continuation for multi-pass WRITE SAME: if the I/O was aborted
 * or has already failed, complete it; otherwise feed it back to
 * ctl_be_block_config_write() to process the remaining blocks.
 */
static void
ctl_be_block_cw_done_ws(struct ctl_be_block_io *beio)
{
	union ctl_io *io;

	io = beio->io;
	ctl_free_beio(beio);
	if ((io->io_hdr.flags & CTL_FLAG_ABORT) ||
	    ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_STATUS_NONE &&
	     (io->io_hdr.status & CTL_STATUS_MASK) != CTL_SUCCESS)) {
		ctl_config_write_done(io);
		return;
	}

	ctl_be_block_config_write(io);
}
1270
/*
 * Config-write handler for WRITE SAME (10/16).  With UNMAP or ANCHOR
 * set, the request is translated to a BIO_DELETE and handed to the
 * backend unmap method.  Otherwise a write buffer is built by
 * replicating the single supplied block (or zeroes for NDOB) and
 * dispatched; if the whole range does not fit in one pass, the request
 * is rescheduled through ctl_be_block_cw_done_ws().
 */
static void
ctl_be_block_cw_dispatch_ws(struct ctl_be_block_lun *be_lun,
			    union ctl_io *io)
{
	struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
	struct ctl_be_block_io *beio;
	struct ctl_lba_len_flags *lbalen;
	uint64_t len_left, lba;
	uint32_t pb, pbo, adj;
	int i, seglen;
	uint8_t *buf, *end;

	DPRINTF("entered\n");

	beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
	lbalen = ARGS(beio->io);

	/*
	 * Reject unsupported flag bits, and UNMAP/ANCHOR when the backend
	 * has no unmap method.
	 */
	if (lbalen->flags & ~(SWS_LBDATA | SWS_UNMAP | SWS_ANCHOR | SWS_NDOB) ||
	    (lbalen->flags & (SWS_UNMAP | SWS_ANCHOR) && be_lun->unmap == NULL)) {
		ctl_free_beio(beio);
		ctl_set_invalid_field(&io->scsiio,
				      /*sks_valid*/ 1,
				      /*command*/ 1,
				      /*field*/ 1,
				      /*bit_valid*/ 0,
				      /*bit*/ 0);
		ctl_config_write_done(io);
		return;
	}

	if (lbalen->flags & (SWS_UNMAP | SWS_ANCHOR)) {
		beio->io_offset = lbalen->lba * cbe_lun->blocksize;
		beio->io_len = (uint64_t)lbalen->len * cbe_lun->blocksize;
		beio->bio_cmd = BIO_DELETE;
		beio->ds_trans_type = DEVSTAT_FREE;

		be_lun->unmap(be_lun, beio);
		return;
	}

	beio->bio_cmd = BIO_WRITE;
	beio->ds_trans_type = DEVSTAT_WRITE;

	DPRINTF("WRITE SAME at LBA %jx len %u\n",
	       (uintmax_t)lbalen->lba, lbalen->len);

	/* Physical block size and starting offset, in bytes. */
	pb = cbe_lun->blocksize << be_lun->cbe_lun.pblockexp;
	if (be_lun->cbe_lun.pblockoff > 0)
		pbo = pb - cbe_lun->blocksize * be_lun->cbe_lun.pblockoff;
	else
		pbo = 0;
	len_left = (uint64_t)lbalen->len * cbe_lun->blocksize;
	for (i = 0, lba = 0; i < CTLBLK_MAX_SEGS && len_left > 0; i++) {

		/*
		 * Setup the S/G entry for this chunk.
		 */
		seglen = MIN(CTLBLK_MAX_SEG, len_left);
		if (pb > cbe_lun->blocksize) {
			/*
			 * Trim the segment so that, where possible,
			 * subsequent segments start on a physical block
			 * boundary; fall back to logical block alignment.
			 */
			adj = ((lbalen->lba + lba) * cbe_lun->blocksize +
			    seglen - pbo) % pb;
			if (seglen > adj)
				seglen -= adj;
			else
				seglen -= seglen % cbe_lun->blocksize;
		} else
			seglen -= seglen % cbe_lun->blocksize;
		beio->sg_segs[i].len = seglen;
		beio->sg_segs[i].addr = uma_zalloc(be_lun->lun_zone, M_WAITOK);

		DPRINTF("segment %d addr %p len %zd\n", i,
			beio->sg_segs[i].addr, beio->sg_segs[i].len);

		beio->num_segs++;
		len_left -= seglen;

		/*
		 * Fill the segment block by block: zeroes for NDOB,
		 * otherwise copies of the single block supplied by the
		 * initiator; SWS_LBDATA stamps each block with its LBA.
		 */
		buf = beio->sg_segs[i].addr;
		end = buf + seglen;
		for (; buf < end; buf += cbe_lun->blocksize) {
			if (lbalen->flags & SWS_NDOB) {
				memset(buf, 0, cbe_lun->blocksize);
			} else {
				memcpy(buf, io->scsiio.kern_data_ptr,
				    cbe_lun->blocksize);
			}
			if (lbalen->flags & SWS_LBDATA)
				scsi_ulto4b(lbalen->lba + lba, buf);
			lba++;
		}
	}

	beio->io_offset = lbalen->lba * cbe_lun->blocksize;
	beio->io_len = lba * cbe_lun->blocksize;

	/* We can not do all in one run. Correct and schedule rerun. */
	if (len_left > 0) {
		lbalen->lba += lba;
		lbalen->len -= lba;
		beio->beio_cont = ctl_be_block_cw_done_ws;
	}

	be_lun->dispatch(be_lun, beio);
}
1374
/*
 * Config-write handler for UNMAP: validate the flags and backend
 * support, then hand the request to the backend unmap method.
 * io_offset is set to -1 to tell the unmap method to walk the
 * descriptor list itself.
 */
static void
ctl_be_block_cw_dispatch_unmap(struct ctl_be_block_lun *be_lun,
			    union ctl_io *io)
{
	struct ctl_be_block_io *beio;
	struct ctl_ptr_len_flags *ptrlen;

	DPRINTF("entered\n");

	beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
	ptrlen = (struct ctl_ptr_len_flags *)&io->io_hdr.ctl_private[CTL_PRIV_LBA_LEN];

	if ((ptrlen->flags & ~SU_ANCHOR) != 0 || be_lun->unmap == NULL) {
		ctl_free_beio(beio);
		ctl_set_invalid_field(&io->scsiio,
				      /*sks_valid*/ 0,
				      /*command*/ 1,
				      /*field*/ 0,
				      /*bit_valid*/ 0,
				      /*bit*/ 0);
		ctl_config_write_done(io);
		return;
	}

	beio->io_len = 0;
	beio->io_offset = -1;
	beio->bio_cmd = BIO_DELETE;
	beio->ds_trans_type = DEVSTAT_FREE;
	DPRINTF("UNMAP\n");
	be_lun->unmap(be_lun, beio);
}
1406
1407static void
1408ctl_be_block_cr_done(struct ctl_be_block_io *beio)
1409{
1410	union ctl_io *io;
1411
1412	io = beio->io;
1413	ctl_free_beio(beio);
1414	ctl_config_read_done(io);
1415}
1416
/*
 * Entry point for config-read (non-data-out) SCSI commands: allocate a
 * beio and route by CDB opcode.  Currently only SERVICE ACTION IN
 * (GET LBA STATUS) is handled, via the backend's get_lba_status method
 * when one exists; otherwise the request completes immediately.
 */
static void
ctl_be_block_cr_dispatch(struct ctl_be_block_lun *be_lun,
			 union ctl_io *io)
{
	struct ctl_be_block_io *beio;
	struct ctl_be_block_softc *softc;

	DPRINTF("entered\n");

	softc = be_lun->softc;
	beio = ctl_alloc_beio(softc);
	beio->io = io;
	beio->lun = be_lun;
	beio->beio_cont = ctl_be_block_cr_done;
	PRIV(io)->ptr = (void *)beio;

	switch (io->scsiio.cdb[0]) {
	case SERVICE_ACTION_IN:		/* GET LBA STATUS */
		beio->bio_cmd = -1;
		beio->ds_trans_type = DEVSTAT_NO_DATA;
		beio->ds_tag_type = DEVSTAT_TAG_ORDERED;
		beio->io_len = 0;
		if (be_lun->get_lba_status)
			be_lun->get_lba_status(be_lun, beio);
		else
			ctl_be_block_cr_done(beio);
		break;
	default:
		panic("Unhandled CDB type %#x", io->scsiio.cdb[0]);
		break;
	}
}
1449
1450static void
1451ctl_be_block_cw_done(struct ctl_be_block_io *beio)
1452{
1453	union ctl_io *io;
1454
1455	io = beio->io;
1456	ctl_free_beio(beio);
1457	ctl_config_write_done(io);
1458}
1459
/*
 * Entry point for config-write (non-data) SCSI commands: allocate a
 * beio, record the devstat tag type matching the SCSI tag, and route
 * by CDB opcode to the SYNCHRONIZE CACHE, WRITE SAME or UNMAP handler.
 */
static void
ctl_be_block_cw_dispatch(struct ctl_be_block_lun *be_lun,
			 union ctl_io *io)
{
	struct ctl_be_block_io *beio;
	struct ctl_be_block_softc *softc;

	DPRINTF("entered\n");

	softc = be_lun->softc;
	beio = ctl_alloc_beio(softc);
	beio->io = io;
	beio->lun = be_lun;
	beio->beio_cont = ctl_be_block_cw_done;
	switch (io->scsiio.tag_type) {
	case CTL_TAG_ORDERED:
		beio->ds_tag_type = DEVSTAT_TAG_ORDERED;
		break;
	case CTL_TAG_HEAD_OF_QUEUE:
		beio->ds_tag_type = DEVSTAT_TAG_HEAD;
		break;
	case CTL_TAG_UNTAGGED:
	case CTL_TAG_SIMPLE:
	case CTL_TAG_ACA:
	default:
		beio->ds_tag_type = DEVSTAT_TAG_SIMPLE;
		break;
	}
	PRIV(io)->ptr = (void *)beio;

	switch (io->scsiio.cdb[0]) {
	case SYNCHRONIZE_CACHE:
	case SYNCHRONIZE_CACHE_16:
		ctl_be_block_cw_dispatch_sync(be_lun, io);
		break;
	case WRITE_SAME_10:
	case WRITE_SAME_16:
		ctl_be_block_cw_dispatch_ws(be_lun, io);
		break;
	case UNMAP:
		ctl_be_block_cw_dispatch_unmap(be_lun, io);
		break;
	default:
		panic("Unhandled CDB type %#x", io->scsiio.cdb[0]);
		break;
	}
}
1507
/* DTrace probes marking the start and allocation completion of reads/writes. */
SDT_PROBE_DEFINE1(cbb, , read, start, "uint64_t");
SDT_PROBE_DEFINE1(cbb, , write, start, "uint64_t");
SDT_PROBE_DEFINE1(cbb, , read, alloc_done, "uint64_t");
SDT_PROBE_DEFINE1(cbb, , write, alloc_done, "uint64_t");
1512
/*
 * beio continuation for READ/WRITE requests larger than one chunk:
 * unless the I/O was aborted or failed, clear its status and requeue
 * it on the input queue so the worker dispatches the next chunk.
 */
static void
ctl_be_block_next(struct ctl_be_block_io *beio)
{
	struct ctl_be_block_lun *be_lun;
	union ctl_io *io;

	io = beio->io;
	be_lun = beio->lun;
	ctl_free_beio(beio);
	if ((io->io_hdr.flags & CTL_FLAG_ABORT) ||
	    ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_STATUS_NONE &&
	     (io->io_hdr.status & CTL_STATUS_MASK) != CTL_SUCCESS)) {
		ctl_data_submit_done(io);
		return;
	}

	io->io_hdr.status &= ~CTL_STATUS_MASK;
	io->io_hdr.status |= CTL_STATUS_NONE;

	mtx_lock(&be_lun->queue_lock);
	STAILQ_INSERT_TAIL(&be_lun->input_queue, &io->io_hdr, links);
	mtx_unlock(&be_lun->queue_lock);
	taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task);
}
1537
/*
 * Handler for READ/WRITE/COMPARE data requests: allocate a beio and S/G
 * segments covering up to one maximal chunk of the request, then either
 * dispatch the backend read or start datamove from the initiator for
 * writes.  Requests larger than one chunk are continued through
 * ctl_be_block_next(); PRIV(io)->len tracks the number of LBAs already
 * scheduled.
 */
static void
ctl_be_block_dispatch(struct ctl_be_block_lun *be_lun,
			   union ctl_io *io)
{
	struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
	struct ctl_be_block_io *beio;
	struct ctl_be_block_softc *softc;
	struct ctl_lba_len_flags *lbalen;
	struct ctl_ptr_len_flags *bptrlen;
	uint64_t len_left, lbas;
	int i;

	softc = be_lun->softc;

	DPRINTF("entered\n");

	lbalen = ARGS(io);
	if (lbalen->flags & CTL_LLF_WRITE) {
		SDT_PROBE0(cbb, , write, start);
	} else {
		SDT_PROBE0(cbb, , read, start);
	}

	beio = ctl_alloc_beio(softc);
	beio->io = io;
	beio->lun = be_lun;
	bptrlen = PRIV(io);
	bptrlen->ptr = (void *)beio;

	/* Map the SCSI tag type to the matching devstat tag type. */
	switch (io->scsiio.tag_type) {
	case CTL_TAG_ORDERED:
		beio->ds_tag_type = DEVSTAT_TAG_ORDERED;
		break;
	case CTL_TAG_HEAD_OF_QUEUE:
		beio->ds_tag_type = DEVSTAT_TAG_HEAD;
		break;
	case CTL_TAG_UNTAGGED:
	case CTL_TAG_SIMPLE:
	case CTL_TAG_ACA:
	default:
		beio->ds_tag_type = DEVSTAT_TAG_SIMPLE;
		break;
	}

	if (lbalen->flags & CTL_LLF_WRITE) {
		beio->bio_cmd = BIO_WRITE;
		beio->ds_trans_type = DEVSTAT_WRITE;
	} else {
		beio->bio_cmd = BIO_READ;
		beio->ds_trans_type = DEVSTAT_READ;
	}

	DPRINTF("%s at LBA %jx len %u @%ju\n",
	       (beio->bio_cmd == BIO_READ) ? "READ" : "WRITE",
	       (uintmax_t)lbalen->lba, lbalen->len, bptrlen->len);
	/*
	 * COMPARE uses half the I/O size per pass since it needs a second
	 * set of segments for the data read from the media.
	 */
	if (lbalen->flags & CTL_LLF_COMPARE)
		lbas = CTLBLK_HALF_IO_SIZE;
	else
		lbas = CTLBLK_MAX_IO_SIZE;
	lbas = MIN(lbalen->len - bptrlen->len, lbas / cbe_lun->blocksize);
	beio->io_offset = (lbalen->lba + bptrlen->len) * cbe_lun->blocksize;
	beio->io_len = lbas * cbe_lun->blocksize;
	bptrlen->len += lbas;

	for (i = 0, len_left = beio->io_len; len_left > 0; i++) {
		KASSERT(i < CTLBLK_MAX_SEGS, ("Too many segs (%d >= %d)",
		    i, CTLBLK_MAX_SEGS));

		/*
		 * Setup the S/G entry for this chunk.
		 */
		beio->sg_segs[i].len = min(CTLBLK_MAX_SEG, len_left);
		beio->sg_segs[i].addr = uma_zalloc(be_lun->lun_zone, M_WAITOK);

		DPRINTF("segment %d addr %p len %zd\n", i,
			beio->sg_segs[i].addr, beio->sg_segs[i].len);

		/* Set up second segment for compare operation. */
		if (lbalen->flags & CTL_LLF_COMPARE) {
			beio->sg_segs[i + CTLBLK_HALF_SEGS].len =
			    beio->sg_segs[i].len;
			beio->sg_segs[i + CTLBLK_HALF_SEGS].addr =
			    uma_zalloc(be_lun->lun_zone, M_WAITOK);
		}

		beio->num_segs++;
		len_left -= beio->sg_segs[i].len;
	}
	if (bptrlen->len < lbalen->len)
		beio->beio_cont = ctl_be_block_next;
	io->scsiio.be_move_done = ctl_be_block_move_done;
	/* For compare we have separate S/G lists for read and datamove. */
	if (lbalen->flags & CTL_LLF_COMPARE)
		io->scsiio.kern_data_ptr = (uint8_t *)&beio->sg_segs[CTLBLK_HALF_SEGS];
	else
		io->scsiio.kern_data_ptr = (uint8_t *)beio->sg_segs;
	io->scsiio.kern_data_len = beio->io_len;
	io->scsiio.kern_sg_entries = beio->num_segs;
	io->io_hdr.flags |= CTL_FLAG_ALLOCATED;

	/*
	 * For the read case, we need to read the data into our buffers and
	 * then we can send it back to the user.  For the write case, we
	 * need to get the data from the user first.
	 */
	if (beio->bio_cmd == BIO_READ) {
		SDT_PROBE0(cbb, , read, alloc_done);
		be_lun->dispatch(be_lun, beio);
	} else {
		SDT_PROBE0(cbb, , write, alloc_done);
#ifdef CTL_TIME_IO
		getbinuptime(&io->io_hdr.dma_start_bt);
#endif
		ctl_datamove(io);
	}
}
1654
/*
 * Per-LUN worker task: drains the datamove, config-write, config-read
 * and input queues (in that priority order), dispatching each I/O with
 * the queue lock dropped.
 */
static void
ctl_be_block_worker(void *context, int pending)
{
	struct ctl_be_block_lun *be_lun = (struct ctl_be_block_lun *)context;
	struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
	union ctl_io *io;
	struct ctl_be_block_io *beio;

	DPRINTF("entered\n");
	/*
	 * Fetch and process I/Os from all queues.  If we detect LUN
	 * CTL_LUN_FLAG_NO_MEDIA status here -- it is result of a race,
	 * so make response maximally opaque to not confuse initiator.
	 */
	for (;;) {
		mtx_lock(&be_lun->queue_lock);
		io = (union ctl_io *)STAILQ_FIRST(&be_lun->datamove_queue);
		if (io != NULL) {
			DPRINTF("datamove queue\n");
			STAILQ_REMOVE(&be_lun->datamove_queue, &io->io_hdr,
				      ctl_io_hdr, links);
			mtx_unlock(&be_lun->queue_lock);
			beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
			if (cbe_lun->flags & CTL_LUN_FLAG_NO_MEDIA) {
				ctl_set_busy(&io->scsiio);
				ctl_complete_beio(beio);
				return;
			}
			be_lun->dispatch(be_lun, beio);
			continue;
		}
		io = (union ctl_io *)STAILQ_FIRST(&be_lun->config_write_queue);
		if (io != NULL) {
			DPRINTF("config write queue\n");
			STAILQ_REMOVE(&be_lun->config_write_queue, &io->io_hdr,
				      ctl_io_hdr, links);
			mtx_unlock(&be_lun->queue_lock);
			if (cbe_lun->flags & CTL_LUN_FLAG_NO_MEDIA) {
				ctl_set_busy(&io->scsiio);
				ctl_config_write_done(io);
				return;
			}
			ctl_be_block_cw_dispatch(be_lun, io);
			continue;
		}
		io = (union ctl_io *)STAILQ_FIRST(&be_lun->config_read_queue);
		if (io != NULL) {
			DPRINTF("config read queue\n");
			STAILQ_REMOVE(&be_lun->config_read_queue, &io->io_hdr,
				      ctl_io_hdr, links);
			mtx_unlock(&be_lun->queue_lock);
			if (cbe_lun->flags & CTL_LUN_FLAG_NO_MEDIA) {
				ctl_set_busy(&io->scsiio);
				ctl_config_read_done(io);
				return;
			}
			ctl_be_block_cr_dispatch(be_lun, io);
			continue;
		}
		io = (union ctl_io *)STAILQ_FIRST(&be_lun->input_queue);
		if (io != NULL) {
			DPRINTF("input queue\n");
			STAILQ_REMOVE(&be_lun->input_queue, &io->io_hdr,
				      ctl_io_hdr, links);
			mtx_unlock(&be_lun->queue_lock);
			if (cbe_lun->flags & CTL_LUN_FLAG_NO_MEDIA) {
				ctl_set_busy(&io->scsiio);
				ctl_data_submit_done(io);
				return;
			}
			ctl_be_block_dispatch(be_lun, io);
			continue;
		}

		/*
		 * If we get here, there is no work left in the queues, so
		 * just break out and let the task queue go to sleep.
		 */
		mtx_unlock(&be_lun->queue_lock);
		break;
	}
}
1737
1738/*
1739 * Entry point from CTL to the backend for I/O.  We queue everything to a
1740 * work thread, so this just puts the I/O on a queue and wakes up the
1741 * thread.
1742 */
static int
ctl_be_block_submit(union ctl_io *io)
{
	struct ctl_be_block_lun *be_lun;
	struct ctl_be_lun *cbe_lun;

	DPRINTF("entered\n");

	cbe_lun = CTL_BACKEND_LUN(io);
	be_lun = (struct ctl_be_block_lun *)cbe_lun->be_lun;

	/*
	 * Make sure we only get SCSI I/O.
	 */
	KASSERT(io->io_hdr.io_type == CTL_IO_SCSI, ("Non-SCSI I/O (type "
		"%#x) encountered", io->io_hdr.io_type));

	/* Progress counter (LBAs scheduled) used by ctl_be_block_dispatch(). */
	PRIV(io)->len = 0;

	mtx_lock(&be_lun->queue_lock);
	STAILQ_INSERT_TAIL(&be_lun->input_queue, &io->io_hdr, links);
	mtx_unlock(&be_lun->queue_lock);
	taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task);

	return (CTL_RETVAL_COMPLETE);
}
1769
/*
 * Character-device ioctl entry point for the block backend: handles
 * CTL_LUN_REQ requests from userland, dispatching create/remove/modify
 * to the corresponding backend routine.
 */
static int
ctl_be_block_ioctl(struct cdev *dev, u_long cmd, caddr_t addr,
			int flag, struct thread *td)
{
	struct ctl_be_block_softc *softc;
	int error;

	softc = &backend_block_softc;

	error = 0;

	switch (cmd) {
	case CTL_LUN_REQ: {
		struct ctl_lun_req *lun_req;

		lun_req = (struct ctl_lun_req *)addr;

		switch (lun_req->reqtype) {
		case CTL_LUNREQ_CREATE:
			error = ctl_be_block_create(softc, lun_req);
			break;
		case CTL_LUNREQ_RM:
			error = ctl_be_block_rm(softc, lun_req);
			break;
		case CTL_LUNREQ_MODIFY:
			error = ctl_be_block_modify(softc, lun_req);
			break;
		default:
			lun_req->status = CTL_LUN_ERROR;
			snprintf(lun_req->error_str, sizeof(lun_req->error_str),
				 "invalid LUN request type %d",
				 lun_req->reqtype);
			break;
		}
		break;
	}
	default:
		error = ENOTTY;
		break;
	}

	return (error);
}
1813
/*
 * Prepare a LUN backed by a regular file: install the file-based
 * dispatch methods, size the LUN from the vnode attributes (or the
 * requested size), choose a logical block size, and derive physical
 * and unmap ("ublocksize"/"ublockoffset") geometry hints from the
 * vnode's preferred block size and/or LUN options.
 */
static int
ctl_be_block_open_file(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req)
{
	struct ctl_be_lun *cbe_lun;
	struct ctl_be_block_filedata *file_data;
	struct ctl_lun_create_params *params;
	char			     *value;
	struct vattr		      vattr;
	off_t			      ps, pss, po, pos, us, uss, uo, uos;
	int			      error;

	cbe_lun = &be_lun->cbe_lun;
	file_data = &be_lun->backend.file;
	params = &be_lun->params;

	be_lun->dev_type = CTL_BE_BLOCK_FILE;
	be_lun->dispatch = ctl_be_block_dispatch_file;
	be_lun->lun_flush = ctl_be_block_flush_file;
	be_lun->get_lba_status = ctl_be_block_gls_file;
	be_lun->getattr = ctl_be_block_getattr_file;
	/* File-backed LUNs do not support UNMAP. */
	be_lun->unmap = NULL;
	cbe_lun->flags &= ~CTL_LUN_FLAG_UNMAP;

	error = VOP_GETATTR(be_lun->vn, &vattr, curthread->td_ucred);
	if (error != 0) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "error calling VOP_GETATTR() for file %s",
			 be_lun->dev_path);
		return (error);
	}

	file_data->cred = crhold(curthread->td_ucred);
	if (params->lun_size_bytes != 0)
		be_lun->size_bytes = params->lun_size_bytes;
	else
		be_lun->size_bytes = vattr.va_size;

	/*
	 * For files we can use any logical block size.  Prefer 512 bytes
	 * for compatibility reasons.  If file's vattr.va_blocksize
	 * (preferred I/O block size) is bigger and multiple to chosen
	 * logical block size -- report it as physical block size.
	 */
	if (params->blocksize_bytes != 0)
		cbe_lun->blocksize = params->blocksize_bytes;
	else if (cbe_lun->lun_type == T_CDROM)
		cbe_lun->blocksize = 2048;
	else
		cbe_lun->blocksize = 512;
	be_lun->size_blocks = be_lun->size_bytes / cbe_lun->blocksize;
	cbe_lun->maxlba = (be_lun->size_blocks == 0) ?
	    0 : (be_lun->size_blocks - 1);

	us = ps = vattr.va_blocksize;
	uo = po = 0;

	value = ctl_get_opt(&cbe_lun->options, "pblocksize");
	if (value != NULL)
		ctl_expand_number(value, &ps);
	value = ctl_get_opt(&cbe_lun->options, "pblockoffset");
	if (value != NULL)
		ctl_expand_number(value, &po);
	pss = ps / cbe_lun->blocksize;
	pos = po / cbe_lun->blocksize;
	/*
	 * Accept the physical geometry only if size/offset are whole
	 * multiples of the blocksize and the size is a power of two.
	 */
	if ((pss > 0) && (pss * cbe_lun->blocksize == ps) && (pss >= pos) &&
	    ((pss & (pss - 1)) == 0) && (pos * cbe_lun->blocksize == po)) {
		cbe_lun->pblockexp = fls(pss) - 1;
		cbe_lun->pblockoff = (pss - pos) % pss;
	}

	value = ctl_get_opt(&cbe_lun->options, "ublocksize");
	if (value != NULL)
		ctl_expand_number(value, &us);
	value = ctl_get_opt(&cbe_lun->options, "ublockoffset");
	if (value != NULL)
		ctl_expand_number(value, &uo);
	uss = us / cbe_lun->blocksize;
	uos = uo / cbe_lun->blocksize;
	/* Same validity checks as for the physical geometry above. */
	if ((uss > 0) && (uss * cbe_lun->blocksize == us) && (uss >= uos) &&
	    ((uss & (uss - 1)) == 0) && (uos * cbe_lun->blocksize == uo)) {
		cbe_lun->ublockexp = fls(uss) - 1;
		cbe_lun->ublockoff = (uss - uos) % uss;
	}

	/*
	 * Sanity check.  The media size has to be at least one
	 * sector long.
	 */
	if (be_lun->size_bytes < cbe_lun->blocksize) {
		error = EINVAL;
		snprintf(req->error_str, sizeof(req->error_str),
			 "file %s size %ju < block size %u", be_lun->dev_path,
			 (uintmax_t)be_lun->size_bytes, cbe_lun->blocksize);
	}

	cbe_lun->opttxferlen = CTLBLK_MAX_IO_SIZE / cbe_lun->blocksize;
	return (error);
}
1912
/*
 * Prepare a LUN backed by a character (GEOM) device: install the
 * device-based dispatch methods (special-casing zvols), validate or
 * derive the logical block size against DIOCGSECTORSIZE, size the LUN
 * from DIOCGMEDIASIZE, probe stripe geometry, and determine UNMAP
 * support (GEOM::candelete, overridable by the "unmap" option).
 */
static int
ctl_be_block_open_dev(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req)
{
	struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
	struct ctl_lun_create_params *params;
	struct cdevsw		     *csw;
	struct cdev		     *dev;
	char			     *value;
	int			      error, atomic, maxio, ref, unmap, tmp;
	off_t			      ps, pss, po, pos, us, uss, uo, uos, otmp;

	params = &be_lun->params;

	be_lun->dev_type = CTL_BE_BLOCK_DEV;
	csw = devvn_refthread(be_lun->vn, &dev, &ref);
	if (csw == NULL)
		return (ENXIO);
	if (strcmp(csw->d_name, "zvol") == 0) {
		/* ZVOLs get their own dispatch path and GET LBA STATUS. */
		be_lun->dispatch = ctl_be_block_dispatch_zvol;
		be_lun->get_lba_status = ctl_be_block_gls_zvol;
		atomic = maxio = CTLBLK_MAX_IO_SIZE;
	} else {
		be_lun->dispatch = ctl_be_block_dispatch_dev;
		be_lun->get_lba_status = NULL;
		atomic = 0;
		maxio = dev->si_iosize_max;
		if (maxio <= 0)
			maxio = DFLTPHYS;
		if (maxio > CTLBLK_MAX_IO_SIZE)
			maxio = CTLBLK_MAX_IO_SIZE;
	}
	be_lun->lun_flush = ctl_be_block_flush_dev;
	be_lun->getattr = ctl_be_block_getattr_dev;
	be_lun->unmap = ctl_be_block_unmap_dev;

	if (!csw->d_ioctl) {
		dev_relthread(dev, ref);
		snprintf(req->error_str, sizeof(req->error_str),
			 "no d_ioctl for device %s!", be_lun->dev_path);
		return (ENODEV);
	}

	error = csw->d_ioctl(dev, DIOCGSECTORSIZE, (caddr_t)&tmp, FREAD,
			       curthread);
	if (error) {
		dev_relthread(dev, ref);
		snprintf(req->error_str, sizeof(req->error_str),
			 "error %d returned for DIOCGSECTORSIZE ioctl "
			 "on %s!", error, be_lun->dev_path);
		return (error);
	}

	/*
	 * If the user has asked for a blocksize that is greater than the
	 * backing device's blocksize, we can do it only if the blocksize
	 * the user is asking for is an even multiple of the underlying
	 * device's blocksize.
	 */
	if ((params->blocksize_bytes != 0) &&
	    (params->blocksize_bytes >= tmp)) {
		if (params->blocksize_bytes % tmp == 0) {
			cbe_lun->blocksize = params->blocksize_bytes;
		} else {
			dev_relthread(dev, ref);
			snprintf(req->error_str, sizeof(req->error_str),
				 "requested blocksize %u is not an even "
				 "multiple of backing device blocksize %u",
				 params->blocksize_bytes, tmp);
			return (EINVAL);
		}
	} else if (params->blocksize_bytes != 0) {
		dev_relthread(dev, ref);
		snprintf(req->error_str, sizeof(req->error_str),
			 "requested blocksize %u < backing device "
			 "blocksize %u", params->blocksize_bytes, tmp);
		return (EINVAL);
	} else if (cbe_lun->lun_type == T_CDROM)
		cbe_lun->blocksize = MAX(tmp, 2048);
	else
		cbe_lun->blocksize = tmp;

	error = csw->d_ioctl(dev, DIOCGMEDIASIZE, (caddr_t)&otmp, FREAD,
			     curthread);
	if (error) {
		dev_relthread(dev, ref);
		snprintf(req->error_str, sizeof(req->error_str),
			 "error %d returned for DIOCGMEDIASIZE "
			 " ioctl on %s!", error,
			 be_lun->dev_path);
		return (error);
	}

	if (params->lun_size_bytes != 0) {
		if (params->lun_size_bytes > otmp) {
			dev_relthread(dev, ref);
			snprintf(req->error_str, sizeof(req->error_str),
				 "requested LUN size %ju > backing device "
				 "size %ju",
				 (uintmax_t)params->lun_size_bytes,
				 (uintmax_t)otmp);
			return (EINVAL);
		}

		be_lun->size_bytes = params->lun_size_bytes;
	} else
		be_lun->size_bytes = otmp;
	be_lun->size_blocks = be_lun->size_bytes / cbe_lun->blocksize;
	cbe_lun->maxlba = (be_lun->size_blocks == 0) ?
	    0 : (be_lun->size_blocks - 1);

	/* Stripe geometry; failures just mean no geometry hints. */
	error = csw->d_ioctl(dev, DIOCGSTRIPESIZE, (caddr_t)&ps, FREAD,
	    curthread);
	if (error)
		ps = po = 0;
	else {
		error = csw->d_ioctl(dev, DIOCGSTRIPEOFFSET, (caddr_t)&po,
		    FREAD, curthread);
		if (error)
			po = 0;
	}
	us = ps;
	uo = po;

	value = ctl_get_opt(&cbe_lun->options, "pblocksize");
	if (value != NULL)
		ctl_expand_number(value, &ps);
	value = ctl_get_opt(&cbe_lun->options, "pblockoffset");
	if (value != NULL)
		ctl_expand_number(value, &po);
	pss = ps / cbe_lun->blocksize;
	pos = po / cbe_lun->blocksize;
	/*
	 * Accept the physical geometry only if size/offset are whole
	 * multiples of the blocksize and the size is a power of two.
	 */
	if ((pss > 0) && (pss * cbe_lun->blocksize == ps) && (pss >= pos) &&
	    ((pss & (pss - 1)) == 0) && (pos * cbe_lun->blocksize == po)) {
		cbe_lun->pblockexp = fls(pss) - 1;
		cbe_lun->pblockoff = (pss - pos) % pss;
	}

	value = ctl_get_opt(&cbe_lun->options, "ublocksize");
	if (value != NULL)
		ctl_expand_number(value, &us);
	value = ctl_get_opt(&cbe_lun->options, "ublockoffset");
	if (value != NULL)
		ctl_expand_number(value, &uo);
	uss = us / cbe_lun->blocksize;
	uos = uo / cbe_lun->blocksize;
	/* Same validity checks as for the physical geometry above. */
	if ((uss > 0) && (uss * cbe_lun->blocksize == us) && (uss >= uos) &&
	    ((uss & (uss - 1)) == 0) && (uos * cbe_lun->blocksize == uo)) {
		cbe_lun->ublockexp = fls(uss) - 1;
		cbe_lun->ublockoff = (uss - uos) % uss;
	}

	cbe_lun->atomicblock = atomic / cbe_lun->blocksize;
	cbe_lun->opttxferlen = maxio / cbe_lun->blocksize;

	if (be_lun->dispatch == ctl_be_block_dispatch_zvol) {
		unmap = 1;
	} else {
		struct diocgattr_arg	arg;

		strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name));
		arg.len = sizeof(arg.value.i);
		error = csw->d_ioctl(dev, DIOCGATTR, (caddr_t)&arg, FREAD,
		    curthread);
		unmap = (error == 0) ? arg.value.i : 0;
	}
	/* The "unmap" LUN option overrides the probed capability. */
	value = ctl_get_opt(&cbe_lun->options, "unmap");
	if (value != NULL)
		unmap = (strcmp(value, "on") == 0);
	if (unmap)
		cbe_lun->flags |= CTL_LUN_FLAG_UNMAP;
	else
		cbe_lun->flags &= ~CTL_LUN_FLAG_UNMAP;

	dev_relthread(dev, ref);
	return (0);
}
2089
2090static int
2091ctl_be_block_close(struct ctl_be_block_lun *be_lun)
2092{
2093	struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
2094	int flags;
2095
2096	if (be_lun->vn) {
2097		flags = FREAD;
2098		if ((cbe_lun->flags & CTL_LUN_FLAG_READONLY) == 0)
2099			flags |= FWRITE;
2100		(void)vn_close(be_lun->vn, flags, NOCRED, curthread);
2101		be_lun->vn = NULL;
2102
2103		switch (be_lun->dev_type) {
2104		case CTL_BE_BLOCK_DEV:
2105			break;
2106		case CTL_BE_BLOCK_FILE:
2107			if (be_lun->backend.file.cred != NULL) {
2108				crfree(be_lun->backend.file.cred);
2109				be_lun->backend.file.cred = NULL;
2110			}
2111			break;
2112		case CTL_BE_BLOCK_NONE:
2113			break;
2114		default:
2115			panic("Unexpected backend type %d", be_lun->dev_type);
2116			break;
2117		}
2118		be_lun->dev_type = CTL_BE_BLOCK_NONE;
2119	}
2120	return (0);
2121}
2122
/*
 * Open the backing store named by the "file" LUN option and attach it to
 * the LUN.  The backing store may be a disk device or a regular file;
 * the appropriate open helper is chosen after the vnode is looked up.
 * On failure an error message is placed in req->error_str and a non-zero
 * value is returned; req is only ever written to, never read.
 */
static int
ctl_be_block_open(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req)
{
	struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
	struct nameidata nd;
	char		*value;
	int		 error, flags;

	error = 0;
	if (rootvnode == NULL) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "Root filesystem is not mounted");
		return (1);
	}
	/*
	 * This runs from a kernel thread whose filedesc may lack cwd/root/
	 * jail directories; point any missing ones at the root vnode so the
	 * namei() lookup below has a starting point.
	 */
	if (!curthread->td_proc->p_fd->fd_cdir) {
		curthread->td_proc->p_fd->fd_cdir = rootvnode;
		VREF(rootvnode);
	}
	if (!curthread->td_proc->p_fd->fd_rdir) {
		curthread->td_proc->p_fd->fd_rdir = rootvnode;
		VREF(rootvnode);
	}
	if (!curthread->td_proc->p_fd->fd_jdir) {
		curthread->td_proc->p_fd->fd_jdir = rootvnode;
		VREF(rootvnode);
	}

	value = ctl_get_opt(&cbe_lun->options, "file");
	if (value == NULL) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "no file argument specified");
		return (1);
	}
	/* Replace any previously recorded path with the requested one. */
	free(be_lun->dev_path, M_CTLBLK);
	be_lun->dev_path = strdup(value, M_CTLBLK);

	/* Request write access unless "readonly" is on (T_DIRECT defaults
	 * to read-write; other LUN types default to read-only). */
	flags = FREAD;
	value = ctl_get_opt(&cbe_lun->options, "readonly");
	if (value != NULL) {
		if (strcmp(value, "on") != 0)
			flags |= FWRITE;
	} else if (cbe_lun->lun_type == T_DIRECT)
		flags |= FWRITE;

again:
	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, be_lun->dev_path, curthread);
	error = vn_open(&nd, &flags, 0, NULL);
	if ((error == EROFS || error == EACCES) && (flags & FWRITE)) {
		/* Backing store refuses writes: retry read-only. */
		flags &= ~FWRITE;
		goto again;
	}
	if (error) {
		/*
		 * This is the only reasonable guess we can make as far as
		 * path if the user doesn't give us a fully qualified path.
		 * If they want to specify a file, they need to specify the
		 * full path.
		 */
		if (be_lun->dev_path[0] != '/') {
			char *dev_name;

			/* Retry once with a /dev/ prefix; the new path is
			 * absolute, so a second failure falls through. */
			asprintf(&dev_name, M_CTLBLK, "/dev/%s",
				be_lun->dev_path);
			free(be_lun->dev_path, M_CTLBLK);
			be_lun->dev_path = dev_name;
			goto again;
		}
		snprintf(req->error_str, sizeof(req->error_str),
		    "error opening %s: %d", be_lun->dev_path, error);
		return (error);
	}
	/* Record the access mode actually granted. */
	if (flags & FWRITE)
		cbe_lun->flags &= ~CTL_LUN_FLAG_READONLY;
	else
		cbe_lun->flags |= CTL_LUN_FLAG_READONLY;

	NDFREE(&nd, NDF_ONLY_PNBUF);
	be_lun->vn = nd.ni_vp;

	/* We only support disks and files. */
	if (vn_isdisk(be_lun->vn, &error)) {
		error = ctl_be_block_open_dev(be_lun, req);
	} else if (be_lun->vn->v_type == VREG) {
		error = ctl_be_block_open_file(be_lun, req);
	} else {
		error = EINVAL;
		snprintf(req->error_str, sizeof(req->error_str),
			 "%s is not a disk or plain file", be_lun->dev_path);
	}
	/* vn_open() returned the vnode locked; drop the lock now. */
	VOP_UNLOCK(be_lun->vn, 0);

	if (error != 0)
		ctl_be_block_close(be_lun);
	/*
	 * Default write-serialization policy: devices handle ordering
	 * themselves; files/zvols need reads serialized behind writes.
	 * The "serseq" option overrides either default.
	 */
	cbe_lun->serseq = CTL_LUN_SERSEQ_OFF;
	if (be_lun->dispatch != ctl_be_block_dispatch_dev)
		cbe_lun->serseq = CTL_LUN_SERSEQ_READ;
	value = ctl_get_opt(&cbe_lun->options, "serseq");
	if (value != NULL && strcmp(value, "on") == 0)
		cbe_lun->serseq = CTL_LUN_SERSEQ_ON;
	else if (value != NULL && strcmp(value, "read") == 0)
		cbe_lun->serseq = CTL_LUN_SERSEQ_READ;
	else if (value != NULL && strcmp(value, "off") == 0)
		cbe_lun->serseq = CTL_LUN_SERSEQ_OFF;
	return (0);
}
2228
2229static int
2230ctl_be_block_create(struct ctl_be_block_softc *softc, struct ctl_lun_req *req)
2231{
2232	struct ctl_be_lun *cbe_lun;
2233	struct ctl_be_block_lun *be_lun;
2234	struct ctl_lun_create_params *params;
2235	char num_thread_str[16];
2236	char tmpstr[32];
2237	char *value;
2238	int retval, num_threads;
2239	int tmp_num_threads;
2240
2241	params = &req->reqdata.create;
2242	retval = 0;
2243	req->status = CTL_LUN_OK;
2244
2245	be_lun = malloc(sizeof(*be_lun), M_CTLBLK, M_ZERO | M_WAITOK);
2246	cbe_lun = &be_lun->cbe_lun;
2247	cbe_lun->be_lun = be_lun;
2248	be_lun->params = req->reqdata.create;
2249	be_lun->softc = softc;
2250	STAILQ_INIT(&be_lun->input_queue);
2251	STAILQ_INIT(&be_lun->config_read_queue);
2252	STAILQ_INIT(&be_lun->config_write_queue);
2253	STAILQ_INIT(&be_lun->datamove_queue);
2254	sprintf(be_lun->lunname, "cblk%d", softc->num_luns);
2255	mtx_init(&be_lun->io_lock, "cblk io lock", NULL, MTX_DEF);
2256	mtx_init(&be_lun->queue_lock, "cblk queue lock", NULL, MTX_DEF);
2257	ctl_init_opts(&cbe_lun->options,
2258	    req->num_be_args, req->kern_be_args);
2259	be_lun->lun_zone = uma_zcreate(be_lun->lunname, CTLBLK_MAX_SEG,
2260	    NULL, NULL, NULL, NULL, /*align*/ 0, /*flags*/0);
2261	if (be_lun->lun_zone == NULL) {
2262		snprintf(req->error_str, sizeof(req->error_str),
2263			 "error allocating UMA zone");
2264		goto bailout_error;
2265	}
2266
2267	if (params->flags & CTL_LUN_FLAG_DEV_TYPE)
2268		cbe_lun->lun_type = params->device_type;
2269	else
2270		cbe_lun->lun_type = T_DIRECT;
2271	be_lun->flags = CTL_BE_BLOCK_LUN_UNCONFIGURED;
2272	cbe_lun->flags = 0;
2273	value = ctl_get_opt(&cbe_lun->options, "ha_role");
2274	if (value != NULL) {
2275		if (strcmp(value, "primary") == 0)
2276			cbe_lun->flags |= CTL_LUN_FLAG_PRIMARY;
2277	} else if (control_softc->flags & CTL_FLAG_ACTIVE_SHELF)
2278		cbe_lun->flags |= CTL_LUN_FLAG_PRIMARY;
2279
2280	if (cbe_lun->lun_type == T_DIRECT ||
2281	    cbe_lun->lun_type == T_CDROM) {
2282		be_lun->size_bytes = params->lun_size_bytes;
2283		if (params->blocksize_bytes != 0)
2284			cbe_lun->blocksize = params->blocksize_bytes;
2285		else if (cbe_lun->lun_type == T_CDROM)
2286			cbe_lun->blocksize = 2048;
2287		else
2288			cbe_lun->blocksize = 512;
2289		be_lun->size_blocks = be_lun->size_bytes / cbe_lun->blocksize;
2290		cbe_lun->maxlba = (be_lun->size_blocks == 0) ?
2291		    0 : (be_lun->size_blocks - 1);
2292
2293		if ((cbe_lun->flags & CTL_LUN_FLAG_PRIMARY) ||
2294		    control_softc->ha_mode == CTL_HA_MODE_SER_ONLY) {
2295			retval = ctl_be_block_open(be_lun, req);
2296			if (retval != 0) {
2297				retval = 0;
2298				req->status = CTL_LUN_WARNING;
2299			}
2300		}
2301		num_threads = cbb_num_threads;
2302	} else {
2303		num_threads = 1;
2304	}
2305
2306	value = ctl_get_opt(&cbe_lun->options, "num_threads");
2307	if (value != NULL) {
2308		tmp_num_threads = strtol(value, NULL, 0);
2309
2310		/*
2311		 * We don't let the user specify less than one
2312		 * thread, but hope he's clueful enough not to
2313		 * specify 1000 threads.
2314		 */
2315		if (tmp_num_threads < 1) {
2316			snprintf(req->error_str, sizeof(req->error_str),
2317				 "invalid number of threads %s",
2318				 num_thread_str);
2319			goto bailout_error;
2320		}
2321		num_threads = tmp_num_threads;
2322	}
2323
2324	if (be_lun->vn == NULL)
2325		cbe_lun->flags |= CTL_LUN_FLAG_NO_MEDIA;
2326	/* Tell the user the blocksize we ended up using */
2327	params->lun_size_bytes = be_lun->size_bytes;
2328	params->blocksize_bytes = cbe_lun->blocksize;
2329	if (params->flags & CTL_LUN_FLAG_ID_REQ) {
2330		cbe_lun->req_lun_id = params->req_lun_id;
2331		cbe_lun->flags |= CTL_LUN_FLAG_ID_REQ;
2332	} else
2333		cbe_lun->req_lun_id = 0;
2334
2335	cbe_lun->lun_shutdown = ctl_be_block_lun_shutdown;
2336	cbe_lun->lun_config_status = ctl_be_block_lun_config_status;
2337	cbe_lun->be = &ctl_be_block_driver;
2338
2339	if ((params->flags & CTL_LUN_FLAG_SERIAL_NUM) == 0) {
2340		snprintf(tmpstr, sizeof(tmpstr), "MYSERIAL%4d",
2341			 softc->num_luns);
2342		strncpy((char *)cbe_lun->serial_num, tmpstr,
2343			MIN(sizeof(cbe_lun->serial_num), sizeof(tmpstr)));
2344
2345		/* Tell the user what we used for a serial number */
2346		strncpy((char *)params->serial_num, tmpstr,
2347			MIN(sizeof(params->serial_num), sizeof(tmpstr)));
2348	} else {
2349		strncpy((char *)cbe_lun->serial_num, params->serial_num,
2350			MIN(sizeof(cbe_lun->serial_num),
2351			sizeof(params->serial_num)));
2352	}
2353	if ((params->flags & CTL_LUN_FLAG_DEVID) == 0) {
2354		snprintf(tmpstr, sizeof(tmpstr), "MYDEVID%4d", softc->num_luns);
2355		strncpy((char *)cbe_lun->device_id, tmpstr,
2356			MIN(sizeof(cbe_lun->device_id), sizeof(tmpstr)));
2357
2358		/* Tell the user what we used for a device ID */
2359		strncpy((char *)params->device_id, tmpstr,
2360			MIN(sizeof(params->device_id), sizeof(tmpstr)));
2361	} else {
2362		strncpy((char *)cbe_lun->device_id, params->device_id,
2363			MIN(sizeof(cbe_lun->device_id),
2364			    sizeof(params->device_id)));
2365	}
2366
2367	TASK_INIT(&be_lun->io_task, /*priority*/0, ctl_be_block_worker, be_lun);
2368
2369	be_lun->io_taskqueue = taskqueue_create(be_lun->lunname, M_WAITOK,
2370	    taskqueue_thread_enqueue, /*context*/&be_lun->io_taskqueue);
2371
2372	if (be_lun->io_taskqueue == NULL) {
2373		snprintf(req->error_str, sizeof(req->error_str),
2374			 "unable to create taskqueue");
2375		goto bailout_error;
2376	}
2377
2378	/*
2379	 * Note that we start the same number of threads by default for
2380	 * both the file case and the block device case.  For the file
2381	 * case, we need multiple threads to allow concurrency, because the
2382	 * vnode interface is designed to be a blocking interface.  For the
2383	 * block device case, ZFS zvols at least will block the caller's
2384	 * context in many instances, and so we need multiple threads to
2385	 * overcome that problem.  Other block devices don't need as many
2386	 * threads, but they shouldn't cause too many problems.
2387	 *
2388	 * If the user wants to just have a single thread for a block
2389	 * device, he can specify that when the LUN is created, or change
2390	 * the tunable/sysctl to alter the default number of threads.
2391	 */
2392	retval = taskqueue_start_threads(&be_lun->io_taskqueue,
2393					 /*num threads*/num_threads,
2394					 /*priority*/PWAIT,
2395					 /*thread name*/
2396					 "%s taskq", be_lun->lunname);
2397
2398	if (retval != 0)
2399		goto bailout_error;
2400
2401	be_lun->num_threads = num_threads;
2402
2403	mtx_lock(&softc->lock);
2404	softc->num_luns++;
2405	STAILQ_INSERT_TAIL(&softc->lun_list, be_lun, links);
2406
2407	mtx_unlock(&softc->lock);
2408
2409	retval = ctl_add_lun(&be_lun->cbe_lun);
2410	if (retval != 0) {
2411		mtx_lock(&softc->lock);
2412		STAILQ_REMOVE(&softc->lun_list, be_lun, ctl_be_block_lun,
2413			      links);
2414		softc->num_luns--;
2415		mtx_unlock(&softc->lock);
2416		snprintf(req->error_str, sizeof(req->error_str),
2417			 "ctl_add_lun() returned error %d, see dmesg for "
2418			 "details", retval);
2419		retval = 0;
2420		goto bailout_error;
2421	}
2422
2423	mtx_lock(&softc->lock);
2424
2425	/*
2426	 * Tell the config_status routine that we're waiting so it won't
2427	 * clean up the LUN in the event of an error.
2428	 */
2429	be_lun->flags |= CTL_BE_BLOCK_LUN_WAITING;
2430
2431	while (be_lun->flags & CTL_BE_BLOCK_LUN_UNCONFIGURED) {
2432		retval = msleep(be_lun, &softc->lock, PCATCH, "ctlblk", 0);
2433		if (retval == EINTR)
2434			break;
2435	}
2436	be_lun->flags &= ~CTL_BE_BLOCK_LUN_WAITING;
2437
2438	if (be_lun->flags & CTL_BE_BLOCK_LUN_CONFIG_ERR) {
2439		snprintf(req->error_str, sizeof(req->error_str),
2440			 "LUN configuration error, see dmesg for details");
2441		STAILQ_REMOVE(&softc->lun_list, be_lun, ctl_be_block_lun,
2442			      links);
2443		softc->num_luns--;
2444		mtx_unlock(&softc->lock);
2445		goto bailout_error;
2446	} else {
2447		params->req_lun_id = cbe_lun->lun_id;
2448	}
2449
2450	mtx_unlock(&softc->lock);
2451
2452	be_lun->disk_stats = devstat_new_entry("cbb", params->req_lun_id,
2453					       cbe_lun->blocksize,
2454					       DEVSTAT_ALL_SUPPORTED,
2455					       cbe_lun->lun_type
2456					       | DEVSTAT_TYPE_IF_OTHER,
2457					       DEVSTAT_PRIORITY_OTHER);
2458
2459	return (retval);
2460
2461bailout_error:
2462	req->status = CTL_LUN_ERROR;
2463
2464	if (be_lun->io_taskqueue != NULL)
2465		taskqueue_free(be_lun->io_taskqueue);
2466	ctl_be_block_close(be_lun);
2467	if (be_lun->dev_path != NULL)
2468		free(be_lun->dev_path, M_CTLBLK);
2469	if (be_lun->lun_zone != NULL)
2470		uma_zdestroy(be_lun->lun_zone);
2471	ctl_free_opts(&cbe_lun->options);
2472	mtx_destroy(&be_lun->queue_lock);
2473	mtx_destroy(&be_lun->io_lock);
2474	free(be_lun, M_CTLBLK);
2475
2476	return (retval);
2477}
2478
2479static int
2480ctl_be_block_rm(struct ctl_be_block_softc *softc, struct ctl_lun_req *req)
2481{
2482	struct ctl_lun_rm_params *params;
2483	struct ctl_be_block_lun *be_lun;
2484	struct ctl_be_lun *cbe_lun;
2485	int retval;
2486
2487	params = &req->reqdata.rm;
2488
2489	mtx_lock(&softc->lock);
2490	STAILQ_FOREACH(be_lun, &softc->lun_list, links) {
2491		if (be_lun->cbe_lun.lun_id == params->lun_id)
2492			break;
2493	}
2494	mtx_unlock(&softc->lock);
2495	if (be_lun == NULL) {
2496		snprintf(req->error_str, sizeof(req->error_str),
2497			 "LUN %u is not managed by the block backend",
2498			 params->lun_id);
2499		goto bailout_error;
2500	}
2501	cbe_lun = &be_lun->cbe_lun;
2502
2503	retval = ctl_disable_lun(cbe_lun);
2504	if (retval != 0) {
2505		snprintf(req->error_str, sizeof(req->error_str),
2506			 "error %d returned from ctl_disable_lun() for "
2507			 "LUN %d", retval, params->lun_id);
2508		goto bailout_error;
2509	}
2510
2511	if (be_lun->vn != NULL) {
2512		cbe_lun->flags |= CTL_LUN_FLAG_NO_MEDIA;
2513		ctl_lun_no_media(cbe_lun);
2514		taskqueue_drain_all(be_lun->io_taskqueue);
2515		ctl_be_block_close(be_lun);
2516	}
2517
2518	retval = ctl_invalidate_lun(cbe_lun);
2519	if (retval != 0) {
2520		snprintf(req->error_str, sizeof(req->error_str),
2521			 "error %d returned from ctl_invalidate_lun() for "
2522			 "LUN %d", retval, params->lun_id);
2523		goto bailout_error;
2524	}
2525
2526	mtx_lock(&softc->lock);
2527	be_lun->flags |= CTL_BE_BLOCK_LUN_WAITING;
2528	while ((be_lun->flags & CTL_BE_BLOCK_LUN_UNCONFIGURED) == 0) {
2529                retval = msleep(be_lun, &softc->lock, PCATCH, "ctlblk", 0);
2530                if (retval == EINTR)
2531                        break;
2532        }
2533	be_lun->flags &= ~CTL_BE_BLOCK_LUN_WAITING;
2534
2535	if ((be_lun->flags & CTL_BE_BLOCK_LUN_UNCONFIGURED) == 0) {
2536		snprintf(req->error_str, sizeof(req->error_str),
2537			 "interrupted waiting for LUN to be freed");
2538		mtx_unlock(&softc->lock);
2539		goto bailout_error;
2540	}
2541
2542	STAILQ_REMOVE(&softc->lun_list, be_lun, ctl_be_block_lun, links);
2543
2544	softc->num_luns--;
2545	mtx_unlock(&softc->lock);
2546
2547	taskqueue_drain_all(be_lun->io_taskqueue);
2548	taskqueue_free(be_lun->io_taskqueue);
2549
2550	if (be_lun->disk_stats != NULL)
2551		devstat_remove_entry(be_lun->disk_stats);
2552
2553	uma_zdestroy(be_lun->lun_zone);
2554
2555	ctl_free_opts(&cbe_lun->options);
2556	free(be_lun->dev_path, M_CTLBLK);
2557	mtx_destroy(&be_lun->queue_lock);
2558	mtx_destroy(&be_lun->io_lock);
2559	free(be_lun, M_CTLBLK);
2560
2561	req->status = CTL_LUN_OK;
2562	return (0);
2563
2564bailout_error:
2565	req->status = CTL_LUN_ERROR;
2566	return (0);
2567}
2568
/*
 * Modify an existing block-backend LUN (CTL_LUNREQ_MODIFY request).
 * Handles size changes, option updates, HA primary/secondary role
 * transitions, and (re)opening or closing the backing store depending on
 * the resulting role.  Always returns 0; the outcome is reported in
 * req->status (CTL_LUN_OK / CTL_LUN_WARNING / CTL_LUN_ERROR).
 */
static int
ctl_be_block_modify(struct ctl_be_block_softc *softc, struct ctl_lun_req *req)
{
	struct ctl_lun_modify_params *params;
	struct ctl_be_block_lun *be_lun;
	struct ctl_be_lun *cbe_lun;
	char *value;
	uint64_t oldsize;
	int error, wasprim;

	params = &req->reqdata.modify;

	/* Look the LUN up by id under the softc lock. */
	mtx_lock(&softc->lock);
	STAILQ_FOREACH(be_lun, &softc->lun_list, links) {
		if (be_lun->cbe_lun.lun_id == params->lun_id)
			break;
	}
	mtx_unlock(&softc->lock);
	if (be_lun == NULL) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "LUN %u is not managed by the block backend",
			 params->lun_id);
		goto bailout_error;
	}
	cbe_lun = &be_lun->cbe_lun;

	/* A zero lun_size_bytes means "keep the current size". */
	if (params->lun_size_bytes != 0)
		be_lun->params.lun_size_bytes = params->lun_size_bytes;
	ctl_update_opts(&cbe_lun->options, req->num_be_args, req->kern_be_args);

	/* Recompute the HA role and notify CTL if it changed. */
	wasprim = (cbe_lun->flags & CTL_LUN_FLAG_PRIMARY);
	value = ctl_get_opt(&cbe_lun->options, "ha_role");
	if (value != NULL) {
		if (strcmp(value, "primary") == 0)
			cbe_lun->flags |= CTL_LUN_FLAG_PRIMARY;
		else
			cbe_lun->flags &= ~CTL_LUN_FLAG_PRIMARY;
	} else if (control_softc->flags & CTL_FLAG_ACTIVE_SHELF)
		cbe_lun->flags |= CTL_LUN_FLAG_PRIMARY;
	else
		cbe_lun->flags &= ~CTL_LUN_FLAG_PRIMARY;
	if (wasprim != (cbe_lun->flags & CTL_LUN_FLAG_PRIMARY)) {
		if (cbe_lun->flags & CTL_LUN_FLAG_PRIMARY)
			ctl_lun_primary(cbe_lun);
		else
			ctl_lun_secondary(cbe_lun);
	}

	oldsize = be_lun->size_blocks;
	if ((cbe_lun->flags & CTL_LUN_FLAG_PRIMARY) ||
	    control_softc->ha_mode == CTL_HA_MODE_SER_ONLY) {
		/*
		 * Primary (or serialize-only) node: make sure the backing
		 * store is open, re-reading its geometry to pick up any
		 * size change.
		 */
		if (be_lun->vn == NULL)
			error = ctl_be_block_open(be_lun, req);
		else if (vn_isdisk(be_lun->vn, &error))
			error = ctl_be_block_open_dev(be_lun, req);
		else if (be_lun->vn->v_type == VREG) {
			/* open_file() expects the vnode locked. */
			vn_lock(be_lun->vn, LK_SHARED | LK_RETRY);
			error = ctl_be_block_open_file(be_lun, req);
			VOP_UNLOCK(be_lun->vn, 0);
		} else
			error = EINVAL;
		/* Reconcile the NO_MEDIA flag with the open result. */
		if ((cbe_lun->flags & CTL_LUN_FLAG_NO_MEDIA) &&
		    be_lun->vn != NULL) {
			cbe_lun->flags &= ~CTL_LUN_FLAG_NO_MEDIA;
			ctl_lun_has_media(cbe_lun);
		} else if ((cbe_lun->flags & CTL_LUN_FLAG_NO_MEDIA) == 0 &&
		    be_lun->vn == NULL) {
			cbe_lun->flags |= CTL_LUN_FLAG_NO_MEDIA;
			ctl_lun_no_media(cbe_lun);
		}
		cbe_lun->flags &= ~CTL_LUN_FLAG_EJECTED;
	} else {
		/* Secondary node: drain I/O and close the backing store. */
		if (be_lun->vn != NULL) {
			cbe_lun->flags |= CTL_LUN_FLAG_NO_MEDIA;
			ctl_lun_no_media(cbe_lun);
			taskqueue_drain_all(be_lun->io_taskqueue);
			error = ctl_be_block_close(be_lun);
		} else
			error = 0;
	}
	if (be_lun->size_blocks != oldsize)
		ctl_lun_capacity_changed(cbe_lun);

	/* Tell the user the exact size we ended up using */
	params->lun_size_bytes = be_lun->size_bytes;

	req->status = error ? CTL_LUN_WARNING : CTL_LUN_OK;
	return (0);

bailout_error:
	req->status = CTL_LUN_ERROR;
	return (0);
}
2662
2663static void
2664ctl_be_block_lun_shutdown(void *be_lun)
2665{
2666	struct ctl_be_block_lun *lun;
2667	struct ctl_be_block_softc *softc;
2668
2669	lun = (struct ctl_be_block_lun *)be_lun;
2670	softc = lun->softc;
2671
2672	mtx_lock(&softc->lock);
2673	lun->flags |= CTL_BE_BLOCK_LUN_UNCONFIGURED;
2674	if (lun->flags & CTL_BE_BLOCK_LUN_WAITING)
2675		wakeup(lun);
2676	mtx_unlock(&softc->lock);
2677}
2678
2679static void
2680ctl_be_block_lun_config_status(void *be_lun, ctl_lun_config_status status)
2681{
2682	struct ctl_be_block_lun *lun;
2683	struct ctl_be_block_softc *softc;
2684
2685	lun = (struct ctl_be_block_lun *)be_lun;
2686	softc = lun->softc;
2687
2688	if (status == CTL_LUN_CONFIG_OK) {
2689		mtx_lock(&softc->lock);
2690		lun->flags &= ~CTL_BE_BLOCK_LUN_UNCONFIGURED;
2691		if (lun->flags & CTL_BE_BLOCK_LUN_WAITING)
2692			wakeup(lun);
2693		mtx_unlock(&softc->lock);
2694
2695		/*
2696		 * We successfully added the LUN, attempt to enable it.
2697		 */
2698		if (ctl_enable_lun(&lun->cbe_lun) != 0) {
2699			printf("%s: ctl_enable_lun() failed!\n", __func__);
2700			if (ctl_invalidate_lun(&lun->cbe_lun) != 0) {
2701				printf("%s: ctl_invalidate_lun() failed!\n",
2702				       __func__);
2703			}
2704		}
2705
2706		return;
2707	}
2708
2709
2710	mtx_lock(&softc->lock);
2711	lun->flags &= ~CTL_BE_BLOCK_LUN_UNCONFIGURED;
2712	lun->flags |= CTL_BE_BLOCK_LUN_CONFIG_ERR;
2713	wakeup(lun);
2714	mtx_unlock(&softc->lock);
2715}
2716
2717
/*
 * Handle configuration-write (non-data) SCSI commands for a block LUN.
 * Cache syncs, WRITE SAME, and UNMAP are queued to the worker taskqueue;
 * START STOP UNIT and PREVENT ALLOW are completed inline.
 */
static int
ctl_be_block_config_write(union ctl_io *io)
{
	struct ctl_be_block_lun *be_lun;
	struct ctl_be_lun *cbe_lun;
	int retval;

	DPRINTF("entered\n");

	cbe_lun = CTL_BACKEND_LUN(io);
	be_lun = (struct ctl_be_block_lun *)cbe_lun->be_lun;

	retval = 0;
	switch (io->scsiio.cdb[0]) {
	case SYNCHRONIZE_CACHE:
	case SYNCHRONIZE_CACHE_16:
	case WRITE_SAME_10:
	case WRITE_SAME_16:
	case UNMAP:
		/*
		 * The upper level CTL code will filter out any CDBs with
		 * the immediate bit set and return the proper error.
		 *
		 * We don't really need to worry about what LBA range the
		 * user asked to be synced out.  When they issue a sync
		 * cache command, we'll sync out the whole thing.
		 */
		mtx_lock(&be_lun->queue_lock);
		STAILQ_INSERT_TAIL(&be_lun->config_write_queue, &io->io_hdr,
				   links);
		mtx_unlock(&be_lun->queue_lock);
		taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task);
		break;
	case START_STOP_UNIT: {
		struct scsi_start_stop_unit *cdb;
		/*
		 * NOTE(review): req is scratch storage for
		 * ctl_be_block_open(), which only writes req->error_str
		 * and never reads from req; that's why it is safe to pass
		 * it uninitialized.  The message written into it is
		 * discarded.
		 */
		struct ctl_lun_req req;

		cdb = (struct scsi_start_stop_unit *)io->scsiio.cdb;
		/* Power-condition requests are accepted as no-ops. */
		if ((cdb->how & SSS_PC_MASK) != 0) {
			ctl_set_success(&io->scsiio);
			ctl_config_write_done(io);
			break;
		}
		if (cdb->how & SSS_START) {
			/* START with LOEJ: (re)load media if none present. */
			if ((cdb->how & SSS_LOEJ) && be_lun->vn == NULL) {
				retval = ctl_be_block_open(be_lun, &req);
				cbe_lun->flags &= ~CTL_LUN_FLAG_EJECTED;
				if (retval == 0) {
					cbe_lun->flags &= ~CTL_LUN_FLAG_NO_MEDIA;
					ctl_lun_has_media(cbe_lun);
				} else {
					cbe_lun->flags |= CTL_LUN_FLAG_NO_MEDIA;
					ctl_lun_no_media(cbe_lun);
				}
			}
			ctl_start_lun(cbe_lun);
		} else {
			ctl_stop_lun(cbe_lun);
			/* STOP with LOEJ: eject and close the media. */
			if (cdb->how & SSS_LOEJ) {
				cbe_lun->flags |= CTL_LUN_FLAG_NO_MEDIA;
				cbe_lun->flags |= CTL_LUN_FLAG_EJECTED;
				ctl_lun_ejected(cbe_lun);
				if (be_lun->vn != NULL)
					ctl_be_block_close(be_lun);
			}
		}

		ctl_set_success(&io->scsiio);
		ctl_config_write_done(io);
		break;
	}
	case PREVENT_ALLOW:
		/* Media-removal prevention is accepted as a no-op. */
		ctl_set_success(&io->scsiio);
		ctl_config_write_done(io);
		break;
	default:
		ctl_set_invalid_opcode(&io->scsiio);
		ctl_config_write_done(io);
		retval = CTL_RETVAL_COMPLETE;
		break;
	}

	return (retval);
}
2802
2803static int
2804ctl_be_block_config_read(union ctl_io *io)
2805{
2806	struct ctl_be_block_lun *be_lun;
2807	struct ctl_be_lun *cbe_lun;
2808	int retval = 0;
2809
2810	DPRINTF("entered\n");
2811
2812	cbe_lun = CTL_BACKEND_LUN(io);
2813	be_lun = (struct ctl_be_block_lun *)cbe_lun->be_lun;
2814
2815	switch (io->scsiio.cdb[0]) {
2816	case SERVICE_ACTION_IN:
2817		if (io->scsiio.cdb[1] == SGLS_SERVICE_ACTION) {
2818			mtx_lock(&be_lun->queue_lock);
2819			STAILQ_INSERT_TAIL(&be_lun->config_read_queue,
2820			    &io->io_hdr, links);
2821			mtx_unlock(&be_lun->queue_lock);
2822			taskqueue_enqueue(be_lun->io_taskqueue,
2823			    &be_lun->io_task);
2824			retval = CTL_RETVAL_QUEUED;
2825			break;
2826		}
2827		ctl_set_invalid_field(&io->scsiio,
2828				      /*sks_valid*/ 1,
2829				      /*command*/ 1,
2830				      /*field*/ 1,
2831				      /*bit_valid*/ 1,
2832				      /*bit*/ 4);
2833		ctl_config_read_done(io);
2834		retval = CTL_RETVAL_COMPLETE;
2835		break;
2836	default:
2837		ctl_set_invalid_opcode(&io->scsiio);
2838		ctl_config_read_done(io);
2839		retval = CTL_RETVAL_COMPLETE;
2840		break;
2841	}
2842
2843	return (retval);
2844}
2845
2846static int
2847ctl_be_block_lun_info(void *be_lun, struct sbuf *sb)
2848{
2849	struct ctl_be_block_lun *lun;
2850	int retval;
2851
2852	lun = (struct ctl_be_block_lun *)be_lun;
2853
2854	retval = sbuf_printf(sb, "\t<num_threads>");
2855	if (retval != 0)
2856		goto bailout;
2857	retval = sbuf_printf(sb, "%d", lun->num_threads);
2858	if (retval != 0)
2859		goto bailout;
2860	retval = sbuf_printf(sb, "</num_threads>\n");
2861
2862bailout:
2863	return (retval);
2864}
2865
2866static uint64_t
2867ctl_be_block_lun_attr(void *be_lun, const char *attrname)
2868{
2869	struct ctl_be_block_lun *lun = (struct ctl_be_block_lun *)be_lun;
2870
2871	if (lun->getattr == NULL)
2872		return (UINT64_MAX);
2873	return (lun->getattr(lun, attrname));
2874}
2875
2876int
2877ctl_be_block_init(void)
2878{
2879	struct ctl_be_block_softc *softc;
2880	int retval;
2881
2882	softc = &backend_block_softc;
2883	retval = 0;
2884
2885	mtx_init(&softc->lock, "ctlblock", NULL, MTX_DEF);
2886	beio_zone = uma_zcreate("beio", sizeof(struct ctl_be_block_io),
2887	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
2888	STAILQ_INIT(&softc->lun_list);
2889
2890	return (retval);
2891}
2892