/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2023-2024 Chelsio Communications, Inc.
 * Written by: John Baldwin <jhb@FreeBSD.org>
 */

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/memdesc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/sx.h>
#include <sys/taskqueue.h>
#include <dev/nvme/nvme.h>
#include <dev/nvmf/nvmf.h>
#include <dev/nvmf/nvmf_transport.h>
#include <dev/nvmf/host/nvmf_var.h>

static struct cdevsw nvmf_cdevsw;

MALLOC_DEFINE(M_NVMF, "nvmf", "NVMe over Fabrics host");

static void	nvmf_disconnect_task(void *arg, int pending);

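/*
 * Helpers for synchronous commands: nvmf_complete() and
 * nvmf_io_complete() record the command completion and data transfer
 * result in an nvmf_completion_status and wake any thread sleeping in
 * nvmf_wait_for_reply().  A pool mutex keyed on the address of the
 * status structure is used so the status can live on the caller's
 * stack without embedding a lock.
 */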
void
nvmf_complete(void *arg, const struct nvme_completion *cqe)
{
	struct nvmf_completion_status *status = arg;
	struct mtx *mtx;

	status->cqe = *cqe;
	mtx = mtx_pool_find(mtxpool_sleep, status);
	mtx_lock(mtx);
	status->done = true;
	mtx_unlock(mtx);
	wakeup(status);
}

void
nvmf_io_complete(void *arg, size_t xfered, int error)
{
	struct nvmf_completion_status *status = arg;
	struct mtx *mtx;

	status->io_error = error;
	mtx = mtx_pool_find(mtxpool_sleep, status);
	mtx_lock(mtx);
	status->io_done = true;
	mtx_unlock(mtx);
	wakeup(status);
}

void
nvmf_wait_for_reply(struct nvmf_completion_status *status)
{
	struct mtx *mtx;

	mtx = mtx_pool_find(mtxpool_sleep, status);
	mtx_lock(mtx);
	while (!status->done || !status->io_done)
		mtx_sleep(status, mtx, 0, "nvmfcmd", 0);
	mtx_unlock(mtx);
}

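/*
 * Controller "registers" (CAP, VS, CC) are accessed via Fabrics
 * PROPERTY_GET and PROPERTY_SET commands on the admin queue rather
 * than memory-mapped I/O.
 */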
static int
nvmf_read_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
    uint64_t *value)
{
	const struct nvmf_fabric_prop_get_rsp *rsp;
	struct nvmf_completion_status status;

	nvmf_status_init(&status);
	if (!nvmf_cmd_get_property(sc, offset, size, nvmf_complete, &status,
	    M_WAITOK))
		return (ECONNABORTED);
	nvmf_wait_for_reply(&status);

	if (status.cqe.status != 0) {
		device_printf(sc->dev, "PROPERTY_GET failed, status %#x\n",
		    le16toh(status.cqe.status));
		return (EIO);
	}

	rsp = (const struct nvmf_fabric_prop_get_rsp *)&status.cqe;
	if (size == 8)
		*value = le64toh(rsp->value.u64);
	else
		*value = le32toh(rsp->value.u32.low);
	return (0);
}

static int
nvmf_write_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
    uint64_t value)
{
	struct nvmf_completion_status status;

	nvmf_status_init(&status);
	if (!nvmf_cmd_set_property(sc, offset, size, value, nvmf_complete,
	    &status, M_WAITOK))
		return (ECONNABORTED);
	nvmf_wait_for_reply(&status);

	if (status.cqe.status != 0) {
		device_printf(sc->dev, "PROPERTY_SET failed, status %#x\n",
		    le16toh(status.cqe.status));
		return (EIO);
	}
	return (0);
}

static void
nvmf_shutdown_controller(struct nvmf_softc *sc)
{
	uint64_t cc;
	int error;

	error = nvmf_read_property(sc, NVMF_PROP_CC, 4, &cc);
	if (error != 0) {
		device_printf(sc->dev, "Failed to fetch CC for shutdown\n");
		return;
	}

	cc |= NVMEF(NVME_CC_REG_SHN, NVME_SHN_NORMAL);

	error = nvmf_write_property(sc, NVMF_PROP_CC, 4, cc);
	if (error != 0)
		device_printf(sc->dev,
		    "Failed to set CC to trigger shutdown\n");
}

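/*
 * KeepAlive handling: the TX timer sends a KeepAlive command at half
 * the negotiated KATO interval unless other traffic was already sent
 * during that interval, while the RX timer tears down the association
 * if no traffic at all has been received from the controller within a
 * full interval.
 */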
static void
nvmf_check_keep_alive(void *arg)
{
	struct nvmf_softc *sc = arg;
	int traffic;

	traffic = atomic_readandclear_int(&sc->ka_active_rx_traffic);
	if (traffic == 0) {
		device_printf(sc->dev,
		    "disconnecting due to KeepAlive timeout\n");
		nvmf_disconnect(sc);
		return;
	}

	callout_schedule_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0, C_HARDCLOCK);
}

static void
nvmf_keep_alive_complete(void *arg, const struct nvme_completion *cqe)
{
	struct nvmf_softc *sc = arg;

	atomic_store_int(&sc->ka_active_rx_traffic, 1);
	if (cqe->status != 0) {
		device_printf(sc->dev,
		    "KeepAlive response reported status %#x\n",
		    le16toh(cqe->status));
	}
}

static void
nvmf_send_keep_alive(void *arg)
{
	struct nvmf_softc *sc = arg;
	int traffic;

	/*
	 * Don't bother sending a KeepAlive command if TBKAS is active
	 * and another command has been sent during the interval.
	 */
	traffic = atomic_load_int(&sc->ka_active_tx_traffic);
	if (traffic == 0 && !nvmf_cmd_keep_alive(sc, nvmf_keep_alive_complete,
	    sc, M_NOWAIT))
		device_printf(sc->dev,
		    "Failed to allocate KeepAlive command\n");

	/* Clear ka_active_tx_traffic after sending the keep alive command. */
	atomic_store_int(&sc->ka_active_tx_traffic, 0);

	callout_schedule_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0, C_HARDCLOCK);
}

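/*
 * Instance variables for a new association are copied in from a
 * userland handoff structure: the controller identify data (converted
 * to host byte order) plus the transport-specific parameters for the
 * admin queue and each I/O queue.
 */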
int
nvmf_init_ivars(struct nvmf_ivars *ivars, struct nvmf_handoff_host *hh)
{
	size_t len;
	u_int i;
	int error;

	memset(ivars, 0, sizeof(*ivars));

	if (!hh->admin.admin || hh->num_io_queues < 1)
		return (EINVAL);

	ivars->cdata = malloc(sizeof(*ivars->cdata), M_NVMF, M_WAITOK);
	error = copyin(hh->cdata, ivars->cdata, sizeof(*ivars->cdata));
	if (error != 0)
		goto out;
	nvme_controller_data_swapbytes(ivars->cdata);

	len = hh->num_io_queues * sizeof(*ivars->io_params);
	ivars->io_params = malloc(len, M_NVMF, M_WAITOK);
	error = copyin(hh->io, ivars->io_params, len);
	if (error != 0)
		goto out;
	for (i = 0; i < hh->num_io_queues; i++) {
		if (ivars->io_params[i].admin) {
			error = EINVAL;
			goto out;
		}

		/* Require all I/O queues to be the same size. */
		if (ivars->io_params[i].qsize != ivars->io_params[0].qsize) {
			error = EINVAL;
			goto out;
		}
	}

	ivars->hh = hh;
	return (0);

out:
	free(ivars->io_params, M_NVMF);
	free(ivars->cdata, M_NVMF);
	return (error);
}

void
nvmf_free_ivars(struct nvmf_ivars *ivars)
{
	free(ivars->io_params, M_NVMF);
	free(ivars->cdata, M_NVMF);
}

static int
nvmf_probe(device_t dev)
{
	struct nvmf_ivars *ivars = device_get_ivars(dev);
	char desc[260];

	if (ivars == NULL)
		return (ENXIO);

	snprintf(desc, sizeof(desc), "Fabrics: %.256s", ivars->cdata->subnqn);
	device_set_desc_copy(dev, desc);
	return (BUS_PROBE_DEFAULT);
}

static int
nvmf_establish_connection(struct nvmf_softc *sc, struct nvmf_ivars *ivars)
{
	char name[16];

	/* Setup the admin queue. */
	sc->admin = nvmf_init_qp(sc, ivars->hh->trtype, &ivars->hh->admin,
	    "admin queue");
	if (sc->admin == NULL) {
		device_printf(sc->dev, "Failed to setup admin queue\n");
		return (ENXIO);
	}

	/* Setup I/O queues. */
	sc->io = malloc(ivars->hh->num_io_queues * sizeof(*sc->io), M_NVMF,
	    M_WAITOK | M_ZERO);
	sc->num_io_queues = ivars->hh->num_io_queues;
	for (u_int i = 0; i < sc->num_io_queues; i++) {
		snprintf(name, sizeof(name), "I/O queue %u", i);
		sc->io[i] = nvmf_init_qp(sc, ivars->hh->trtype,
		    &ivars->io_params[i], name);
		if (sc->io[i] == NULL) {
			device_printf(sc->dev, "Failed to setup I/O queue %u\n",
			    i + 1);
			return (ENXIO);
		}
	}

	/* Start KeepAlive timers. */
	if (ivars->hh->kato != 0) {
		sc->ka_traffic = NVMEV(NVME_CTRLR_DATA_CTRATT_TBKAS,
		    sc->cdata->ctratt) != 0;
		sc->ka_rx_sbt = mstosbt(ivars->hh->kato);
		sc->ka_tx_sbt = sc->ka_rx_sbt / 2;
		callout_reset_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0,
		    nvmf_check_keep_alive, sc, C_HARDCLOCK);
		callout_reset_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0,
		    nvmf_send_keep_alive, sc, C_HARDCLOCK);
	}

	return (0);
}

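/*
 * Fetch one page of the active namespace list starting at *nsidp and
 * create a namespace for each new, valid entry.  On success, *nsidp is
 * updated to the NSID to resume scanning from, or to zero once the
 * list has been exhausted.
 */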
static bool
nvmf_scan_nslist(struct nvmf_softc *sc, struct nvme_ns_list *nslist,
    struct nvme_namespace_data *data, uint32_t *nsidp)
{
	struct nvmf_completion_status status;
	uint32_t nsid;

	nvmf_status_init(&status);
	nvmf_status_wait_io(&status);
	if (!nvmf_cmd_identify_active_namespaces(sc, *nsidp, nslist,
	    nvmf_complete, &status, nvmf_io_complete, &status, M_WAITOK)) {
		device_printf(sc->dev,
		    "failed to send IDENTIFY active namespaces command\n");
		return (false);
	}
	nvmf_wait_for_reply(&status);

	if (status.cqe.status != 0) {
		device_printf(sc->dev,
		    "IDENTIFY active namespaces failed, status %#x\n",
		    le16toh(status.cqe.status));
		return (false);
	}

	if (status.io_error != 0) {
		device_printf(sc->dev,
		    "IDENTIFY active namespaces failed with I/O error %d\n",
		    status.io_error);
		return (false);
	}

	for (u_int i = 0; i < nitems(nslist->ns); i++) {
		nsid = nslist->ns[i];
		if (nsid == 0) {
			*nsidp = 0;
			return (true);
		}

		if (sc->ns[nsid - 1] != NULL) {
			device_printf(sc->dev,
			    "duplicate namespace %u in active namespace list\n",
			    nsid);
			return (false);
		}

		nvmf_status_init(&status);
		nvmf_status_wait_io(&status);
		if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete,
		    &status, nvmf_io_complete, &status, M_WAITOK)) {
			device_printf(sc->dev,
			    "failed to send IDENTIFY namespace %u command\n",
			    nsid);
			return (false);
		}
		nvmf_wait_for_reply(&status);

		if (status.cqe.status != 0) {
			device_printf(sc->dev,
			    "IDENTIFY namespace %u failed, status %#x\n", nsid,
			    le16toh(status.cqe.status));
			return (false);
		}

		if (status.io_error != 0) {
			device_printf(sc->dev,
			    "IDENTIFY namespace %u failed with I/O error %d\n",
			    nsid, status.io_error);
			return (false);
		}

		/*
		 * As in nvme_ns_construct, a size of zero indicates an
		 * invalid namespace.
		 */
		nvme_namespace_data_swapbytes(data);
		if (data->nsze == 0) {
			device_printf(sc->dev,
			    "ignoring active namespace %u with zero size\n",
			    nsid);
			continue;
		}

		sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data);

		nvmf_sim_rescan_ns(sc, nsid);
	}

	MPASS(nsid == nslist->ns[nitems(nslist->ns) - 1] && nsid != 0);

	if (nsid >= 0xfffffffd)
		*nsidp = 0;
	else
		*nsidp = nsid + 1;
	return (true);
}

static bool
nvmf_add_namespaces(struct nvmf_softc *sc)
{
	struct nvme_namespace_data *data;
	struct nvme_ns_list *nslist;
	uint32_t nsid;
	bool retval;

	sc->ns = mallocarray(sc->cdata->nn, sizeof(*sc->ns), M_NVMF,
	    M_WAITOK | M_ZERO);
	nslist = malloc(sizeof(*nslist), M_NVMF, M_WAITOK);
	data = malloc(sizeof(*data), M_NVMF, M_WAITOK);

	nsid = 0;
	retval = true;
	for (;;) {
		if (!nvmf_scan_nslist(sc, nslist, data, &nsid)) {
			retval = false;
			break;
		}
		if (nsid == 0)
			break;
	}

	free(data, M_NVMF);
	free(nslist, M_NVMF);
	return (retval);
}

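/*
 * Attach claims the controller data from the handoff, connects the
 * admin and I/O queues, reads the CAP and VS properties, sizes
 * transfers from MDTS, and then creates the CAM SIM, starts the AER
 * handlers, enumerates the active namespaces, and exposes a control
 * character device for ioctls.
 */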
static int
nvmf_attach(device_t dev)
{
	struct make_dev_args mda;
	struct nvmf_softc *sc = device_get_softc(dev);
	struct nvmf_ivars *ivars = device_get_ivars(dev);
	uint64_t val;
	u_int i;
	int error;

	if (ivars == NULL)
		return (ENXIO);

	sc->dev = dev;
	sc->trtype = ivars->hh->trtype;
	callout_init(&sc->ka_rx_timer, 1);
	callout_init(&sc->ka_tx_timer, 1);
	sx_init(&sc->connection_lock, "nvmf connection");
	TASK_INIT(&sc->disconnect_task, 0, nvmf_disconnect_task, sc);

	/* Claim the cdata pointer from ivars. */
	sc->cdata = ivars->cdata;
	ivars->cdata = NULL;

	nvmf_init_aer(sc);

	/* TODO: Multiqueue support. */
	sc->max_pending_io = ivars->io_params[0].qsize /* * sc->num_io_queues */;

	error = nvmf_establish_connection(sc, ivars);
	if (error != 0)
		goto out;

	error = nvmf_read_property(sc, NVMF_PROP_CAP, 8, &sc->cap);
	if (error != 0) {
		device_printf(sc->dev, "Failed to fetch CAP\n");
		error = ENXIO;
		goto out;
	}

	error = nvmf_read_property(sc, NVMF_PROP_VS, 4, &val);
	if (error != 0) {
		device_printf(sc->dev, "Failed to fetch VS\n");
		error = ENXIO;
		goto out;
	}
	sc->vs = val;

	/* Honor MDTS if it is set. */
	sc->max_xfer_size = maxphys;
	if (sc->cdata->mdts != 0) {
		sc->max_xfer_size = ulmin(sc->max_xfer_size,
		    1 << (sc->cdata->mdts + NVME_MPS_SHIFT +
		    NVME_CAP_HI_MPSMIN(sc->cap >> 32)));
	}

	error = nvmf_init_sim(sc);
	if (error != 0)
		goto out;

	error = nvmf_start_aer(sc);
	if (error != 0) {
		nvmf_destroy_sim(sc);
		goto out;
	}

	if (!nvmf_add_namespaces(sc)) {
		nvmf_destroy_sim(sc);
		error = ENXIO;
		goto out;
	}

	make_dev_args_init(&mda);
	mda.mda_devsw = &nvmf_cdevsw;
	mda.mda_uid = UID_ROOT;
	mda.mda_gid = GID_WHEEL;
	mda.mda_mode = 0600;
	mda.mda_si_drv1 = sc;
	error = make_dev_s(&mda, &sc->cdev, "%s", device_get_nameunit(dev));
	if (error != 0) {
		nvmf_destroy_sim(sc);
		goto out;
	}

	return (0);
out:
	if (sc->ns != NULL) {
		for (i = 0; i < sc->cdata->nn; i++) {
			if (sc->ns[i] != NULL)
				nvmf_destroy_ns(sc->ns[i]);
		}
		free(sc->ns, M_NVMF);
	}

	callout_drain(&sc->ka_tx_timer);
	callout_drain(&sc->ka_rx_timer);

	if (sc->admin != NULL)
		nvmf_shutdown_controller(sc);

	for (i = 0; i < sc->num_io_queues; i++) {
		if (sc->io[i] != NULL)
			nvmf_destroy_qp(sc->io[i]);
	}
	free(sc->io, M_NVMF);
	if (sc->admin != NULL)
		nvmf_destroy_qp(sc->admin);

	nvmf_destroy_aer(sc);

	taskqueue_drain(taskqueue_thread, &sc->disconnect_task);
	sx_destroy(&sc->connection_lock);
	free(sc->cdata, M_NVMF);
	return (error);
}

void
nvmf_disconnect(struct nvmf_softc *sc)
{
	taskqueue_enqueue(taskqueue_thread, &sc->disconnect_task);
}

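/*
 * Tear down the current association after a transport error or
 * KeepAlive timeout.  Namespaces and the CAM SIM are quiesced rather
 * than destroyed so that I/O can resume if a new association is
 * established later via nvmf_reconnect_host().
 */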
static void
nvmf_disconnect_task(void *arg, int pending __unused)
{
	struct nvmf_softc *sc = arg;
	u_int i;

	sx_xlock(&sc->connection_lock);
	if (sc->admin == NULL) {
		/*
		 * Ignore transport errors if there is no active
		 * association.
		 */
		sx_xunlock(&sc->connection_lock);
		return;
	}

	if (sc->detaching) {
		if (sc->admin != NULL) {
			/*
			 * This unsticks the detach process if a
			 * transport error occurs during detach.
			 */
			nvmf_shutdown_qp(sc->admin);
		}
		sx_xunlock(&sc->connection_lock);
		return;
	}

	if (sc->cdev == NULL) {
		/*
		 * Transport error occurred during attach (nvmf_add_namespaces).
		 * Shut down the admin queue.
		 */
		nvmf_shutdown_qp(sc->admin);
		sx_xunlock(&sc->connection_lock);
		return;
	}

	callout_drain(&sc->ka_tx_timer);
	callout_drain(&sc->ka_rx_timer);
	sc->ka_traffic = false;

	/* Quiesce namespace consumers. */
	nvmf_disconnect_sim(sc);
	for (i = 0; i < sc->cdata->nn; i++) {
		if (sc->ns[i] != NULL)
			nvmf_disconnect_ns(sc->ns[i]);
	}

	/* Shutdown the existing qpairs. */
	for (i = 0; i < sc->num_io_queues; i++) {
		nvmf_destroy_qp(sc->io[i]);
	}
	free(sc->io, M_NVMF);
	sc->io = NULL;
	sc->num_io_queues = 0;
	nvmf_destroy_qp(sc->admin);
	sc->admin = NULL;

	sx_xunlock(&sc->connection_lock);
}

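/*
 * Handle the NVMF_RECONNECT_HOST ioctl: adopt a new association handed
 * off from userland, verify it targets the same subsystem NQN, and
 * resume the quiesced namespaces and CAM SIM.
 */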
static int
nvmf_reconnect_host(struct nvmf_softc *sc, struct nvmf_handoff_host *hh)
{
	struct nvmf_ivars ivars;
	u_int i;
	int error;

	/* XXX: Should we permit changing the transport type? */
	if (sc->trtype != hh->trtype) {
		device_printf(sc->dev,
		    "transport type mismatch on reconnect\n");
		return (EINVAL);
	}

	error = nvmf_init_ivars(&ivars, hh);
	if (error != 0)
		return (error);

	sx_xlock(&sc->connection_lock);
	if (sc->admin != NULL || sc->detaching) {
		error = EBUSY;
		goto out;
	}

	/*
	 * Ensure this is for the same controller.  Note that the
	 * controller ID can vary across associations if the remote
	 * system is using the dynamic controller model.  This merely
	 * ensures the new association is connected to the same NVMe
	 * subsystem.
	 */
	if (memcmp(sc->cdata->subnqn, ivars.cdata->subnqn,
	    sizeof(ivars.cdata->subnqn)) != 0) {
		device_printf(sc->dev,
		    "controller subsystem NQN mismatch on reconnect\n");
		error = EINVAL;
		goto out;
	}

	/*
	 * XXX: Require same number and size of I/O queues so that
	 * max_pending_io is still correct?
	 */

	error = nvmf_establish_connection(sc, &ivars);
	if (error != 0)
		goto out;

	error = nvmf_start_aer(sc);
	if (error != 0)
		goto out;

	device_printf(sc->dev,
	    "established new association with %u I/O queues\n",
	    sc->num_io_queues);

	/* Restart namespace consumers. */
	for (i = 0; i < sc->cdata->nn; i++) {
		if (sc->ns[i] != NULL)
			nvmf_reconnect_ns(sc->ns[i]);
	}
	nvmf_reconnect_sim(sc);
out:
	sx_xunlock(&sc->connection_lock);
	nvmf_free_ivars(&ivars);
	return (error);
}

static int
nvmf_detach(device_t dev)
{
	struct nvmf_softc *sc = device_get_softc(dev);
	u_int i;

	destroy_dev(sc->cdev);

	sx_xlock(&sc->connection_lock);
	sc->detaching = true;
	sx_xunlock(&sc->connection_lock);

	nvmf_destroy_sim(sc);
	for (i = 0; i < sc->cdata->nn; i++) {
		if (sc->ns[i] != NULL)
			nvmf_destroy_ns(sc->ns[i]);
	}
	free(sc->ns, M_NVMF);

	callout_drain(&sc->ka_tx_timer);
	callout_drain(&sc->ka_rx_timer);

	if (sc->admin != NULL)
		nvmf_shutdown_controller(sc);

	for (i = 0; i < sc->num_io_queues; i++) {
		nvmf_destroy_qp(sc->io[i]);
	}
	free(sc->io, M_NVMF);

	taskqueue_drain(taskqueue_thread, &sc->disconnect_task);

	if (sc->admin != NULL)
		nvmf_destroy_qp(sc->admin);

	nvmf_destroy_aer(sc);

	sx_destroy(&sc->connection_lock);
	free(sc->cdata, M_NVMF);
	return (0);
}

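/*
 * Re-IDENTIFY a single namespace (typically in response to a namespace
 * attribute change notification) and create, update, or destroy the
 * corresponding namespace before asking the CAM SIM to rescan it.
 */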
void
nvmf_rescan_ns(struct nvmf_softc *sc, uint32_t nsid)
{
	struct nvmf_completion_status status;
	struct nvme_namespace_data *data;
	struct nvmf_namespace *ns;

	data = malloc(sizeof(*data), M_NVMF, M_WAITOK);

	nvmf_status_init(&status);
	nvmf_status_wait_io(&status);
	if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete,
	    &status, nvmf_io_complete, &status, M_WAITOK)) {
		device_printf(sc->dev,
		    "failed to send IDENTIFY namespace %u command\n", nsid);
		free(data, M_NVMF);
		return;
	}
	nvmf_wait_for_reply(&status);

	if (status.cqe.status != 0) {
		device_printf(sc->dev,
		    "IDENTIFY namespace %u failed, status %#x\n", nsid,
		    le16toh(status.cqe.status));
		free(data, M_NVMF);
		return;
	}

	if (status.io_error != 0) {
		device_printf(sc->dev,
		    "IDENTIFY namespace %u failed with I/O error %d\n",
		    nsid, status.io_error);
		free(data, M_NVMF);
		return;
	}

	nvme_namespace_data_swapbytes(data);

	/* XXX: Needs locking around sc->ns[]. */
	ns = sc->ns[nsid - 1];
	if (data->nsze == 0) {
		/* XXX: Needs locking */
		if (ns != NULL) {
			nvmf_destroy_ns(ns);
			sc->ns[nsid - 1] = NULL;
		}
	} else {
		/* XXX: Needs locking */
		if (ns == NULL) {
			sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data);
		} else {
			if (!nvmf_update_ns(ns, data)) {
				nvmf_destroy_ns(ns);
				sc->ns[nsid - 1] = NULL;
			}
		}
	}

	free(data, M_NVMF);

	nvmf_sim_rescan_ns(sc, nsid);
}

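/*
 * Execute an NVMe passthrough command on behalf of userland.  Data is
 * staged through a kernel bounce buffer: copied in for writes, zeroed
 * for reads so no stale kernel memory leaks out, and copied back out
 * once the command completes successfully.
 */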
int
nvmf_passthrough_cmd(struct nvmf_softc *sc, struct nvme_pt_command *pt,
    bool admin)
{
	struct nvmf_completion_status status;
	struct nvme_command cmd;
	struct memdesc mem;
	struct nvmf_host_qpair *qp;
	struct nvmf_request *req;
	void *buf;
	int error;

	if (pt->len > sc->max_xfer_size)
		return (EINVAL);

	buf = NULL;
	if (pt->len != 0) {
		/*
		 * XXX: Depending on the size we may want to pin the
		 * user pages and use a memdesc with vm_page_t's
		 * instead.
		 */
		buf = malloc(pt->len, M_NVMF, M_WAITOK);
		if (pt->is_read == 0) {
			error = copyin(pt->buf, buf, pt->len);
			if (error != 0) {
				free(buf, M_NVMF);
				return (error);
			}
		} else {
			/* Ensure no kernel data is leaked to userland. */
			memset(buf, 0, pt->len);
		}
	}

	memset(&cmd, 0, sizeof(cmd));
	cmd.opc = pt->cmd.opc;
	cmd.fuse = pt->cmd.fuse;
	cmd.nsid = pt->cmd.nsid;
	cmd.cdw10 = pt->cmd.cdw10;
	cmd.cdw11 = pt->cmd.cdw11;
	cmd.cdw12 = pt->cmd.cdw12;
	cmd.cdw13 = pt->cmd.cdw13;
	cmd.cdw14 = pt->cmd.cdw14;
	cmd.cdw15 = pt->cmd.cdw15;

	if (admin)
		qp = sc->admin;
	else
		qp = nvmf_select_io_queue(sc);
	nvmf_status_init(&status);
	req = nvmf_allocate_request(qp, &cmd, nvmf_complete, &status, M_WAITOK);
	if (req == NULL) {
		device_printf(sc->dev, "failed to send passthrough command\n");
		error = ECONNABORTED;
		goto error;
	}

	if (pt->len != 0) {
		mem = memdesc_vaddr(buf, pt->len);
		nvmf_capsule_append_data(req->nc, &mem, pt->len,
		    pt->is_read == 0, nvmf_io_complete, &status);
		nvmf_status_wait_io(&status);
	}

	nvmf_submit_request(req);
	nvmf_wait_for_reply(&status);

	memset(&pt->cpl, 0, sizeof(pt->cpl));
	pt->cpl.cdw0 = status.cqe.cdw0;
	pt->cpl.status = status.cqe.status;

	error = status.io_error;
	if (error == 0 && pt->len != 0 && pt->is_read != 0)
		error = copyout(buf, pt->buf, pt->len);
error:
	free(buf, M_NVMF);
	return (error);
}

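/*
 * Ioctls on the controller character device: admin command
 * passthrough, NSID and maximum transfer size queries, and the
 * reconnect parameters/handoff used to establish a replacement
 * association.
 */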
static int
nvmf_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
    struct thread *td)
{
	struct nvmf_softc *sc = cdev->si_drv1;
	struct nvme_get_nsid *gnsid;
	struct nvme_pt_command *pt;
	struct nvmf_reconnect_params *rp;
	struct nvmf_handoff_host *hh;

	switch (cmd) {
	case NVME_PASSTHROUGH_CMD:
		pt = (struct nvme_pt_command *)arg;
		return (nvmf_passthrough_cmd(sc, pt, true));
	case NVME_GET_NSID:
		gnsid = (struct nvme_get_nsid *)arg;
		strlcpy(gnsid->cdev, device_get_nameunit(sc->dev),
		    sizeof(gnsid->cdev));
		gnsid->nsid = 0;
		return (0);
	case NVME_GET_MAX_XFER_SIZE:
		*(uint64_t *)arg = sc->max_xfer_size;
		return (0);
	case NVMF_RECONNECT_PARAMS:
		rp = (struct nvmf_reconnect_params *)arg;
		if ((sc->cdata->fcatt & 1) == 0)
			rp->cntlid = NVMF_CNTLID_DYNAMIC;
		else
			rp->cntlid = sc->cdata->ctrlr_id;
		memcpy(rp->subnqn, sc->cdata->subnqn, sizeof(rp->subnqn));
		return (0);
	case NVMF_RECONNECT_HOST:
		hh = (struct nvmf_handoff_host *)arg;
		return (nvmf_reconnect_host(sc, hh));
	default:
		return (ENOTTY);
	}
}

static struct cdevsw nvmf_cdevsw = {
	.d_version = D_VERSION,
	.d_ioctl = nvmf_ioctl
};

static int
nvmf_modevent(module_t mod, int what, void *arg)
{
	switch (what) {
	case MOD_LOAD:
		return (nvmf_ctl_load());
	case MOD_QUIESCE:
		return (0);
	case MOD_UNLOAD:
		nvmf_ctl_unload();
		destroy_dev_drain(&nvmf_cdevsw);
		return (0);
	default:
		return (EOPNOTSUPP);
	}
}

static device_method_t nvmf_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,     nvmf_probe),
	DEVMETHOD(device_attach,    nvmf_attach),
	DEVMETHOD(device_detach,    nvmf_detach),
#if 0
	DEVMETHOD(device_shutdown,  nvmf_shutdown),
#endif
	DEVMETHOD_END
};

driver_t nvme_nvmf_driver = {
	"nvme",
	nvmf_methods,
	sizeof(struct nvmf_softc),
};

DRIVER_MODULE(nvme, root, nvme_nvmf_driver, nvmf_modevent, NULL);
MODULE_DEPEND(nvmf, nvmf_transport, 1, 1, 1);