/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2023-2024 Chelsio Communications, Inc.
 * Written by: John Baldwin <jhb@FreeBSD.org>
 */

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/disk.h>
#include <sys/fcntl.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memdesc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/refcount.h>
#include <sys/sbuf.h>
#include <machine/stdarg.h>
#include <dev/nvme/nvme.h>
#include <dev/nvmf/host/nvmf_var.h>

struct nvmf_namespace {
	struct nvmf_softc *sc;
	uint64_t size;
	uint32_t id;
	u_int	flags;
	uint32_t lba_size;
	bool disconnected;

	TAILQ_HEAD(, bio) pending_bios;
	struct mtx lock;
	volatile u_int active_bios;

	struct cdev *cdev;
};

static void	nvmf_ns_strategy(struct bio *bio);

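/*
 * printf(9) wrapper that prefixes messages with the namespace's cdev
 * name.
 */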
static void
ns_printf(struct nvmf_namespace *ns, const char *fmt, ...)
{
	char buf[128];
	struct sbuf sb;
	va_list ap;

	sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
	sbuf_set_drain(&sb, sbuf_printf_drain, NULL);

	sbuf_printf(&sb, "%sn%u: ", device_get_nameunit(ns->sc->dev),
	    ns->id);

	va_start(ap, fmt);
	sbuf_vprintf(&sb, fmt, ap);
	va_end(ap);

	sbuf_finish(&sb);
	sbuf_delete(&sb);
}

/*
 * The I/O completion may trigger after the received CQE if the I/O
 * used a zero-copy mbuf that isn't harvested until after the NIC
 * driver processes TX completions.  Abuse bio_driver1 as a refcount.
 * Store I/O errors in bio_driver2.
 */
static __inline u_int *
bio_refs(struct bio *bio)
{
	return ((u_int *)&bio->bio_driver1);
}

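/*
 * Drop one reference to a bio; the final release either requeues or
 * resubmits an aborted bio, or completes it via biodone().
 */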
static void
nvmf_ns_biodone(struct bio *bio)
{
	struct nvmf_namespace *ns;
	int error;

	if (!refcount_release(bio_refs(bio)))
		return;

	ns = bio->bio_dev->si_drv1;

	/* If a request is aborted, resubmit or queue it for resubmission. */
	if (bio->bio_error == ECONNABORTED) {
		bio->bio_error = 0;
		bio->bio_driver2 = 0;
		mtx_lock(&ns->lock);
		if (ns->disconnected) {
			TAILQ_INSERT_TAIL(&ns->pending_bios, bio, bio_queue);
			mtx_unlock(&ns->lock);
		} else {
			mtx_unlock(&ns->lock);
			nvmf_ns_strategy(bio);
		}
	} else {
		/*
		 * I/O errors take precedence over generic EIO from
		 * CQE errors.
		 */
		error = (intptr_t)bio->bio_driver2;
		if (error != 0)
			bio->bio_error = error;
		if (bio->bio_error != 0)
			bio->bio_flags |= BIO_ERROR;
		biodone(bio);
	}

	if (refcount_release(&ns->active_bios))
		wakeup(ns);
}

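/* Data transfer completion for a read or write request. */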
static void
nvmf_ns_io_complete(void *arg, size_t xfered, int error)
{
	struct bio *bio = arg;

	KASSERT(xfered <= bio->bio_bcount,
	    ("%s: xfered > bio_bcount", __func__));

	bio->bio_driver2 = (void *)(intptr_t)error;
	bio->bio_resid = bio->bio_bcount - xfered;

	nvmf_ns_biodone(bio);
}

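/*
 * Data transfer completion for a BIO_DELETE request; frees the DSM
 * range allocated in nvmf_ns_submit_bio().
 */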
static void
nvmf_ns_delete_complete(void *arg, size_t xfered, int error)
{
	struct bio *bio = arg;

	if (error != 0)
		bio->bio_resid = bio->bio_bcount;
	else
		bio->bio_resid = 0;

	free(bio->bio_driver2, M_NVMF);
	bio->bio_driver2 = (void *)(intptr_t)error;

	nvmf_ns_biodone(bio);
}

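/* CQE completion handler used for all I/O requests. */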
static void
nvmf_ns_bio_complete(void *arg, const struct nvme_completion *cqe)
{
	struct bio *bio = arg;

	if (nvmf_cqe_aborted(cqe))
		bio->bio_error = ECONNABORTED;
	else if (cqe->status != 0)
		bio->bio_error = EIO;

	nvmf_ns_biodone(bio);
}

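/*
 * Translate a bio into an NVMe command and submit it on one of the
 * host's I/O queues.  Returns an errno value if the bio cannot be
 * submitted.
 */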
static int
nvmf_ns_submit_bio(struct nvmf_namespace *ns, struct bio *bio)
{
	struct nvme_command cmd;
	struct nvmf_request *req;
	struct nvme_dsm_range *dsm_range;
	struct memdesc mem;
	uint64_t lba, lba_count;

	dsm_range = NULL;
	memset(&cmd, 0, sizeof(cmd));
	switch (bio->bio_cmd) {
	case BIO_READ:
		lba = bio->bio_offset / ns->lba_size;
		lba_count = bio->bio_bcount / ns->lba_size;
		nvme_ns_read_cmd(&cmd, ns->id, lba, lba_count);
		break;
	case BIO_WRITE:
		lba = bio->bio_offset / ns->lba_size;
		lba_count = bio->bio_bcount / ns->lba_size;
		nvme_ns_write_cmd(&cmd, ns->id, lba, lba_count);
		break;
	case BIO_FLUSH:
		nvme_ns_flush_cmd(&cmd, ns->id);
		break;
	case BIO_DELETE:
		dsm_range = malloc(sizeof(*dsm_range), M_NVMF, M_NOWAIT |
		    M_ZERO);
		if (dsm_range == NULL)
			return (ENOMEM);
		lba = bio->bio_offset / ns->lba_size;
		lba_count = bio->bio_bcount / ns->lba_size;
		dsm_range->starting_lba = htole64(lba);
		dsm_range->length = htole32(lba_count);

		cmd.opc = NVME_OPC_DATASET_MANAGEMENT;
		cmd.nsid = htole32(ns->id);
		cmd.cdw10 = htole32(0);		/* 0's based count: 1 range */
		cmd.cdw11 = htole32(NVME_DSM_ATTR_DEALLOCATE);
		break;
	default:
		return (EOPNOTSUPP);
	}

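	/*
	 * While disconnected, queue the bio to be resubmitted by
	 * nvmf_reconnect_ns().
	 */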
	mtx_lock(&ns->lock);
	if (ns->disconnected) {
		TAILQ_INSERT_TAIL(&ns->pending_bios, bio, bio_queue);
		mtx_unlock(&ns->lock);
		free(dsm_range, M_NVMF);
		return (0);
	}

	req = nvmf_allocate_request(nvmf_select_io_queue(ns->sc), &cmd,
	    nvmf_ns_bio_complete, bio, M_NOWAIT);
	if (req == NULL) {
		mtx_unlock(&ns->lock);
		free(dsm_range, M_NVMF);
		return (ENOMEM);
	}

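	/*
	 * Reads, writes, and deletes have both a data transfer
	 * completion and a CQE completion, so they start with two
	 * bio references; a flush has only the CQE completion.
	 */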
	switch (bio->bio_cmd) {
	case BIO_READ:
	case BIO_WRITE:
		refcount_init(bio_refs(bio), 2);
		mem = memdesc_bio(bio);
		nvmf_capsule_append_data(req->nc, &mem, bio->bio_bcount,
		    bio->bio_cmd == BIO_WRITE, nvmf_ns_io_complete, bio);
		break;
	case BIO_DELETE:
		refcount_init(bio_refs(bio), 2);
		mem = memdesc_vaddr(dsm_range, sizeof(*dsm_range));
		nvmf_capsule_append_data(req->nc, &mem, sizeof(*dsm_range),
		    true, nvmf_ns_delete_complete, bio);
		bio->bio_driver2 = dsm_range;
		break;
	default:
		refcount_init(bio_refs(bio), 1);
		KASSERT(bio->bio_resid == 0,
		    ("%s: input bio_resid != 0", __func__));
		break;
	}

	refcount_acquire(&ns->active_bios);
	nvmf_submit_request(req);
	mtx_unlock(&ns->lock);
	return (0);
}

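/* Handle NVMe passthrough commands and standard disk ioctls. */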
static int
nvmf_ns_ioctl(struct cdev *dev, u_long cmd, caddr_t arg, int flag,
    struct thread *td)
{
	struct nvmf_namespace *ns = dev->si_drv1;
	struct nvme_get_nsid *gnsid;
	struct nvme_pt_command *pt;

	switch (cmd) {
	case NVME_PASSTHROUGH_CMD:
		pt = (struct nvme_pt_command *)arg;
		pt->cmd.nsid = htole32(ns->id);
		return (nvmf_passthrough_cmd(ns->sc, pt, false));
	case NVME_GET_NSID:
		gnsid = (struct nvme_get_nsid *)arg;
		strlcpy(gnsid->cdev, device_get_nameunit(ns->sc->dev),
		    sizeof(gnsid->cdev));
		gnsid->nsid = ns->id;
		return (0);
	case DIOCGMEDIASIZE:
		*(off_t *)arg = ns->size;
		return (0);
	case DIOCGSECTORSIZE:
		*(u_int *)arg = ns->lba_size;
		return (0);
	default:
		return (ENOTTY);
	}
}

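/* Opening the device for writing fails at securelevel > 0. */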
static int
nvmf_ns_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
{
	int error;

	error = 0;
	if ((oflags & FWRITE) != 0)
		error = securelevel_gt(td->td_ucred, 0);
	return (error);
}

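/* d_strategy entry point for all I/O on the namespace cdev. */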
static void
nvmf_ns_strategy(struct bio *bio)
{
	struct nvmf_namespace *ns;
	int error;

	ns = bio->bio_dev->si_drv1;

	error = nvmf_ns_submit_bio(ns, bio);
	if (error != 0) {
		bio->bio_error = error;
		bio->bio_flags |= BIO_ERROR;
		bio->bio_resid = bio->bio_bcount;
		biodone(bio);
	}
}

static struct cdevsw nvmf_ns_cdevsw = {
	.d_version = D_VERSION,
	.d_flags = D_DISK,
	.d_open = nvmf_ns_open,
	.d_read = physread,
	.d_write = physwrite,
	.d_strategy = nvmf_ns_strategy,
	.d_ioctl = nvmf_ns_ioctl
};

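/*
 * Validate the LBA format described by the Identify Namespace data
 * and create the namespace's character devices.  Returns NULL if the
 * namespace cannot be supported.
 */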
struct nvmf_namespace *
nvmf_init_ns(struct nvmf_softc *sc, uint32_t id,
    struct nvme_namespace_data *data)
{
	struct make_dev_args mda;
	struct nvmf_namespace *ns;
	int error;
	uint8_t lbads, lbaf;

	ns = malloc(sizeof(*ns), M_NVMF, M_WAITOK | M_ZERO);
	ns->sc = sc;
	ns->id = id;
	TAILQ_INIT(&ns->pending_bios);
	mtx_init(&ns->lock, "nvmf ns", NULL, MTX_DEF);

	/*
	 * Hold an initial reference so that active_bios cannot drop
	 * to zero until nvmf_destroy_ns().
	 */
	refcount_init(&ns->active_bios, 1);

	if (NVMEV(NVME_NS_DATA_DPS_PIT, data->dps) != 0) {
		ns_printf(ns, "End-to-end data protection not supported\n");
		goto fail;
	}

	lbaf = NVMEV(NVME_NS_DATA_FLBAS_FORMAT, data->flbas);
	if (lbaf > data->nlbaf) {
		ns_printf(ns, "Invalid LBA format index\n");
		goto fail;
	}

	if (NVMEV(NVME_NS_DATA_LBAF_MS, data->lbaf[lbaf]) != 0) {
		ns_printf(ns, "Namespaces with metadata are not supported\n");
		goto fail;
	}

	lbads = NVMEV(NVME_NS_DATA_LBAF_LBADS, data->lbaf[lbaf]);
	if (lbads == 0) {
		ns_printf(ns, "Invalid LBA format index\n");
		goto fail;
	}

	ns->lba_size = 1 << lbads;
	ns->size = data->nsze * ns->lba_size;

	if (nvme_ctrlr_has_dataset_mgmt(sc->cdata))
		ns->flags |= NVME_NS_DEALLOCATE_SUPPORTED;

	if (NVMEV(NVME_CTRLR_DATA_VWC_PRESENT, sc->cdata->vwc) != 0)
		ns->flags |= NVME_NS_FLUSH_SUPPORTED;

	/*
	 * XXX: Does any of the boundary splitting for NOIOB make any
	 * sense for Fabrics?
	 */

	make_dev_args_init(&mda);
	mda.mda_devsw = &nvmf_ns_cdevsw;
	mda.mda_uid = UID_ROOT;
	mda.mda_gid = GID_WHEEL;
	mda.mda_mode = 0600;
	mda.mda_si_drv1 = ns;
	error = make_dev_s(&mda, &ns->cdev, "%sn%u",
	    device_get_nameunit(sc->dev), id);
	if (error != 0)
		goto fail;
	ns->cdev->si_drv2 = make_dev_alias(ns->cdev, "%sns%u",
	    device_get_nameunit(sc->dev), id);

	ns->cdev->si_flags |= SI_UNMAPPED;

	return (ns);
fail:
	mtx_destroy(&ns->lock);
	free(ns, M_NVMF);
	return (NULL);
}

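/*
 * Mark the namespace disconnected; new and aborted bios are queued
 * on pending_bios until nvmf_reconnect_ns() is called.
 */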
void
nvmf_disconnect_ns(struct nvmf_namespace *ns)
{
	mtx_lock(&ns->lock);
	ns->disconnected = true;
	mtx_unlock(&ns->lock);
}

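/* Resubmit any bios that were queued while disconnected. */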
void
nvmf_reconnect_ns(struct nvmf_namespace *ns)
{
	TAILQ_HEAD(, bio) bios;
	struct bio *bio;

	mtx_lock(&ns->lock);
	ns->disconnected = false;
	TAILQ_INIT(&bios);
	TAILQ_CONCAT(&bios, &ns->pending_bios, bio_queue);
	mtx_unlock(&ns->lock);

	while (!TAILQ_EMPTY(&bios)) {
		bio = TAILQ_FIRST(&bios);
		TAILQ_REMOVE(&bios, bio, bio_queue);
		nvmf_ns_strategy(bio);
	}
}

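/*
 * Destroy the namespace's character devices, wait for in-flight I/O
 * to drain, and fail any bios still queued for resubmission.
 */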
void
nvmf_destroy_ns(struct nvmf_namespace *ns)
{
	TAILQ_HEAD(, bio) bios;
	struct bio *bio;

	if (ns->cdev->si_drv2 != NULL)
		destroy_dev(ns->cdev->si_drv2);
	destroy_dev(ns->cdev);

	/*
	 * Wait for active I/O requests to drain.  The release drops
	 * the initial reference taken when the namespace was created.
	 */
	mtx_lock(&ns->lock);
	if (!refcount_release(&ns->active_bios)) {
		while (ns->active_bios != 0)
			mtx_sleep(ns, &ns->lock, 0, "nvmfrmns", 0);
	}

	/* Abort any pending I/O requests. */
	TAILQ_INIT(&bios);
	TAILQ_CONCAT(&bios, &ns->pending_bios, bio_queue);
	mtx_unlock(&ns->lock);

	while (!TAILQ_EMPTY(&bios)) {
		bio = TAILQ_FIRST(&bios);
		TAILQ_REMOVE(&bios, bio, bio_queue);
		bio->bio_error = ECONNABORTED;
		bio->bio_flags |= BIO_ERROR;
		bio->bio_resid = bio->bio_bcount;
		biodone(bio);
	}

	mtx_destroy(&ns->lock);
	free(ns, M_NVMF);
}

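/*
 * Revalidate the LBA format and update the namespace size from fresh
 * Identify Namespace data.  Returns false if the namespace now uses
 * an unsupported format.
 */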
bool
nvmf_update_ns(struct nvmf_namespace *ns, struct nvme_namespace_data *data)
{
	uint8_t lbads, lbaf;

	if (NVMEV(NVME_NS_DATA_DPS_PIT, data->dps) != 0) {
		ns_printf(ns, "End-to-end data protection not supported\n");
		return (false);
	}

	lbaf = NVMEV(NVME_NS_DATA_FLBAS_FORMAT, data->flbas);
	if (lbaf > data->nlbaf) {
		ns_printf(ns, "Invalid LBA format index\n");
		return (false);
	}

	if (NVMEV(NVME_NS_DATA_LBAF_MS, data->lbaf[lbaf]) != 0) {
		ns_printf(ns, "Namespaces with metadata are not supported\n");
		return (false);
	}

	lbads = NVMEV(NVME_NS_DATA_LBAF_LBADS, data->lbaf[lbaf]);
	if (lbads == 0) {
		ns_printf(ns, "Invalid LBA format index\n");
		return (false);
	}

	ns->lba_size = 1 << lbads;
	ns->size = data->nsze * ns->lba_size;
	return (true);
}