/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2023-2024 Chelsio Communications, Inc.
 * Written by: John Baldwin <jhb@FreeBSD.org>
 */

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/disk.h>
#include <sys/fcntl.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memdesc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/refcount.h>
#include <sys/sbuf.h>
#include <machine/stdarg.h>
#include <dev/nvme/nvme.h>
#include <dev/nvmf/host/nvmf_var.h>

/*
 * Per-namespace state for an NVMe-over-Fabrics host.  One instance
 * backs each character device created in nvmf_init_ns().  I/O
 * submitted while the fabric association is down ("disconnected") is
 * parked on pending_bios and replayed by nvmf_reconnect_ns().
 */
struct nvmf_namespace {
	struct nvmf_softc *sc;		/* parent controller softc */
	uint64_t size;			/* namespace capacity in bytes */
	uint32_t id;			/* NSID on the remote controller */
	u_int	flags;			/* NVME_NS_*_SUPPORTED feature bits */
	uint32_t lba_size;		/* logical block size in bytes */
	bool disconnected;		/* true: queue new I/O, don't submit */

	TAILQ_HEAD(, bio) pending_bios;	/* bios parked while disconnected */
	struct mtx lock;		/* protects disconnected/pending_bios */
	volatile u_int active_bios;	/* in-flight bios + 1 "dummy" ref */

	struct cdev *cdev;		/* /dev node; si_drv1 points back here */
};

static void nvmf_ns_strategy(struct bio *bio);

/*
 * printf-style logging helper that prefixes each message with the
 * controller device name and namespace id ("<dev>n<id>: ").
 */
static void
ns_printf(struct nvmf_namespace *ns, const char *fmt, ...)
{
	char buf[128];
	struct sbuf sb;
	va_list ap;

	sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
	sbuf_set_drain(&sb, sbuf_printf_drain, NULL);

	sbuf_printf(&sb, "%sn%u: ", device_get_nameunit(ns->sc->dev),
	    ns->id);

	va_start(ap, fmt);
	sbuf_vprintf(&sb, fmt, ap);
	va_end(ap);

	sbuf_finish(&sb);
	sbuf_delete(&sb);
}

/*
 * The I/O completion may trigger after the received CQE if the I/O
 * used a zero-copy mbuf that isn't harvested until after the NIC
 * driver processes TX completions.  Abuse bio_driver1 as a refcount.
 * Store I/O errors in bio_driver2.
 */
static __inline u_int *
bio_refs(struct bio *bio)
{
	return ((u_int *)&bio->bio_driver1);
}

/*
 * Drop one reference on a bio (see bio_refs() above); the last
 * release either completes the bio or, for transport-aborted I/O
 * (ECONNABORTED), resubmits it immediately or parks it on
 * pending_bios for replay after reconnect.  Also drops this bio's
 * reference on ns->active_bios and wakes the destroy path when the
 * count reaches zero.
 */
static void
nvmf_ns_biodone(struct bio *bio)
{
	struct nvmf_namespace *ns;
	int error;

	if (!refcount_release(bio_refs(bio)))
		return;

	ns = bio->bio_dev->si_drv1;

	/* If a request is aborted, resubmit or queue it for resubmission. */
	if (bio->bio_error == ECONNABORTED) {
		/* Clear stale error state before the retry. */
		bio->bio_error = 0;
		bio->bio_driver2 = 0;
		mtx_lock(&ns->lock);
		if (ns->disconnected) {
			TAILQ_INSERT_TAIL(&ns->pending_bios, bio, bio_queue);
			mtx_unlock(&ns->lock);
		} else {
			mtx_unlock(&ns->lock);
			nvmf_ns_strategy(bio);
		}
	} else {
		/*
		 * I/O errors take precedence over generic EIO from
		 * CQE errors.
		 */
		error = (intptr_t)bio->bio_driver2;
		if (error != 0)
			bio->bio_error = error;
		if (bio->bio_error != 0)
			bio->bio_flags |= BIO_ERROR;
		biodone(bio);
	}

	/*
	 * NOTE(review): this wakeup() runs without ns->lock while
	 * nvmf_destroy_ns() checks active_bios under ns->lock before
	 * mtx_sleep(); confirm there is no lost-wakeup window here.
	 */
	if (refcount_release(&ns->active_bios))
		wakeup(ns);
}

/*
 * Data-transfer completion callback for READ/WRITE capsules: records
 * the transfer error (if any) in bio_driver2, computes bio_resid, and
 * drops this callback's bio reference.
 */
static void
nvmf_ns_io_complete(void *arg, size_t xfered, int error)
{
	struct bio *bio = arg;

	KASSERT(xfered <= bio->bio_bcount,
	    ("%s: xfered > bio_bcount", __func__));

	bio->bio_driver2 = (void *)(intptr_t)error;
	bio->bio_resid = bio->bio_bcount - xfered;

	nvmf_ns_biodone(bio);
}

/*
 * Data-transfer completion callback for BIO_DELETE (DSM) capsules:
 * frees the nvme_dsm_range payload stashed in bio_driver2, then
 * reuses bio_driver2 to carry the transfer error, and drops this
 * callback's bio reference.
 */
static void
nvmf_ns_delete_complete(void *arg, size_t xfered, int error)
{
	struct bio *bio = arg;

	if (error != 0)
		bio->bio_resid = bio->bio_bcount;
	else
		bio->bio_resid = 0;

	free(bio->bio_driver2, M_NVMF);
	bio->bio_driver2 = (void *)(intptr_t)error;

	nvmf_ns_biodone(bio);
}

/*
 * CQE completion callback: translates an aborted capsule into
 * ECONNABORTED (triggering requeue/resubmit in nvmf_ns_biodone) and
 * any other non-zero status into EIO, then drops the CQE's bio
 * reference.
 */
static void
nvmf_ns_bio_complete(void *arg, const struct nvme_completion *cqe)
{
	struct bio *bio = arg;

	if (nvmf_cqe_aborted(cqe))
		bio->bio_error = ECONNABORTED;
	else if (cqe->status != 0)
		bio->bio_error = EIO;

	nvmf_ns_biodone(bio);
}

/*
 * Build and submit the NVMe command for a bio.  While disconnected
 * the bio is parked on pending_bios instead (and reports success to
 * the caller; it completes later).  The bio refcount is initialized
 * to 2 for commands with a data-transfer callback (READ/WRITE/DELETE)
 * and 1 otherwise, matching the number of completion callbacks that
 * will call nvmf_ns_biodone().  Returns 0 or an errno; on error the
 * caller fails the bio.
 */
static int
nvmf_ns_submit_bio(struct nvmf_namespace *ns, struct bio *bio)
{
	struct nvme_command cmd;
	struct nvmf_request *req;
	struct nvme_dsm_range *dsm_range;
	struct memdesc mem;
	uint64_t lba, lba_count;

	dsm_range = NULL;
	memset(&cmd, 0, sizeof(cmd));
	switch (bio->bio_cmd) {
	case BIO_READ:
		lba = bio->bio_offset / ns->lba_size;
		lba_count = bio->bio_bcount / ns->lba_size;
		nvme_ns_read_cmd(&cmd, ns->id, lba, lba_count);
		break;
	case BIO_WRITE:
		lba = bio->bio_offset / ns->lba_size;
		lba_count = bio->bio_bcount / ns->lba_size;
		nvme_ns_write_cmd(&cmd, ns->id, lba, lba_count);
		break;
	case BIO_FLUSH:
		nvme_ns_flush_cmd(&cmd, ns->id);
		break;
	case BIO_DELETE:
		/* Single-range Dataset Management (deallocate) payload. */
		dsm_range = malloc(sizeof(*dsm_range), M_NVMF, M_NOWAIT |
		    M_ZERO);
		if (dsm_range == NULL)
			return (ENOMEM);
		lba = bio->bio_offset / ns->lba_size;
		lba_count = bio->bio_bcount / ns->lba_size;
		dsm_range->starting_lba = htole64(lba);
		dsm_range->length = htole32(lba_count);

		cmd.opc = NVME_OPC_DATASET_MANAGEMENT;
		cmd.nsid = htole32(ns->id);
		cmd.cdw10 = htole32(0);	/* 1 range */
		cmd.cdw11 = htole32(NVME_DSM_ATTR_DEALLOCATE);
		break;
	default:
		return (EOPNOTSUPP);
	}

	mtx_lock(&ns->lock);
	if (ns->disconnected) {
		/*
		 * Park the bio; the DSM payload is rebuilt when the bio
		 * is resubmitted, so it is safe to free here.
		 */
		TAILQ_INSERT_TAIL(&ns->pending_bios, bio, bio_queue);
		mtx_unlock(&ns->lock);
		free(dsm_range, M_NVMF);
		return (0);
	}

	req = nvmf_allocate_request(nvmf_select_io_queue(ns->sc), &cmd,
	    nvmf_ns_bio_complete, bio, M_NOWAIT);
	if (req == NULL) {
		mtx_unlock(&ns->lock);
		free(dsm_range, M_NVMF);
		return (ENOMEM);
	}

	switch (bio->bio_cmd) {
	case BIO_READ:
	case BIO_WRITE:
		/* Two refs: CQE callback + data-transfer callback. */
		refcount_init(bio_refs(bio), 2);
		mem = memdesc_bio(bio);
		nvmf_capsule_append_data(req->nc, &mem, bio->bio_bcount,
		    bio->bio_cmd == BIO_WRITE, nvmf_ns_io_complete, bio);
		break;
	case BIO_DELETE:
		/* Two refs: CQE callback + data-transfer callback. */
		refcount_init(bio_refs(bio), 2);
		mem = memdesc_vaddr(dsm_range, sizeof(*dsm_range));
		nvmf_capsule_append_data(req->nc, &mem, sizeof(*dsm_range),
		    true, nvmf_ns_delete_complete, bio);
		/* Ownership of dsm_range passes to the delete callback. */
		bio->bio_driver2 = dsm_range;
		break;
	default:
		/* No data transfer; only the CQE callback completes it. */
		refcount_init(bio_refs(bio), 1);
		KASSERT(bio->bio_resid == 0,
		    ("%s: input bio_resid != 0", __func__));
		break;
	}

	refcount_acquire(&ns->active_bios);
	nvmf_submit_request(req);
	mtx_unlock(&ns->lock);
	return (0);
}

/*
 * Character-device ioctl handler: NVMe passthrough (nsid forced to
 * this namespace), NSID query, and the two disk(4) geometry ioctls
 * needed by consumers such as gpart.
 */
static int
nvmf_ns_ioctl(struct cdev *dev, u_long cmd, caddr_t arg, int flag,
    struct thread *td)
{
	struct nvmf_namespace *ns = dev->si_drv1;
	struct nvme_get_nsid *gnsid;
	struct nvme_pt_command *pt;

	switch (cmd) {
	case NVME_PASSTHROUGH_CMD:
		pt = (struct nvme_pt_command *)arg;
		/* Force the command to target this namespace. */
		pt->cmd.nsid = htole32(ns->id);
		return (nvmf_passthrough_cmd(ns->sc, pt, false));
	case NVME_GET_NSID:
		gnsid = (struct nvme_get_nsid *)arg;
		strlcpy(gnsid->cdev, device_get_nameunit(ns->sc->dev),
		    sizeof(gnsid->cdev));
		gnsid->nsid = ns->id;
		return (0);
	case DIOCGMEDIASIZE:
		*(off_t *)arg = ns->size;
		return (0);
	case DIOCGSECTORSIZE:
		*(u_int *)arg = ns->lba_size;
		return (0);
	default:
		return (ENOTTY);
	}
}

/*
 * Open handler: opening for write is denied when the securelevel is
 * above 0.
 */
static int
nvmf_ns_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
{
	int error;

	error = 0;
	if ((oflags & FWRITE) != 0)
		error = securelevel_gt(td->td_ucred, 0);
	return (error);
}

/*
 * Strategy routine: submit the bio, failing it immediately with the
 * returned errno if submission is not possible.
 */
void
nvmf_ns_strategy(struct bio *bio)
{
	struct nvmf_namespace *ns;
	int error;

	ns = bio->bio_dev->si_drv1;

	error = nvmf_ns_submit_bio(ns, bio);
	if (error != 0) {
		bio->bio_error = error;
		bio->bio_flags |= BIO_ERROR;
		bio->bio_resid = bio->bio_bcount;
		biodone(bio);
	}
}

static struct cdevsw nvmf_ns_cdevsw = {
	.d_version = D_VERSION,
	.d_flags = D_DISK,
	.d_open = nvmf_ns_open,
	.d_read = physread,
	.d_write = physwrite,
	.d_strategy = nvmf_ns_strategy,
	.d_ioctl = nvmf_ns_ioctl
};

/*
 * Create a namespace from its Identify Namespace data: validate the
 * active LBA format (no end-to-end protection, no metadata, sane
 * LBADS), compute the size, record optional features (deallocate,
 * flush), and create the character device (plus a "ns" alias).
 * Returns the new namespace or NULL on failure.
 *
 * NOTE(review): data->nsze and the other identify fields are used
 * directly here — presumably the caller has already byte-swapped the
 * structure to host order; confirm against the caller.
 */
struct nvmf_namespace *
nvmf_init_ns(struct nvmf_softc *sc, uint32_t id,
    struct nvme_namespace_data *data)
{
	struct make_dev_args mda;
	struct nvmf_namespace *ns;
	int error;
	uint8_t lbads, lbaf;

	ns = malloc(sizeof(*ns), M_NVMF, M_WAITOK | M_ZERO);
	ns->sc = sc;
	ns->id = id;
	TAILQ_INIT(&ns->pending_bios);
	mtx_init(&ns->lock, "nvmf ns", NULL, MTX_DEF);

	/* One dummy bio avoids dropping to 0 until destroy. */
	refcount_init(&ns->active_bios, 1);

	if (NVMEV(NVME_NS_DATA_DPS_PIT, data->dps) != 0) {
		ns_printf(ns, "End-to-end data protection not supported\n");
		goto fail;
	}

	lbaf = NVMEV(NVME_NS_DATA_FLBAS_FORMAT, data->flbas);
	if (lbaf > data->nlbaf) {
		ns_printf(ns, "Invalid LBA format index\n");
		goto fail;
	}

	if (NVMEV(NVME_NS_DATA_LBAF_MS, data->lbaf[lbaf]) != 0) {
		ns_printf(ns, "Namespaces with metadata are not supported\n");
		goto fail;
	}

	lbads = NVMEV(NVME_NS_DATA_LBAF_LBADS, data->lbaf[lbaf]);
	if (lbads == 0) {
		ns_printf(ns, "Invalid LBA format index\n");
		goto fail;
	}

	/* LBADS is log2 of the LBA size. */
	ns->lba_size = 1 << lbads;
	ns->size = data->nsze * ns->lba_size;

	if (nvme_ctrlr_has_dataset_mgmt(sc->cdata))
		ns->flags |= NVME_NS_DEALLOCATE_SUPPORTED;

	if (NVMEV(NVME_CTRLR_DATA_VWC_PRESENT, sc->cdata->vwc) != 0)
		ns->flags |= NVME_NS_FLUSH_SUPPORTED;

	/*
	 * XXX: Does any of the boundary splitting for NOIOB make any
	 * sense for Fabrics?
	 */

	make_dev_args_init(&mda);
	mda.mda_devsw = &nvmf_ns_cdevsw;
	mda.mda_uid = UID_ROOT;
	mda.mda_gid = GID_WHEEL;
	mda.mda_mode = 0600;
	mda.mda_si_drv1 = ns;
	error = make_dev_s(&mda, &ns->cdev, "%sn%u",
	    device_get_nameunit(sc->dev), id);
	if (error != 0)
		goto fail;
	/* Alias may be NULL on failure; destroy checks for that. */
	ns->cdev->si_drv2 = make_dev_alias(ns->cdev, "%sns%u",
	    device_get_nameunit(sc->dev), id);

	ns->cdev->si_flags |= SI_UNMAPPED;

	return (ns);
fail:
	mtx_destroy(&ns->lock);
	free(ns, M_NVMF);
	return (NULL);
}

/*
 * Mark the namespace disconnected so new and retried I/O is parked on
 * pending_bios instead of being submitted.
 */
void
nvmf_disconnect_ns(struct nvmf_namespace *ns)
{
	mtx_lock(&ns->lock);
	ns->disconnected = true;
	mtx_unlock(&ns->lock);
}

/*
 * Clear the disconnected flag and resubmit every bio that was parked
 * while the association was down.  The pending list is drained onto a
 * local list first so resubmission runs without ns->lock held.
 */
void
nvmf_reconnect_ns(struct nvmf_namespace *ns)
{
	TAILQ_HEAD(, bio) bios;
	struct bio *bio;

	mtx_lock(&ns->lock);
	ns->disconnected = false;
	TAILQ_INIT(&bios);
	TAILQ_CONCAT(&bios, &ns->pending_bios, bio_queue);
	mtx_unlock(&ns->lock);

	while (!TAILQ_EMPTY(&bios)) {
		bio = TAILQ_FIRST(&bios);
		TAILQ_REMOVE(&bios, bio, bio_queue);
		nvmf_ns_strategy(bio);
	}
}

/*
 * Tear down a namespace: destroy the device nodes (stopping new I/O),
 * drain in-flight I/O, fail any parked bios with ECONNABORTED, and
 * free the namespace.
 */
void
nvmf_destroy_ns(struct nvmf_namespace *ns)
{
	TAILQ_HEAD(, bio) bios;
	struct bio *bio;

	if (ns->cdev->si_drv2 != NULL)
		destroy_dev(ns->cdev->si_drv2);
	destroy_dev(ns->cdev);

	/*
	 * Wait for active I/O requests to drain.  The release drops
	 * the reference on the "dummy bio" when the namespace is
	 * created.
	 */
	mtx_lock(&ns->lock);
	if (!refcount_release(&ns->active_bios)) {
		while (ns->active_bios != 0)
			mtx_sleep(ns, &ns->lock, 0, "nvmfrmns", 0);
	}

	/* Abort any pending I/O requests. */
	TAILQ_INIT(&bios);
	TAILQ_CONCAT(&bios, &ns->pending_bios, bio_queue);
	mtx_unlock(&ns->lock);

	while (!TAILQ_EMPTY(&bios)) {
		bio = TAILQ_FIRST(&bios);
		TAILQ_REMOVE(&bios, bio, bio_queue);
		bio->bio_error = ECONNABORTED;
		bio->bio_flags |= BIO_ERROR;
		bio->bio_resid = bio->bio_bcount;
		biodone(bio);
	}

	mtx_destroy(&ns->lock);
	free(ns, M_NVMF);
}

/*
 * Revalidate a namespace against fresh Identify Namespace data,
 * applying the same LBA-format checks as nvmf_init_ns() and updating
 * lba_size/size in place.  Returns false if the namespace is no
 * longer usable.
 */
bool
nvmf_update_ns(struct nvmf_namespace *ns, struct nvme_namespace_data *data)
{
	uint8_t lbads, lbaf;

	if (NVMEV(NVME_NS_DATA_DPS_PIT, data->dps) != 0) {
		ns_printf(ns, "End-to-end data protection not supported\n");
		return (false);
	}

	lbaf = NVMEV(NVME_NS_DATA_FLBAS_FORMAT, data->flbas);
	if (lbaf > data->nlbaf) {
		ns_printf(ns, "Invalid LBA format index\n");
		return (false);
	}

	if (NVMEV(NVME_NS_DATA_LBAF_MS, data->lbaf[lbaf]) != 0) {
		ns_printf(ns, "Namespaces with metadata are not supported\n");
		return (false);
	}

	lbads = NVMEV(NVME_NS_DATA_LBAF_LBADS, data->lbaf[lbaf]);
	if (lbads == 0) {
		ns_printf(ns, "Invalid LBA format index\n");
		return (false);
	}

	/* LBADS is log2 of the LBA size. */
	ns->lba_size = 1 << lbads;
	ns->size = data->nsze * ns->lba_size;
	return (true);
}