/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation. All rights reserved.
 *   Copyright (c) 2017, Western Digital Corporation or its affiliates.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef __NVME_INTERNAL_H__
#define __NVME_INTERNAL_H__

#include "nvme_common.h"
#include "nvme_pci.h"
#include "nvme_intel.h"
#include "nvme_mem.h"

#ifndef __HAIKU__
#include <pthread.h>
#include <sys/user.h> /* PAGE_SIZE */
#else
#include "nvme_platform.h"
#endif

/*
 * List functions.
 */
#define	LIST_FOREACH_SAFE(var, head, field, tvar)			\
	for ((var) = LIST_FIRST((head));				\
	     (var) && ((tvar) = LIST_NEXT((var), field), 1);		\
	     (var) = (tvar))

/*
 * Tail queue functions.
 */
#define	TAILQ_FOREACH_SAFE(var, head, field, tvar)			\
	for ((var) = TAILQ_FIRST((head));				\
	     (var) && ((tvar) = TAILQ_NEXT((var), field), 1);		\
	     (var) = (tvar))

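/*
 * Illustrative usage (a sketch, not compiled here): the _SAFE variants
 * allow the current element to be unlinked while iterating, e.g. when
 * moving trackers between the qpair lists declared later in this file.
 *
 *	struct nvme_tracker *tr, *tmp;
 *
 *	LIST_FOREACH_SAFE(tr, &qpair->outstanding_tr, list, tmp) {
 *		LIST_REMOVE(tr, list);
 *		LIST_INSERT_HEAD(&qpair->free_tr, tr, list);
 *	}
 */
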
#define INTEL_DC_P3X00_DEVID	0x09538086

#define NVME_TIMEOUT_INFINITE	UINT64_MAX

/*
 * Some Intel devices support a vendor-unique read latency log page even
 * though the log page directory says otherwise.
 */
#define NVME_INTEL_QUIRK_READ_LATENCY   0x1

/*
 * Some Intel devices support a vendor-unique write latency log page even
 * though the log page directory says otherwise.
 */
#define NVME_INTEL_QUIRK_WRITE_LATENCY  0x2

/*
 * Some controllers need a delay before starting to check device readiness,
 * which is done by reading the RDY bit of the controller status register
 * (CSTS).
 */
#define NVME_QUIRK_DELAY_BEFORE_CHK_RDY	0x4

/*
 * Some controllers need a delay once the controller status register RDY bit
 * switches from 0 to 1.
 */
#define NVME_QUIRK_DELAY_AFTER_RDY	0x8

/*
 * Queues may consist of a contiguous block of physical memory or,
 * optionally, a non-contiguous set of physical memory pages
 * (defined by a Physical Region Page (PRP) list).
 */
#define NVME_MAX_PRP_LIST_ENTRIES       (506)

/*
 * For commands requiring more than 2 PRP entries, one PRP entry is
 * embedded in the command (prp1) and the remaining entries are placed in
 * a list pointed to by the command (prp2).  This means that the real
 * maximum number of PRP entries supported is 506 + 1, which results in a
 * maximum transfer size of 506 * PAGE_SIZE.
 */
#define NVME_MAX_XFER_SIZE	(NVME_MAX_PRP_LIST_ENTRIES * PAGE_SIZE)

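/*
 * For reference (assuming the common case of 4 KiB pages; PAGE_SIZE is
 * platform-defined), NVME_MAX_XFER_SIZE works out to roughly:
 *
 *	506 * 4096 bytes = 2,072,576 bytes (a bit under 2 MiB)
 *
 * i.e. a single request can carry slightly less than 2 MiB of data before
 * the driver has to split it into child requests.
 */
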
#define NVME_ADMIN_TRACKERS	        (16)
#define NVME_ADMIN_ENTRIES	        (128)

/*
 * NVME_IO_ENTRIES defines the size of an I/O qpair's submission and completion
 * queues, while NVME_IO_TRACKERS defines the maximum number of I/O commands
 * allowed to be outstanding on an I/O qpair at any time. The only advantage of
 * having IO_ENTRIES > IO_TRACKERS is for debugging purposes: when dumping
 * the contents of the submission and completion queues, it will show a longer
 * history of data.
 */
#define NVME_IO_ENTRIES		        (1024U)
#define NVME_IO_TRACKERS	        (128U)
#define NVME_IO_ENTRIES_VS_TRACKERS_RATIO (NVME_IO_ENTRIES / NVME_IO_TRACKERS)

/*
 * NVME_MAX_SGL_DESCRIPTORS defines the maximum number of descriptors in one
 * SGL segment.
 */
#define NVME_MAX_SGL_DESCRIPTORS	(253)

/*
 * NVME_MAX_IO_ENTRIES is not defined, since it is specified by CAP.MQES
 * for each controller.
 */

#define NVME_MAX_ASYNC_EVENTS	        (8)

/*
 * NVME_MAX_IO_QUEUES in nvme_spec.h defines the 64K spec limit, but this
 * define specifies the maximum number of queues this driver will actually
 * try to configure, if available.
 */
#define DEFAULT_MAX_IO_QUEUES		(1024)

/*
 * Maximum number of times a failed command can be retried.
 */
#define NVME_MAX_RETRY_COUNT		(3)

/*
 * I/O queue type.
 */
enum nvme_io_queue_type {

	NVME_IO_QTYPE_INVALID = 0,
	NVME_IO_SUBMISSION_QUEUE,
	NVME_IO_COMPLETION_QUEUE,
};

enum nvme_payload_type {

	NVME_PAYLOAD_TYPE_INVALID = 0,

	/*
	 * nvme_request::payload.u.contig is valid for this request.
	 */
	NVME_PAYLOAD_TYPE_CONTIG,

	/*
	 * nvme_request::payload.u.sgl is valid for this request.
	 */
	NVME_PAYLOAD_TYPE_SGL,
};

/*
 * Controller support flags.
 */
enum nvme_ctrlr_flags {

	/*
	 * SGLs are supported.
	 */
	NVME_CTRLR_SGL_SUPPORTED = 0x1,

};

/*
 * Descriptor for a request data payload.
 *
 * This struct is arranged so that it fits nicely in struct nvme_request.
 */
struct __attribute__((packed)) nvme_payload {

	union {
		/*
		 * Virtual memory address of a single
		 * physically contiguous buffer.
		 */
		void *contig;

		/*
		 * Callback functions for retrieving physical
		 * addresses for scattered payloads.
		 */
		struct {
			nvme_req_reset_sgl_cb reset_sgl_fn;
			nvme_req_next_sge_cb next_sge_fn;
			void *cb_arg;
		} sgl;
	} u;

	/*
	 * Virtual memory address of a single physically
	 * contiguous metadata buffer.
	 */
	void *md;

	/*
	 * Payload type.
	 */
	uint8_t type;

};
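
/*
 * Illustrative sketch (not compiled here) of how a physically contiguous
 * payload would typically be described; nvme_request_allocate_contig(),
 * declared later in this file, builds such a payload internally:
 *
 *	struct nvme_payload payload;
 *
 *	payload.type = NVME_PAYLOAD_TYPE_CONTIG;
 *	payload.u.contig = buffer;	// virtually addressed data buffer
 *	payload.md = NULL;		// no separate metadata buffer
 */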

struct nvme_request {

	/*
	 * NVMe command: must be aligned on a 64-byte boundary.
	 */
	struct nvme_cmd		         cmd;

	/*
	 * Data payload for this request's command.
	 */
	struct nvme_payload              payload;

	uint8_t			         retries;

	/*
	 * Number of child requests still outstanding for a request
	 * that was split into multiple child requests.
	 */
	uint8_t			         child_reqs;
	uint32_t		         payload_size;

	/*
	 * Offset in bytes from the beginning of the payload for this request.
	 * This is used for I/O commands that are split into multiple requests.
	 */
	uint32_t	                 payload_offset;
	uint32_t		         md_offset;

	nvme_cmd_cb		         cb_fn;
	void			         *cb_arg;

	/*
	 * The following members should not be reordered with members
	 * above.  These members are only needed when splitting
	 * requests, which is done rarely, and the driver is careful
	 * not to touch them until a split operation is needed, to
	 * avoid touching an extra cacheline.
	 */

	/*
	 * Points to the outstanding child requests for a parent request.
	 * Only valid if the request was split into multiple child
	 * requests; not initialized for non-split requests.
	 */
	TAILQ_HEAD(, nvme_request)	 children;

	/*
	 * Linked-list pointers for a child request in its parent's list.
	 */
	TAILQ_ENTRY(nvme_request)	 child_tailq;

	/*
	 * For queueing in the qpair queued_req or free_req list.
	 */
	struct nvme_qpair		 *qpair;
	STAILQ_ENTRY(nvme_request)	 stailq;

	/*
	 * Points to the parent request if this request is part of a split
	 * request, NULL otherwise.
	 */
	struct nvme_request		 *parent;

	/*
	 * Completion status of a parent request.  Initialized to all 0's
	 * (SUCCESS) before child requests are submitted.  If a child
	 * request completes with an error, the error status is copied here,
	 * to ensure that the parent request is also completed with an error
	 * status once all child requests are completed.
	 */
	struct nvme_cpl		         parent_status;

} __attribute__((aligned(64)));

struct nvme_completion_poll_status {
	struct nvme_cpl		cpl;
	bool			done;
};
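
/*
 * Illustrative sketch (not compiled here) of the synchronous wait pattern
 * this struct supports, together with nvme_request_completion_poll_cb()
 * declared later in this file.  The completion-reaping call is shown as a
 * hypothetical poll_completions() since it is not declared in this header:
 *
 *	struct nvme_completion_poll_status status = { .done = false };
 *
 *	// Submit a command with nvme_request_completion_poll_cb as cb_fn
 *	// and &status as cb_arg, then spin until the callback fires.
 *	while (!status.done)
 *		poll_completions(qpair);
 *
 *	// status.cpl now holds the completion entry; inspect its status
 *	// fields to decide whether the command succeeded.
 */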

struct nvme_async_event_request {
	struct nvme_ctrlr	*ctrlr;
	struct nvme_request	*req;
	struct nvme_cpl		cpl;
};

struct nvme_tracker {

	LIST_ENTRY(nvme_tracker)	list;

	struct nvme_request		*req;
#if INTPTR_MAX == INT32_MAX
	int32_t __pad[3];
#elif !defined(INTPTR_MAX)
#	error Need definition of INTPTR_MAX!
#endif

	uint16_t			cid;

	uint16_t			rsvd1: 15;
	uint16_t			active: 1;

	uint32_t			rsvd2;

	uint64_t			prp_sgl_bus_addr;

	union {
		uint64_t			prp[NVME_MAX_PRP_LIST_ENTRIES];
		struct nvme_sgl_descriptor	sgl[NVME_MAX_SGL_DESCRIPTORS];
	} u;

	uint64_t			rsvd3;
};

/*
 * struct nvme_tracker must be exactly 4K so that the prp[] array does not
 * cross a page boundary and so that there is no padding required to meet
 * alignment requirements.
 */
nvme_static_assert(sizeof(struct nvme_tracker) == 4096,
		   "nvme_tracker is not 4K");
nvme_static_assert((offsetof(struct nvme_tracker, u.sgl) & 7) == 0,
		   "SGL must be Qword aligned");
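
/*
 * For reference, the 4 KiB size works out as follows (a sketch assuming an
 * LP64 ABI and the standard 16-byte NVMe SGL descriptor; the __pad field
 * above keeps the 32-bit layout identical):
 *
 *	list + req (+ __pad) + cid + flags + rsvd2 + prp_sgl_bus_addr =   40 bytes
 *	u: 506 * 8 = 253 * 16                                         = 4048 bytes
 *	rsvd3                                                         =    8 bytes
 *	total                                                         = 4096 bytes
 *
 * This is also why NVME_MAX_PRP_LIST_ENTRIES is 506 and
 * NVME_MAX_SGL_DESCRIPTORS is 253: both fill the union exactly.
 */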

struct nvme_qpair {
	/*
	 * Guards access to this structure.
	 */
	pthread_mutex_t				lock;

	volatile uint32_t	        *sq_tdbl;
	volatile uint32_t	        *cq_hdbl;

	/*
	 * Submission queue.
	 */
	struct nvme_cmd		        *cmd;

	/*
	 * Completion queue.
	 */
	struct nvme_cpl		        *cpl;

	LIST_HEAD(, nvme_tracker)	free_tr;
	LIST_HEAD(, nvme_tracker)	outstanding_tr;

	/*
	 * Number of trackers and array of trackers indexed by command ID.
	 */
	uint16_t			trackers;
	struct nvme_tracker		*tr;

	struct nvme_request		*reqs;
	unsigned int			num_reqs;
	STAILQ_HEAD(, nvme_request)	free_req;
	STAILQ_HEAD(, nvme_request)	queued_req;

	uint16_t			id;

	uint16_t			entries;
	uint16_t			sq_tail;
	uint16_t			cq_head;

	uint8_t				phase;

	bool				enabled;
	bool				sq_in_cmb;

	/*
	 * Fields below this point should not be touched on the
	 * normal I/O happy path.
	 */

	uint8_t				qprio;

	struct nvme_ctrlr		*ctrlr;

	/* List entry for nvme_ctrlr::free_io_qpairs and active_io_qpairs */
	TAILQ_ENTRY(nvme_qpair)		tailq;

	phys_addr_t			cmd_bus_addr;
	phys_addr_t			cpl_bus_addr;
};
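
/*
 * Illustrative sketch (not compiled here) of how the cq_head/phase fields
 * are typically used when reaping completions.  The completion entry field
 * names (status.p, cid) are assumed to follow the NVMe-spec layout from
 * nvme_spec.h; the actual logic lives in the qpair implementation:
 *
 *	struct nvme_cpl *cpl = &qpair->cpl[qpair->cq_head];
 *
 *	// A completion entry is new when its phase tag matches the
 *	// qpair's current expected phase.
 *	if (cpl->status.p == qpair->phase) {
 *		// ... complete the tracker qpair->tr[cpl->cid] ...
 *		if (++qpair->cq_head == qpair->entries) {
 *			qpair->cq_head = 0;
 *			qpair->phase = !qpair->phase;
 *		}
 *		nvme_mmio_write_4(qpair->cq_hdbl, qpair->cq_head);
 *	}
 */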

struct nvme_ns {

	struct nvme_ctrlr		*ctrlr;

	uint32_t			stripe_size;
	uint32_t			sector_size;

	uint32_t			md_size;
	uint32_t			pi_type;

	uint32_t			sectors_per_max_io;
	uint32_t			sectors_per_stripe;

	uint16_t			id;
	uint16_t			flags;

	int				open_count;

};
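
/*
 * A note on the derived fields above (an assumption about how the namespace
 * setup code computes them, in line with SPDK-derived drivers):
 *
 *	sectors_per_max_io = ctrlr->max_xfer_size / sector_size;
 *	sectors_per_stripe = stripe_size / sector_size;	// 0 if no striping
 *
 * I/O commands larger than sectors_per_max_io, or crossing a stripe
 * boundary, are split into child requests.
 */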

/*
 * State of struct nvme_ctrlr (in particular, during initialization).
 */
enum nvme_ctrlr_state {

	/*
	 * Controller has not been initialized yet.
	 */
	NVME_CTRLR_STATE_INIT = 0,

	/*
	 * Waiting for CSTS.RDY to transition from 0 to 1
	 * so that CC.EN may be set to 0.
	 */
	NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_1,

	/*
	 * Waiting for CSTS.RDY to transition from 1 to 0
	 * so that CC.EN may be set to 1.
	 */
	NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_0,

	/*
	 * Waiting for CSTS.RDY to transition from 0 to 1
	 * after enabling the controller.
	 */
	NVME_CTRLR_STATE_ENABLE_WAIT_FOR_READY_1,

	/*
	 * Controller initialization has completed and
	 * the controller is ready.
	 */
	NVME_CTRLR_STATE_READY
};
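
/*
 * The states above follow the controller enable sequence described in the
 * NVMe specification.  A rough sketch of the register-level flow they track
 * (pseudo-code only; the real state machine also honors the delay quirks
 * defined earlier in this file):
 *
 *	if (CC.EN == 1 && CSTS.RDY == 0)
 *		wait for CSTS.RDY == 1;		// DISABLE_WAIT_FOR_READY_1
 *	CC.EN = 0;
 *	wait for CSTS.RDY == 0;			// DISABLE_WAIT_FOR_READY_0
 *	CC.EN = 1;				// admin queue set up first
 *	wait for CSTS.RDY == 1;			// ENABLE_WAIT_FOR_READY_1
 *	// controller is now READY
 */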

/*
 * One of these per allocated PCI device.
 */
struct nvme_ctrlr {

	/*
	 * NVMe MMIO register space.
	 */
	volatile struct nvme_registers	*regs;

	/*
	 * Array of I/O queue pairs.
	 */
	struct nvme_qpair		*ioq;

	/*
	 * Size of the array of I/O queue pairs.
	 */
	unsigned int			io_queues;

	/*
	 * Maximum number of I/O queue pairs.
	 */
	unsigned int			max_io_queues;

	/*
	 * Number of I/O queue pairs enabled.
	 */
	unsigned int			enabled_io_qpairs;

	/*
	 * Maximum number of entries for I/O qpairs.
	 */
	unsigned int			io_qpairs_max_entries;

	/*
	 * Number of namespaces and array of namespaces.
	 */
	unsigned int			nr_ns;
	struct nvme_ns		        *ns;

	/*
	 * Controller state.
	 */
	bool				resetting;
	bool				failed;

	/*
	 * Controller support flags.
	 */
	uint64_t			flags;

	/*
	 * Cold data (not accessed in the normal I/O path) is after this point.
	 */
	enum nvme_ctrlr_state		state;
	uint64_t			state_timeout_ms;

	/*
	 * All the log pages supported.
	 */
	bool				log_page_supported[256];

	/*
	 * All the features supported.
	 */
	bool				feature_supported[256];

	/*
	 * Associated PCI device information.
	 */
	struct pci_device		*pci_dev;

	/*
	 * Maximum I/O transfer size in bytes.
	 */
	uint32_t			max_xfer_size;

	/*
	 * Minimum page size supported by this controller in bytes.
	 */
	uint32_t			min_page_size;

	/*
	 * Stride in uint32_t units between doorbell registers
	 * (1 = 4 bytes, 2 = 8 bytes, ...).
	 */
	uint32_t			doorbell_stride_u32;

	uint32_t			num_aers;
	struct nvme_async_event_request	aer[NVME_MAX_ASYNC_EVENTS];
	nvme_aer_cb		        aer_cb_fn;
	void				*aer_cb_arg;

	/*
	 * Admin queue pair.
	 */
	struct nvme_qpair		adminq;

	/*
	 * Guards access to the controller itself.
	 */
	pthread_mutex_t			lock;

	/*
	 * Identify Controller data.
	 */
	struct nvme_ctrlr_data		cdata;

	/*
	 * Array of Identify Namespace data.
	 * Stored separately from ns since nsdata should
	 * not normally be accessed during I/O.
	 */
	struct nvme_ns_data	        *nsdata;

	TAILQ_HEAD(, nvme_qpair)	free_io_qpairs;
	TAILQ_HEAD(, nvme_qpair)	active_io_qpairs;

	/*
	 * Controller options set on open.
	 */
	struct nvme_ctrlr_opts		opts;

	/*
	 * BAR mapping address of the controller memory buffer.
	 */
	void				*cmb_bar_virt_addr;

	/*
	 * BAR physical address of the controller memory buffer.
	 */
	uint64_t			cmb_bar_phys_addr;

	/*
	 * Controller memory buffer size in bytes.
	 */
	uint64_t			cmb_size;

	/*
	 * Current offset of the controller memory buffer.
	 */
	uint64_t			cmb_current_offset;

	/*
	 * Quirks flags.
	 */
	unsigned int			quirks;

	/*
	 * For the controller list.
	 */
	LIST_ENTRY(nvme_ctrlr)		link;

} __attribute__((aligned(PAGE_SIZE)));
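
/*
 * Illustrative sketch (not compiled here) of how doorbell_stride_u32 is
 * meant to be used when a queue pair is created.  Per the NVMe spec,
 * doorbell registers start at offset 0x1000 of the register space and are
 * laid out as (submission, completion) pairs per queue ID, spaced by the
 * stride reported in CAP.DSTRD:
 *
 *	volatile uint32_t *db =
 *		(volatile uint32_t *)((uintptr_t)ctrlr->regs + 0x1000);
 *
 *	qpair->sq_tdbl = db + (2 * qid + 0) * ctrlr->doorbell_stride_u32;
 *	qpair->cq_hdbl = db + (2 * qid + 1) * ctrlr->doorbell_stride_u32;
 */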

/*
 * Admin functions.
 */
extern int nvme_admin_identify_ctrlr(struct nvme_ctrlr *ctrlr,
				     struct nvme_ctrlr_data *cdata);

extern int nvme_admin_get_feature(struct nvme_ctrlr *ctrlr,
				  enum nvme_feat_sel sel,
				  enum nvme_feat feature,
				  uint32_t cdw11, uint32_t *attributes);

extern int nvme_admin_set_feature(struct nvme_ctrlr *ctrlr,
				  bool save,
				  enum nvme_feat feature,
				  uint32_t cdw11, uint32_t cdw12,
				  uint32_t *attributes);

extern int nvme_admin_format_nvm(struct nvme_ctrlr *ctrlr,
				 unsigned int nsid,
				 struct nvme_format *format);

extern int nvme_admin_get_log_page(struct nvme_ctrlr *ctrlr,
				   uint8_t log_page, uint32_t nsid,
				   void *payload, uint32_t payload_size);

extern int nvme_admin_abort_cmd(struct nvme_ctrlr *ctrlr,
				uint16_t cid, uint16_t sqid);

extern int nvme_admin_create_ioq(struct nvme_ctrlr *ctrlr,
				 struct nvme_qpair *io_que,
				 enum nvme_io_queue_type io_qtype);

extern int nvme_admin_delete_ioq(struct nvme_ctrlr *ctrlr,
				 struct nvme_qpair *qpair,
				 enum nvme_io_queue_type io_qtype);

extern int nvme_admin_identify_ns(struct nvme_ctrlr *ctrlr,
				  uint16_t nsid,
				  struct nvme_ns_data *nsdata);

extern int nvme_admin_attach_ns(struct nvme_ctrlr *ctrlr,
				uint32_t nsid,
				struct nvme_ctrlr_list *clist);

extern int nvme_admin_detach_ns(struct nvme_ctrlr *ctrlr,
				uint32_t nsid,
				struct nvme_ctrlr_list *clist);

extern int nvme_admin_create_ns(struct nvme_ctrlr *ctrlr,
				struct nvme_ns_data *nsdata,
				unsigned int *nsid);

extern int nvme_admin_delete_ns(struct nvme_ctrlr *ctrlr,
				unsigned int nsid);

extern int nvme_admin_fw_commit(struct nvme_ctrlr *ctrlr,
				const struct nvme_fw_commit *fw_commit);

extern int nvme_admin_fw_image_dl(struct nvme_ctrlr *ctrlr,
				  void *fw, uint32_t size, uint32_t offset);

extern void nvme_request_completion_poll_cb(void *arg,
					    const struct nvme_cpl *cpl);

extern struct nvme_ctrlr *nvme_ctrlr_attach(struct pci_device *pci_dev,
					    struct nvme_ctrlr_opts *opts);

extern void nvme_ctrlr_detach(struct nvme_ctrlr *ctrlr);

extern int nvme_qpair_construct(struct nvme_ctrlr *ctrlr,
				struct nvme_qpair *qpair, enum nvme_qprio qprio,
				uint16_t entries, uint16_t trackers);

extern void nvme_qpair_destroy(struct nvme_qpair *qpair);
extern void nvme_qpair_enable(struct nvme_qpair *qpair);
extern void nvme_qpair_disable(struct nvme_qpair *qpair);
extern int  nvme_qpair_submit_request(struct nvme_qpair *qpair,
				      struct nvme_request *req);
extern void nvme_qpair_reset(struct nvme_qpair *qpair);
extern void nvme_qpair_fail(struct nvme_qpair *qpair);

extern int nvme_request_pool_construct(struct nvme_qpair *qpair);

extern void nvme_request_pool_destroy(struct nvme_qpair *qpair);

extern struct nvme_request *nvme_request_allocate(struct nvme_qpair *qpair,
		      const struct nvme_payload *payload, uint32_t payload_size,
		      nvme_cmd_cb cb_fn, void *cb_arg);

extern struct nvme_request *nvme_request_allocate_null(struct nvme_qpair *qpair,
						       nvme_cmd_cb cb_fn,
						       void *cb_arg);

extern struct nvme_request *
nvme_request_allocate_contig(struct nvme_qpair *qpair,
			     void *buffer, uint32_t payload_size,
			     nvme_cmd_cb cb_fn, void *cb_arg);

extern void nvme_request_free(struct nvme_request *req);
extern void nvme_request_free_locked(struct nvme_request *req);

extern void nvme_request_add_child(struct nvme_request *parent,
				   struct nvme_request *child);

extern void nvme_request_remove_child(struct nvme_request *parent,
				      struct nvme_request *child);

extern unsigned int nvme_ctrlr_get_quirks(struct pci_device *pdev);

extern int nvme_ns_construct(struct nvme_ctrlr *ctrlr,
			     struct nvme_ns *ns, unsigned int id);

/*
 * Register MMIO access.
 */
#define nvme_reg_mmio_read_4(sc, reg)		\
	nvme_mmio_read_4((__u32 *)&(sc)->regs->reg)

#define nvme_reg_mmio_read_8(sc, reg)		\
	nvme_mmio_read_8((__u64 *)&(sc)->regs->reg)

#define nvme_reg_mmio_write_4(sc, reg, val)	\
	nvme_mmio_write_4((__u32 *)&(sc)->regs->reg, val)

#define nvme_reg_mmio_write_8(sc, reg, val)	\
	nvme_mmio_write_8((__u64 *)&(sc)->regs->reg, val)

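/*
 * Illustrative usage (a sketch, not compiled here).  The register member
 * names (cc, csts and their .raw/.bits accessors) are assumed to follow
 * the struct nvme_registers layout from nvme_spec.h:
 *
 *	union nvme_cc_register cc;
 *	union nvme_csts_register csts;
 *
 *	cc.raw = nvme_reg_mmio_read_4(ctrlr, cc.raw);
 *	cc.bits.en = 1;
 *	nvme_reg_mmio_write_4(ctrlr, cc.raw, cc.raw);
 *
 *	csts.raw = nvme_reg_mmio_read_4(ctrlr, csts.raw);
 *	if (csts.bits.rdy)
 *		;	// controller reports ready
 */
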
#endif /* __NVME_INTERNAL_H__ */