dev.c revision 319255
1/*
2 * Copyright (c) 2006-2014 Chelsio, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses.  You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 *     Redistribution and use in source and binary forms, with or
11 *     without modification, are permitted provided that the following
12 *     conditions are met:
13 *
14 *      - Redistributions of source code must retain the above
15 *        copyright notice, this list of conditions and the following
16 *        disclaimer.
17 *
18 *      - Redistributions in binary form must reproduce the above
19 *        copyright notice, this list of conditions and the following
20 *        disclaimer in the documentation and/or other materials
21 *        provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 */
32#if HAVE_CONFIG_H
33#  include <config.h>
34#endif				/* HAVE_CONFIG_H */
35
36#include <stdio.h>
37#include <stdlib.h>
38#include <unistd.h>
39#include <errno.h>
40#include <sys/mman.h>
41#include <pthread.h>
42#include <string.h>
43#include <signal.h>
44
45#include "libcxgb4.h"
46#include "cxgb4-abi.h"
47
48#define PCI_VENDOR_ID_CHELSIO		0x1425
49
/*
 * Macros needed to support the PCI Device ID Table ...
 *
 * They are defined before t4_pci_id_tbl.h is included below; presumably that
 * header expands them to emit the hca_table[] array that cxgb4_driver_init()
 * searches -- confirm against the header.
 */

/* Opens the definition of hca_table[], an array of {vendor, device} pairs. */
#define CH_PCI_DEVICE_ID_TABLE_DEFINE_BEGIN \
	struct { \
		unsigned vendor; \
		unsigned device; \
	} hca_table[] = {

/*
 * NOTE(review): looks like the PCI function number whose device IDs the
 * shared table should list -- confirm against t4_pci_id_tbl.h.
 */
#define CH_PCI_DEVICE_ID_FUNCTION \
		0x4

/* One table element: always the Chelsio vendor ID plus the given device ID. */
#define CH_PCI_ID_TABLE_ENTRY(__DeviceID) \
		{ \
			.vendor = PCI_VENDOR_ID_CHELSIO, \
			.device = (__DeviceID), \
		}

/* Closes the initializer opened by CH_PCI_DEVICE_ID_TABLE_DEFINE_BEGIN. */
#define CH_PCI_DEVICE_ID_TABLE_DEFINE_END \
	}
70
71#include "t4_chip_type.h"
72#include "t4_pci_id_tbl.h"
73
/*
 * System page geometry, filled in once by cxgb4_register_driver():
 * size comes from sysconf(_SC_PAGESIZE), shift is long_log2(size) and
 * mask is ~(size - 1).
 */
unsigned long c4iw_page_size;
unsigned long c4iw_page_shift;
unsigned long c4iw_page_mask;
/* Set by cxgb4_driver_init() from CXGB4_MA_WR: 1 if the variable parses to 1, else 0. */
int ma_wr;
/* Defaults to 1; cxgb4_driver_init() overrides it from T5_ENABLE_WC (1 or 0). */
int t5_en_wc = 1;

/* Every device claimed by cxgb4_driver_init() is inserted at the head of this list. */
SLIST_HEAD(devices_struct, c4iw_dev) devices;
81
/*
 * Template of context operations.  c4iw_alloc_context() copies this table
 * into each new context and then fills in the remaining entries
 * (async_event, post_send, post_recv, poll_cq) for recognised chip versions;
 * it also re-assigns req_notify_cq to the same handler listed here.
 */
static struct ibv_context_ops c4iw_ctx_ops = {
	/* Device and port queries. */
	.query_device = c4iw_query_device,
	.query_port = c4iw_query_port,
	/* Protection domains and memory regions. */
	.alloc_pd = c4iw_alloc_pd,
	.dealloc_pd = c4iw_free_pd,
	.reg_mr = c4iw_reg_mr,
	.dereg_mr = c4iw_dereg_mr,
	/* Completion queues. */
	.create_cq = c4iw_create_cq,
	.resize_cq = c4iw_resize_cq,
	.destroy_cq = c4iw_destroy_cq,
	/* Shared receive queues. */
	.create_srq = c4iw_create_srq,
	.modify_srq = c4iw_modify_srq,
	.destroy_srq = c4iw_destroy_srq,
	/* Queue pairs. */
	.create_qp = c4iw_create_qp,
	.modify_qp = c4iw_modify_qp,
	.destroy_qp = c4iw_destroy_qp,
	.query_qp = c4iw_query_qp,
	/* Address handles and multicast. */
	.create_ah = c4iw_create_ah,
	.destroy_ah = c4iw_destroy_ah,
	.attach_mcast = c4iw_attach_mcast,
	.detach_mcast = c4iw_detach_mcast,
	/* Data-path entries that are common to all chips. */
	.post_srq_recv = c4iw_post_srq_recv,
	.req_notify_cq = c4iw_arm_cq,
};
106
107static struct ibv_context *c4iw_alloc_context(struct ibv_device *ibdev,
108					      int cmd_fd)
109{
110	struct c4iw_context *context;
111	struct ibv_get_context cmd;
112	struct c4iw_alloc_ucontext_resp resp;
113	struct c4iw_dev *rhp = to_c4iw_dev(ibdev);
114	struct ibv_query_device qcmd;
115	uint64_t raw_fw_ver;
116	struct ibv_device_attr attr;
117
118	context = malloc(sizeof *context);
119	if (!context)
120		return NULL;
121
122	memset(context, 0, sizeof *context);
123	context->ibv_ctx.cmd_fd = cmd_fd;
124
125	resp.status_page_size = 0;
126	resp.reserved = 0;
127	if (ibv_cmd_get_context(&context->ibv_ctx, &cmd, sizeof cmd,
128				&resp.ibv_resp, sizeof resp))
129		goto err_free;
130
131	if (resp.reserved)
132		PDBG("%s c4iw_alloc_ucontext_resp reserved field modified by kernel\n",
133		     __FUNCTION__);
134
135	context->status_page_size = resp.status_page_size;
136	if (resp.status_page_size) {
137		context->status_page = mmap(NULL, resp.status_page_size,
138					    PROT_READ, MAP_SHARED, cmd_fd,
139					    resp.status_page_key);
140		if (context->status_page == MAP_FAILED)
141			goto err_free;
142	}
143
144	context->ibv_ctx.device = ibdev;
145	context->ibv_ctx.ops = c4iw_ctx_ops;
146
147	switch (rhp->chip_version) {
148	case CHELSIO_T6:
149	case CHELSIO_T5:
150	case CHELSIO_T4:
151		PDBG("%s T%d device\n", __FUNCTION__, rhp->chip_version);
152		context->ibv_ctx.ops.async_event = c4iw_async_event;
153		context->ibv_ctx.ops.post_send = c4iw_post_send;
154		context->ibv_ctx.ops.post_recv = c4iw_post_receive;
155		context->ibv_ctx.ops.poll_cq = c4iw_poll_cq;
156		context->ibv_ctx.ops.req_notify_cq = c4iw_arm_cq;
157		break;
158	default:
159		PDBG("%s unknown hca type %d\n", __FUNCTION__,
160		     rhp->chip_version);
161		goto err_unmap;
162		break;
163	}
164
165	if (!rhp->mmid2ptr) {
166		int ret;
167
168		ret = ibv_cmd_query_device(&context->ibv_ctx, &attr, &raw_fw_ver, &qcmd,
169					   sizeof qcmd);
170		if (ret)
171			goto err_unmap;
172		rhp->max_mr = attr.max_mr;
173		rhp->mmid2ptr = calloc(attr.max_mr, sizeof(void *));
174		if (!rhp->mmid2ptr) {
175			goto err_unmap;
176		}
177		rhp->max_qp = T4_QID_BASE + attr.max_cq;
178		rhp->qpid2ptr = calloc(T4_QID_BASE + attr.max_cq, sizeof(void *));
179		if (!rhp->qpid2ptr) {
180			goto err_unmap;
181		}
182		rhp->max_cq = T4_QID_BASE + attr.max_cq;
183		rhp->cqid2ptr = calloc(T4_QID_BASE + attr.max_cq, sizeof(void *));
184		if (!rhp->cqid2ptr)
185			goto err_unmap;
186	}
187
188	return &context->ibv_ctx;
189
190err_unmap:
191	munmap(context->status_page, context->status_page_size);
192err_free:
193	if (rhp->cqid2ptr)
194		free(rhp->cqid2ptr);
195	if (rhp->qpid2ptr)
196		free(rhp->cqid2ptr);
197	if (rhp->mmid2ptr)
198		free(rhp->cqid2ptr);
199	free(context);
200	return NULL;
201}
202
203static void c4iw_free_context(struct ibv_context *ibctx)
204{
205	struct c4iw_context *context = to_c4iw_context(ibctx);
206
207	if (context->status_page_size)
208		munmap(context->status_page, context->status_page_size);
209	free(context);
210}
211
/*
 * Device-level operations; cxgb4_driver_init() copies this table into every
 * device it claims so that contexts are created and destroyed by the two
 * functions above.
 */
static struct ibv_device_ops c4iw_dev_ops = {
	.alloc_context = c4iw_alloc_context,
	.free_context = c4iw_free_context
};
216
217#ifdef STALL_DETECTION
218
/*
 * Stall timeout taken from the CXGB4_STALL_TIMEOUT environment variable in
 * cxgb4_driver_init(); reset to 0 there when the value is negative or the
 * errno check fails.  Units are not visible in this file.
 */
int stall_to;
220
221static void dump_cq(struct c4iw_cq *chp)
222{
223	int i;
224
225	fprintf(stderr,
226 		"CQ: %p id %u queue %p cidx 0x%08x sw_queue %p sw_cidx %d sw_pidx %d sw_in_use %d depth %u error %u gen %d "
227		"cidx_inc %d bits_type_ts %016" PRIx64 " notempty %d\n", chp,
228                chp->cq.cqid, chp->cq.queue, chp->cq.cidx,
229	 	chp->cq.sw_queue, chp->cq.sw_cidx, chp->cq.sw_pidx, chp->cq.sw_in_use,
230                chp->cq.size, chp->cq.error, chp->cq.gen, chp->cq.cidx_inc, be64_to_cpu(chp->cq.bits_type_ts),
231		t4_cq_notempty(&chp->cq) || (chp->iq ? t4_iq_notempty(chp->iq) : 0));
232
233	for (i=0; i < chp->cq.size; i++) {
234		u64 *p = (u64 *)(chp->cq.queue + i);
235
236		fprintf(stderr, "%02x: %016" PRIx64 " %016" PRIx64, i, be64_to_cpu(p[0]), be64_to_cpu(p[1]));
237		if (i == chp->cq.cidx)
238			fprintf(stderr, " <-- cidx\n");
239		else
240			fprintf(stderr, "\n");
241		p+= 2;
242		fprintf(stderr, "%02x: %016" PRIx64 " %016" PRIx64 "\n", i, be64_to_cpu(p[0]), be64_to_cpu(p[1]));
243		p+= 2;
244		fprintf(stderr, "%02x: %016" PRIx64 " %016" PRIx64 "\n", i, be64_to_cpu(p[0]), be64_to_cpu(p[1]));
245		p+= 2;
246		fprintf(stderr, "%02x: %016" PRIx64 " %016" PRIx64 "\n", i, be64_to_cpu(p[0]), be64_to_cpu(p[1]));
247		p+= 2;
248	}
249}
250
251static void dump_qp(struct c4iw_qp *qhp)
252{
253	int i;
254	int j;
255	struct t4_swsqe *swsqe;
256	struct t4_swrqe *swrqe;
257	u16 cidx, pidx;
258	u64 *p;
259
260	fprintf(stderr,
261		"QP: %p id %u error %d flushed %d qid_mask 0x%x\n"
262		"    SQ: id %u queue %p sw_queue %p cidx %u pidx %u in_use %u wq_pidx %u depth %u flags 0x%x flush_cidx %d\n"
263		"    RQ: id %u queue %p sw_queue %p cidx %u pidx %u in_use %u depth %u\n",
264		qhp,
265		qhp->wq.sq.qid,
266		qhp->wq.error,
267		qhp->wq.flushed,
268		qhp->wq.qid_mask,
269		qhp->wq.sq.qid,
270		qhp->wq.sq.queue,
271		qhp->wq.sq.sw_sq,
272		qhp->wq.sq.cidx,
273		qhp->wq.sq.pidx,
274		qhp->wq.sq.in_use,
275		qhp->wq.sq.wq_pidx,
276		qhp->wq.sq.size,
277		qhp->wq.sq.flags,
278		qhp->wq.sq.flush_cidx,
279		qhp->wq.rq.qid,
280		qhp->wq.rq.queue,
281		qhp->wq.rq.sw_rq,
282		qhp->wq.rq.cidx,
283		qhp->wq.rq.pidx,
284		qhp->wq.rq.in_use,
285		qhp->wq.rq.size);
286	cidx = qhp->wq.sq.cidx;
287	pidx = qhp->wq.sq.pidx;
288	if (cidx != pidx)
289		fprintf(stderr, "SQ: \n");
290	while (cidx != pidx) {
291		swsqe = &qhp->wq.sq.sw_sq[cidx];
292		fprintf(stderr, "%04u: wr_id %016" PRIx64
293			" sq_wptr %08x read_len %u opcode 0x%x "
294			"complete %u signaled %u cqe %016" PRIx64 " %016" PRIx64 " %016" PRIx64 " %016" PRIx64 "\n",
295			cidx,
296			swsqe->wr_id,
297			swsqe->idx,
298			swsqe->read_len,
299			swsqe->opcode,
300			swsqe->complete,
301			swsqe->signaled,
302			cpu_to_be64(swsqe->cqe.u.flits[0]),
303			cpu_to_be64(swsqe->cqe.u.flits[1]),
304			cpu_to_be64((u64)swsqe->cqe.reserved),
305			cpu_to_be64(swsqe->cqe.bits_type_ts));
306		if (++cidx == qhp->wq.sq.size)
307			cidx = 0;
308	}
309
310	fprintf(stderr, "SQ WQ: \n");
311	p = (u64 *)qhp->wq.sq.queue;
312	for (i=0; i < qhp->wq.sq.size * T4_SQ_NUM_SLOTS; i++) {
313		for (j=0; j < T4_EQ_ENTRY_SIZE / 16; j++) {
314			fprintf(stderr, "%04u %016" PRIx64 " %016" PRIx64 " ",
315				i, ntohll(p[0]), ntohll(p[1]));
316			if (j == 0 && i == qhp->wq.sq.wq_pidx)
317				fprintf(stderr, " <-- pidx");
318			fprintf(stderr, "\n");
319			p += 2;
320		}
321	}
322	cidx = qhp->wq.rq.cidx;
323	pidx = qhp->wq.rq.pidx;
324	if (cidx != pidx)
325		fprintf(stderr, "RQ: \n");
326	while (cidx != pidx) {
327		swrqe = &qhp->wq.rq.sw_rq[cidx];
328		fprintf(stderr, "%04u: wr_id %016" PRIx64 "\n",
329			cidx,
330			swrqe->wr_id );
331		if (++cidx == qhp->wq.rq.size)
332			cidx = 0;
333	}
334
335	fprintf(stderr, "RQ WQ: \n");
336	p = (u64 *)qhp->wq.rq.queue;
337	for (i=0; i < qhp->wq.rq.size * T4_RQ_NUM_SLOTS; i++) {
338		for (j=0; j < T4_EQ_ENTRY_SIZE / 16; j++) {
339			fprintf(stderr, "%04u %016" PRIx64 " %016" PRIx64 " ",
340				i, ntohll(p[0]), ntohll(p[1]));
341			if (j == 0 && i == qhp->wq.rq.pidx)
342				fprintf(stderr, " <-- pidx");
343			if (j == 0 && i == qhp->wq.rq.cidx)
344				fprintf(stderr, " <-- cidx");
345			fprintf(stderr, "\n");
346			p+=2;
347		}
348	}
349}
350
351void dump_state()
352{
353	struct c4iw_dev *dev;
354	int i;
355
356	fprintf(stderr, "STALL DETECTED:\n");
357	SLIST_FOREACH(dev, &devices, list) {
358		//pthread_spin_lock(&dev->lock);
359		fprintf(stderr, "Device %s\n", dev->ibv_dev.name);
360		for (i=0; i < dev->max_cq; i++) {
361			if (dev->cqid2ptr[i]) {
362				struct c4iw_cq *chp = dev->cqid2ptr[i];
363				//pthread_spin_lock(&chp->lock);
364				dump_cq(chp);
365				//pthread_spin_unlock(&chp->lock);
366			}
367		}
368		for (i=0; i < dev->max_qp; i++) {
369			if (dev->qpid2ptr[i]) {
370				struct c4iw_qp *qhp = dev->qpid2ptr[i];
371				//pthread_spin_lock(&qhp->lock);
372				dump_qp(qhp);
373				//pthread_spin_unlock(&qhp->lock);
374			}
375		}
376		//pthread_spin_unlock(&dev->lock);
377	}
378	fprintf(stderr, "DUMP COMPLETE:\n");
379	fflush(stderr);
380}
381#endif /* end of STALL_DETECTION */
382
/*
 * c4iw_abi_version is used to store ABI for iw_cxgb4 so the user mode library
 * can know if the driver supports the kernel mode db ringing.
 *
 * It starts at 1 and is overwritten with the abi_version argument each time
 * cxgb4_driver_init() matches a device.
 */
int c4iw_abi_version = 1;
388
389static struct ibv_device *cxgb4_driver_init(const char *uverbs_sys_path,
390					    int abi_version)
391{
392	char devstr[IBV_SYSFS_PATH_MAX], ibdev[16], value[128], *cp;
393	char dev_str[IBV_SYSFS_PATH_MAX];
394	struct c4iw_dev *dev;
395	unsigned vendor, device, fw_maj, fw_min;
396	int i;
397	char devnum;
398        char ib_param[16];
399
400#ifndef __linux__
401	if (ibv_read_sysfs_file(uverbs_sys_path, "ibdev",
402				ibdev, sizeof ibdev) < 0)
403		return NULL;
404
405	if (ibdev[0] == 't' && ibdev[1] >= '4' && ibdev[1] <= '6' &&
406	    strstr(&ibdev[2], "nex") && (devnum = atoi(&ibdev[5])) >= 0) {
407		snprintf(dev_str, sizeof(dev_str), "/dev/t%cnex/%d", ibdev[1],
408		    devnum);
409	} else
410		return NULL;
411
412	if (ibv_read_sysfs_file(dev_str, "\%pnpinfo", value, sizeof value) < 0)
413		return NULL;
414	else {
415		if (strstr(value,"vendor=")) {
416			strncpy(ib_param, strstr(value,"vendor=")+strlen("vendor="),6);
417			sscanf(ib_param,"%i",&vendor);
418		}
419
420		if (strstr(value,"device=")) {
421			strncpy(ib_param, strstr(value,"device=")+strlen("device="),6);
422			sscanf(ib_param,"%i",&device);
423		}
424	}
425#else
426	if (ibv_read_sysfs_file(uverbs_sys_path, "device/vendor",
427				value, sizeof value) < 0)
428		return NULL;
429	sscanf(value, "%i", &vendor);
430
431	if (ibv_read_sysfs_file(uverbs_sys_path, "device/device",
432				value, sizeof value) < 0)
433		return NULL;
434	sscanf(value, "%i", &device);
435#endif
436
437	for (i = 0; i < sizeof hca_table / sizeof hca_table[0]; ++i)
438		if (vendor == hca_table[i].vendor &&
439		    device == hca_table[i].device)
440			goto found;
441
442	return NULL;
443
444found:
445	c4iw_abi_version = abi_version;
446
447
448#ifndef __linux__
449	if (ibv_read_sysfs_file(dev_str, "firmware_version",
450				value, sizeof value) < 0)
451		return NULL;
452#else
453	/*
454	 * Verify that the firmware major number matches.  Major number
455	 * mismatches are fatal.  Minor number mismatches are tolerated.
456	 */
457	if (ibv_read_sysfs_file(uverbs_sys_path, "ibdev",
458				ibdev, sizeof ibdev) < 0)
459		return NULL;
460
461	memset(devstr, 0, sizeof devstr);
462	snprintf(devstr, sizeof devstr, "%s/class/infiniband/%s",
463		 ibv_get_sysfs_path(), ibdev);
464	if (ibv_read_sysfs_file(devstr, "fw_ver", value, sizeof value) < 0)
465		return NULL;
466#endif
467
468	cp = strtok(value+1, ".");
469	sscanf(cp, "%i", &fw_maj);
470	cp = strtok(NULL, ".");
471	sscanf(cp, "%i", &fw_min);
472
473	if (fw_maj < FW_MAJ) {
474		fprintf(stderr, "libcxgb4: Fatal firmware version mismatch.  "
475			"Firmware major number is %u and libcxgb4 needs %u.\n",
476			fw_maj, FW_MAJ);
477		fflush(stderr);
478		return NULL;
479	}
480
481	DBGLOG("libcxgb4");
482
483	if (fw_min < FW_MIN) {
484		PDBG("libcxgb4: non-fatal firmware version mismatch.  "
485			"Firmware minor number is %u and libcxgb4 needs %u.\n",
486			fw_maj, FW_MAJ);
487		fflush(stderr);
488	}
489
490	PDBG("%s found vendor %d device %d type %d\n",
491		__FUNCTION__, vendor, device,
492		CHELSIO_PCI_ID_CHIP_VERSION(hca_table[i].device));
493
494	dev = calloc(1, sizeof *dev);
495	if (!dev) {
496		return NULL;
497	}
498
499	pthread_spin_init(&dev->lock, PTHREAD_PROCESS_PRIVATE);
500	dev->ibv_dev.ops = c4iw_dev_ops;
501	dev->chip_version = CHELSIO_PCI_ID_CHIP_VERSION(hca_table[i].device);
502	dev->abi_version = abi_version;
503
504	PDBG("%s device claimed\n", __FUNCTION__);
505	SLIST_INSERT_HEAD(&devices, dev, list);
506#ifdef STALL_DETECTION
507{
508	char *c = getenv("CXGB4_STALL_TIMEOUT");
509	if (c) {
510		stall_to = strtol(c, NULL, 0);
511		if (errno || stall_to < 0)
512			stall_to = 0;
513	}
514}
515#endif
516{
517	char *c = getenv("CXGB4_MA_WR");
518	if (c) {
519		ma_wr = strtol(c, NULL, 0);
520		if (ma_wr != 1)
521			ma_wr = 0;
522	}
523}
524{
525	char *c = getenv("T5_ENABLE_WC");
526	if (c) {
527		t5_en_wc = strtol(c, NULL, 0);
528		if (t5_en_wc != 1)
529			t5_en_wc = 0;
530	}
531}
532
533	return &dev->ibv_dev;
534}
535
536static __attribute__((constructor)) void cxgb4_register_driver(void)
537{
538	c4iw_page_size = sysconf(_SC_PAGESIZE);
539	c4iw_page_shift = long_log2(c4iw_page_size);
540	c4iw_page_mask = ~(c4iw_page_size - 1);
541	ibv_register_driver("cxgb4", cxgb4_driver_init);
542}
543
544#ifdef STATS
545void __attribute__ ((destructor)) cs_fini(void);
546void  __attribute__ ((destructor)) cs_fini(void)
547{
548	syslog(LOG_NOTICE, "cxgb4 stats - sends %lu recv %lu read %lu "
549	       "write %lu arm %lu cqe %lu mr %lu qp %lu cq %lu\n",
550	       c4iw_stats.send, c4iw_stats.recv, c4iw_stats.read,
551	       c4iw_stats.write, c4iw_stats.arm, c4iw_stats.cqe,
552	       c4iw_stats.mr, c4iw_stats.qp, c4iw_stats.cq);
553}
554#endif
555