krping.c revision 297655
1/*
2 * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
3 * Copyright (c) 2006-2009 Open Grid Computing, Inc. All rights reserved.
4 *
5 * This software is available to you under a choice of one of two
6 * licenses.  You may choose to be licensed under the terms of the GNU
7 * General Public License (GPL) Version 2, available from the file
8 * COPYING in the main directory of this source tree, or the
9 * OpenIB.org BSD license below:
10 *
11 *     Redistribution and use in source and binary forms, with or
12 *     without modification, are permitted provided that the following
13 *     conditions are met:
14 *
15 *      - Redistributions of source code must retain the above
16 *        copyright notice, this list of conditions and the following
17 *        disclaimer.
18 *
19 *      - Redistributions in binary form must reproduce the above
20 *        copyright notice, this list of conditions and the following
21 *        disclaimer in the documentation and/or other materials
22 *        provided with the distribution.
23 *
24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31 * SOFTWARE.
32 */
33
34#include <sys/cdefs.h>
35__FBSDID("$FreeBSD: stable/10/sys/contrib/rdma/krping/krping.c 297655 2016-04-07 07:41:06Z hselasky $");
36
37#include <linux/module.h>
38#include <linux/moduleparam.h>
39#include <linux/slab.h>
40#include <linux/err.h>
41#include <linux/string.h>
42#include <linux/list.h>
43#include <linux/in.h>
44#include <linux/device.h>
45#include <linux/pci.h>
46#include <linux/sched.h>
47
48#include <asm/atomic.h>
49
50#include <rdma/ib_verbs.h>
51#include <rdma/rdma_cm.h>
52
53#include "krping.h"
54#include "getopt.h"
55
56extern int krping_debug;
57#define DEBUG_LOG(cb, x...) if (krping_debug) krping_printf((cb)->cookie, x)
58#define PRINTF(cb, x...) krping_printf((cb)->cookie, x)
59
60MODULE_AUTHOR("Steve Wise");
61MODULE_DESCRIPTION("RDMA ping client/server");
62MODULE_LICENSE("Dual BSD/GPL");
63
/*
 * Sample the CPU time-stamp counter using the x86 "rdtsc" instruction.
 * The instruction delivers the low half in EAX and the high half in
 * EDX; the two halves are combined into one 64-bit value.
 */
static __inline uint64_t
get_cycles(void)
{
	uint32_t lo, hi;

	__asm __volatile("rdtsc" : "=a" (lo), "=d" (hi));
	return (((uint64_t)hi << 32) | lo);
}

/* Type used to hold values returned by get_cycles(). */
typedef uint64_t cycles_t;
73
/*
 * Memory registration mode used for the test buffers (cb->mem).
 * Numbering starts at 1, so a zeroed value matches none of the modes.
 */
enum mem_type {
	DMA = 1,	/* single DMA MR obtained with ib_get_dma_mr() */
	FASTREG,	/* 2: fast-register MR plus page list */
	MW,		/* 3: memory window bound over a physical MR */
	MR		/* 4: physical MRs from ib_reg_phys_mr() */
};
80
81static const struct krping_option krping_opts[] = {
82	{"count", OPT_INT, 'C'},
83	{"size", OPT_INT, 'S'},
84	{"addr", OPT_STRING, 'a'},
85	{"port", OPT_INT, 'p'},
86	{"verbose", OPT_NOPARAM, 'v'},
87	{"validate", OPT_NOPARAM, 'V'},
88	{"server", OPT_NOPARAM, 's'},
89	{"client", OPT_NOPARAM, 'c'},
90	{"mem_mode", OPT_STRING, 'm'},
91	{"server_inv", OPT_NOPARAM, 'I'},
92 	{"wlat", OPT_NOPARAM, 'l'},
93 	{"rlat", OPT_NOPARAM, 'L'},
94 	{"bw", OPT_NOPARAM, 'B'},
95 	{"duplex", OPT_NOPARAM, 'd'},
96 	{"txdepth", OPT_INT, 'T'},
97 	{"poll", OPT_NOPARAM, 'P'},
98 	{"local_dma_lkey", OPT_NOPARAM, 'Z'},
99 	{"read_inv", OPT_NOPARAM, 'R'},
100 	{"fr", OPT_NOPARAM, 'f'},
101	{NULL, 0, 0}
102};
103
104#define htonll(x) cpu_to_be64((x))
105#define ntohll(x) cpu_to_be64((x))
106
107static struct mutex krping_mutex;
108
109/*
110 * List of running krping threads.
111 */
112static LIST_HEAD(krping_cbs);
113
/*
 * krping "ping/pong" loop:
 * 	client sends source rkey/addr/len
 *	server receives source rkey/addr/len
 *	server rdma reads "ping" data from source
 * 	server sends "go ahead" on rdma read completion
 *	client sends sink rkey/addr/len
 * 	server receives sink rkey/addr/len
 * 	server rdma writes "pong" data to sink
 * 	server sends "go ahead" on rdma write completion
 * 	<repeat loop>
 */
126
/*
 * Test state, stored in cb->state.  The CM and CQ callbacks update it
 * and wake cb->sem; the client/server thread sleeps until the state
 * reaches the value it is waiting for.
 *
 * The numeric order is significant: waiters in this file compare with
 * ">=" and then check for the exact state, so ERROR must stay the
 * largest value in order to terminate every wait.
 */
enum test_state {
	IDLE = 1,
	/* connection establishment */
	CONNECT_REQUEST,
	ADDR_RESOLVED,
	ROUTE_RESOLVED,
	CONNECTED,
	/* per-ping data transfer states */
	RDMA_READ_ADV,
	RDMA_READ_COMPLETE,
	RDMA_WRITE_ADV,
	RDMA_WRITE_COMPLETE,
	/* terminal failure */
	ERROR
};
146
/*
 * Buffer advertisement carried in the send/recv messages.
 * krping_format_send() fills it in network byte order and
 * server_recv() converts it back.
 */
struct krping_rdma_info {
	uint64_t buf;	/* address of the advertised buffer */
	uint32_t rkey;	/* rkey to use for that buffer */
	uint32_t size;	/* length of the buffer in bytes */
};
152
/*
 * Default max buffer size for IO...
 *
 * The expression is parenthesized so the macro expands as a single
 * value inside larger expressions (e.g. "x / RPING_BUFSIZE").
 */
#define RPING_BUFSIZE (128 * 1024)
#define RPING_SQ_DEPTH 64
158
159/*
160 * Control block struct.
161 */
162struct krping_cb {
163	void *cookie;
164	int server;			/* 0 iff client */
165	struct ib_cq *cq;
166	struct ib_pd *pd;
167	struct ib_qp *qp;
168
169	enum mem_type mem;
170	struct ib_mr *dma_mr;
171
172	struct ib_fast_reg_page_list *page_list;
173	int page_list_len;
174	struct ib_send_wr fastreg_wr;
175	struct ib_send_wr invalidate_wr;
176	struct ib_mr *fastreg_mr;
177	int server_invalidate;
178	int read_inv;
179	u8 key;
180
181	struct ib_mw *mw;
182	struct ib_mw_bind bind_attr;
183
184	struct ib_recv_wr rq_wr;	/* recv work request record */
185	struct ib_sge recv_sgl;		/* recv single SGE */
186	struct krping_rdma_info recv_buf;/* malloc'd buffer */
187	u64 recv_dma_addr;
188	DECLARE_PCI_UNMAP_ADDR(recv_mapping)
189	struct ib_mr *recv_mr;
190
191	struct ib_send_wr sq_wr;	/* send work requrest record */
192	struct ib_sge send_sgl;
193	struct krping_rdma_info send_buf;/* single send buf */
194	u64 send_dma_addr;
195	DECLARE_PCI_UNMAP_ADDR(send_mapping)
196	struct ib_mr *send_mr;
197
198	struct ib_send_wr rdma_sq_wr;	/* rdma work request record */
199	struct ib_sge rdma_sgl;		/* rdma single SGE */
200	char *rdma_buf;			/* used as rdma sink */
201	u64  rdma_dma_addr;
202	DECLARE_PCI_UNMAP_ADDR(rdma_mapping)
203	struct ib_mr *rdma_mr;
204
205	uint32_t remote_rkey;		/* remote guys RKEY */
206	uint64_t remote_addr;		/* remote guys TO */
207	uint32_t remote_len;		/* remote guys LEN */
208
209	char *start_buf;		/* rdma read src */
210	u64  start_dma_addr;
211	DECLARE_PCI_UNMAP_ADDR(start_mapping)
212	struct ib_mr *start_mr;
213
214	enum test_state state;		/* used for cond/signalling */
215	wait_queue_head_t sem;
216	struct krping_stats stats;
217
218	uint16_t port;			/* dst port in NBO */
219	struct in_addr addr;		/* dst addr in NBO */
220	char *addr_str;			/* dst addr string */
221	int verbose;			/* verbose logging */
222	int count;			/* ping count */
223	int size;			/* ping data size */
224	int validate;			/* validate ping data */
225	int wlat;			/* run wlat test */
226	int rlat;			/* run rlat test */
227	int bw;				/* run bw test */
228	int duplex;			/* run bw full duplex test */
229	int poll;			/* poll or block for rlat test */
230	int txdepth;			/* SQ depth */
231	int local_dma_lkey;		/* use 0 for lkey */
232	int frtest;			/* fastreg test */
233
234	/* CM stuff */
235	struct rdma_cm_id *cm_id;	/* connection on client side,*/
236					/* listener on server side. */
237	struct rdma_cm_id *child_cm_id;	/* connection on server side */
238	struct list_head list;
239};
240
241static int krping_cma_event_handler(struct rdma_cm_id *cma_id,
242				   struct rdma_cm_event *event)
243{
244	int ret;
245	struct krping_cb *cb = cma_id->context;
246
247	DEBUG_LOG(cb, "cma_event type %d cma_id %p (%s)\n", event->event,
248	    cma_id, (cma_id == cb->cm_id) ? "parent" : "child");
249
250	switch (event->event) {
251	case RDMA_CM_EVENT_ADDR_RESOLVED:
252		cb->state = ADDR_RESOLVED;
253		ret = rdma_resolve_route(cma_id, 2000);
254		if (ret) {
255			PRINTF(cb, "rdma_resolve_route error %d\n", ret);
256			wake_up_interruptible(&cb->sem);
257		}
258		break;
259
260	case RDMA_CM_EVENT_ROUTE_RESOLVED:
261		cb->state = ROUTE_RESOLVED;
262		cb->child_cm_id = cma_id;
263		wake_up_interruptible(&cb->sem);
264		break;
265
266	case RDMA_CM_EVENT_CONNECT_REQUEST:
267		cb->state = CONNECT_REQUEST;
268		cb->child_cm_id = cma_id;
269		DEBUG_LOG(cb, "child cma %p\n", cb->child_cm_id);
270		wake_up_interruptible(&cb->sem);
271		break;
272
273	case RDMA_CM_EVENT_ESTABLISHED:
274		DEBUG_LOG(cb, "ESTABLISHED\n");
275		if (!cb->server) {
276			cb->state = CONNECTED;
277		}
278		wake_up_interruptible(&cb->sem);
279		break;
280
281	case RDMA_CM_EVENT_ADDR_ERROR:
282	case RDMA_CM_EVENT_ROUTE_ERROR:
283	case RDMA_CM_EVENT_CONNECT_ERROR:
284	case RDMA_CM_EVENT_UNREACHABLE:
285	case RDMA_CM_EVENT_REJECTED:
286		PRINTF(cb, "cma event %d, error %d\n", event->event,
287		       event->status);
288		cb->state = ERROR;
289		wake_up_interruptible(&cb->sem);
290		break;
291
292	case RDMA_CM_EVENT_DISCONNECTED:
293		PRINTF(cb, "DISCONNECT EVENT...\n");
294		cb->state = ERROR;
295		wake_up_interruptible(&cb->sem);
296		break;
297
298	case RDMA_CM_EVENT_DEVICE_REMOVAL:
299		PRINTF(cb, "cma detected device removal!!!!\n");
300		break;
301
302	default:
303		PRINTF(cb, "oof bad type!\n");
304		wake_up_interruptible(&cb->sem);
305		break;
306	}
307	return 0;
308}
309
310static int server_recv(struct krping_cb *cb, struct ib_wc *wc)
311{
312	if (wc->byte_len != sizeof(cb->recv_buf)) {
313		PRINTF(cb, "Received bogus data, size %d\n",
314		       wc->byte_len);
315		return -1;
316	}
317
318	cb->remote_rkey = ntohl(cb->recv_buf.rkey);
319	cb->remote_addr = ntohll(cb->recv_buf.buf);
320	cb->remote_len  = ntohl(cb->recv_buf.size);
321	DEBUG_LOG(cb, "Received rkey %x addr %llx len %d from peer\n",
322		  cb->remote_rkey, (unsigned long long)cb->remote_addr,
323		  cb->remote_len);
324
325	if (cb->state <= CONNECTED || cb->state == RDMA_WRITE_COMPLETE)
326		cb->state = RDMA_READ_ADV;
327	else
328		cb->state = RDMA_WRITE_ADV;
329
330	return 0;
331}
332
333static int client_recv(struct krping_cb *cb, struct ib_wc *wc)
334{
335	if (wc->byte_len != sizeof(cb->recv_buf)) {
336		PRINTF(cb, "Received bogus data, size %d\n",
337		       wc->byte_len);
338		return -1;
339	}
340
341	if (cb->state == RDMA_READ_ADV)
342		cb->state = RDMA_WRITE_ADV;
343	else
344		cb->state = RDMA_WRITE_COMPLETE;
345
346	return 0;
347}
348
349static void krping_cq_event_handler(struct ib_cq *cq, void *ctx)
350{
351	struct krping_cb *cb = ctx;
352	struct ib_wc wc;
353	struct ib_recv_wr *bad_wr;
354	int ret;
355
356	BUG_ON(cb->cq != cq);
357	if (cb->state == ERROR) {
358		PRINTF(cb, "cq completion in ERROR state\n");
359		return;
360	}
361	if (cb->frtest) {
362		PRINTF(cb, "cq completion event in frtest!\n");
363		return;
364	}
365	if (!cb->wlat && !cb->rlat && !cb->bw)
366		ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
367	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 1) {
368		if (wc.status) {
369			if (wc.status == IB_WC_WR_FLUSH_ERR) {
370				DEBUG_LOG(cb, "cq flushed\n");
371				continue;
372			} else {
373				PRINTF(cb, "cq completion failed with "
374				       "wr_id %Lx status %d opcode %d vender_err %x\n",
375					wc.wr_id, wc.status, wc.opcode, wc.vendor_err);
376				goto error;
377			}
378		}
379
380		switch (wc.opcode) {
381		case IB_WC_SEND:
382			DEBUG_LOG(cb, "send completion\n");
383			cb->stats.send_bytes += cb->send_sgl.length;
384			cb->stats.send_msgs++;
385			break;
386
387		case IB_WC_RDMA_WRITE:
388			DEBUG_LOG(cb, "rdma write completion\n");
389			cb->stats.write_bytes += cb->rdma_sq_wr.sg_list->length;
390			cb->stats.write_msgs++;
391			cb->state = RDMA_WRITE_COMPLETE;
392			wake_up_interruptible(&cb->sem);
393			break;
394
395		case IB_WC_RDMA_READ:
396			DEBUG_LOG(cb, "rdma read completion\n");
397			cb->stats.read_bytes += cb->rdma_sq_wr.sg_list->length;
398			cb->stats.read_msgs++;
399			cb->state = RDMA_READ_COMPLETE;
400			wake_up_interruptible(&cb->sem);
401			break;
402
403		case IB_WC_RECV:
404			DEBUG_LOG(cb, "recv completion\n");
405			cb->stats.recv_bytes += sizeof(cb->recv_buf);
406			cb->stats.recv_msgs++;
407			if (cb->wlat || cb->rlat || cb->bw)
408				ret = server_recv(cb, &wc);
409			else
410				ret = cb->server ? server_recv(cb, &wc) :
411						   client_recv(cb, &wc);
412			if (ret) {
413				PRINTF(cb, "recv wc error: %d\n", ret);
414				goto error;
415			}
416
417			ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
418			if (ret) {
419				PRINTF(cb, "post recv error: %d\n",
420				       ret);
421				goto error;
422			}
423			wake_up_interruptible(&cb->sem);
424			break;
425
426		default:
427			PRINTF(cb,
428			       "%s:%d Unexpected opcode %d, Shutting down\n",
429			       __func__, __LINE__, wc.opcode);
430			goto error;
431		}
432	}
433	if (ret) {
434		PRINTF(cb, "poll error %d\n", ret);
435		goto error;
436	}
437	return;
438error:
439	cb->state = ERROR;
440	wake_up_interruptible(&cb->sem);
441}
442
443static int krping_accept(struct krping_cb *cb)
444{
445	struct rdma_conn_param conn_param;
446	int ret;
447
448	DEBUG_LOG(cb, "accepting client connection request\n");
449
450	memset(&conn_param, 0, sizeof conn_param);
451	conn_param.responder_resources = 1;
452	conn_param.initiator_depth = 1;
453
454	ret = rdma_accept(cb->child_cm_id, &conn_param);
455	if (ret) {
456		PRINTF(cb, "rdma_accept error: %d\n", ret);
457		return ret;
458	}
459
460	if (!cb->wlat && !cb->rlat && !cb->bw) {
461		wait_event_interruptible(cb->sem, cb->state >= CONNECTED);
462		if (cb->state == ERROR) {
463			PRINTF(cb, "wait for CONNECTED state %d\n",
464				cb->state);
465			return -1;
466		}
467	}
468	return 0;
469}
470
471static void krping_setup_wr(struct krping_cb *cb)
472{
473	cb->recv_sgl.addr = cb->recv_dma_addr;
474	cb->recv_sgl.length = sizeof cb->recv_buf;
475	if (cb->local_dma_lkey)
476		cb->recv_sgl.lkey = cb->qp->device->local_dma_lkey;
477	else if (cb->mem == DMA)
478		cb->recv_sgl.lkey = cb->dma_mr->lkey;
479	else
480		cb->recv_sgl.lkey = cb->recv_mr->lkey;
481	cb->rq_wr.sg_list = &cb->recv_sgl;
482	cb->rq_wr.num_sge = 1;
483
484	cb->send_sgl.addr = cb->send_dma_addr;
485	cb->send_sgl.length = sizeof cb->send_buf;
486	if (cb->local_dma_lkey)
487		cb->send_sgl.lkey = cb->qp->device->local_dma_lkey;
488	else if (cb->mem == DMA)
489		cb->send_sgl.lkey = cb->dma_mr->lkey;
490	else
491		cb->send_sgl.lkey = cb->send_mr->lkey;
492
493	cb->sq_wr.opcode = IB_WR_SEND;
494	cb->sq_wr.send_flags = IB_SEND_SIGNALED;
495	cb->sq_wr.sg_list = &cb->send_sgl;
496	cb->sq_wr.num_sge = 1;
497
498	if (cb->server || cb->wlat || cb->rlat || cb->bw) {
499		cb->rdma_sgl.addr = cb->rdma_dma_addr;
500		if (cb->mem == MR)
501			cb->rdma_sgl.lkey = cb->rdma_mr->lkey;
502		cb->rdma_sq_wr.send_flags = IB_SEND_SIGNALED;
503		cb->rdma_sq_wr.sg_list = &cb->rdma_sgl;
504		cb->rdma_sq_wr.num_sge = 1;
505	}
506
507	switch(cb->mem) {
508	case FASTREG:
509
510		/*
511		 * A chain of 2 WRs, INVALDATE_MR + FAST_REG_MR.
512		 * both unsignaled.  The client uses them to reregister
513		 * the rdma buffers with a new key each iteration.
514		 */
515		cb->fastreg_wr.opcode = IB_WR_FAST_REG_MR;
516		cb->fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
517		cb->fastreg_wr.wr.fast_reg.length = cb->size;
518		cb->fastreg_wr.wr.fast_reg.page_list = cb->page_list;
519		cb->fastreg_wr.wr.fast_reg.page_list_len = cb->page_list_len;
520
521		cb->invalidate_wr.next = &cb->fastreg_wr;
522		cb->invalidate_wr.opcode = IB_WR_LOCAL_INV;
523		break;
524	case MW:
525		cb->bind_attr.wr_id = 0xabbaabba;
526		cb->bind_attr.send_flags = 0; /* unsignaled */
527		cb->bind_attr.length = cb->size;
528		break;
529	default:
530		break;
531	}
532}
533
534static int krping_setup_buffers(struct krping_cb *cb)
535{
536	int ret;
537	struct ib_phys_buf buf;
538	u64 iovbase;
539
540	DEBUG_LOG(cb, "krping_setup_buffers called on cb %p\n", cb);
541
542	cb->recv_dma_addr = dma_map_single(cb->pd->device->dma_device,
543				   &cb->recv_buf,
544				   sizeof(cb->recv_buf), DMA_BIDIRECTIONAL);
545	pci_unmap_addr_set(cb, recv_mapping, cb->recv_dma_addr);
546	cb->send_dma_addr = dma_map_single(cb->pd->device->dma_device,
547					   &cb->send_buf, sizeof(cb->send_buf),
548					   DMA_BIDIRECTIONAL);
549	pci_unmap_addr_set(cb, send_mapping, cb->send_dma_addr);
550
551	if (cb->mem == DMA) {
552		cb->dma_mr = ib_get_dma_mr(cb->pd, IB_ACCESS_LOCAL_WRITE|
553					   IB_ACCESS_REMOTE_READ|
554				           IB_ACCESS_REMOTE_WRITE);
555		if (IS_ERR(cb->dma_mr)) {
556			DEBUG_LOG(cb, "reg_dmamr failed\n");
557			ret = PTR_ERR(cb->dma_mr);
558			goto bail;
559		}
560	} else {
561		if (!cb->local_dma_lkey) {
562			buf.addr = cb->recv_dma_addr;
563			buf.size = sizeof cb->recv_buf;
564			DEBUG_LOG(cb, "recv buf dma_addr %llx size %d\n", buf.addr,
565				(int)buf.size);
566			iovbase = cb->recv_dma_addr;
567			cb->recv_mr = ib_reg_phys_mr(cb->pd, &buf, 1,
568						     IB_ACCESS_LOCAL_WRITE,
569						     &iovbase);
570
571			if (IS_ERR(cb->recv_mr)) {
572				DEBUG_LOG(cb, "recv_buf reg_mr failed\n");
573				ret = PTR_ERR(cb->recv_mr);
574				goto bail;
575			}
576
577			buf.addr = cb->send_dma_addr;
578			buf.size = sizeof cb->send_buf;
579			DEBUG_LOG(cb, "send buf dma_addr %llx size %d\n", buf.addr,
580				(int)buf.size);
581			iovbase = cb->send_dma_addr;
582			cb->send_mr = ib_reg_phys_mr(cb->pd, &buf, 1,
583						     0, &iovbase);
584
585			if (IS_ERR(cb->send_mr)) {
586				DEBUG_LOG(cb, "send_buf reg_mr failed\n");
587				ret = PTR_ERR(cb->send_mr);
588				goto bail;
589			}
590		}
591	}
592
593	cb->rdma_buf = kmalloc(cb->size, GFP_KERNEL);
594	if (!cb->rdma_buf) {
595		DEBUG_LOG(cb, "rdma_buf malloc failed\n");
596		ret = -ENOMEM;
597		goto bail;
598	}
599
600	cb->rdma_dma_addr = dma_map_single(cb->pd->device->dma_device,
601			       cb->rdma_buf, cb->size,
602			       DMA_BIDIRECTIONAL);
603	pci_unmap_addr_set(cb, rdma_mapping, cb->rdma_dma_addr);
604	if (cb->mem != DMA) {
605		switch (cb->mem) {
606		case FASTREG:
607			cb->page_list_len = (((cb->size - 1) & PAGE_MASK) +
608				PAGE_SIZE) >> PAGE_SHIFT;
609			cb->page_list = ib_alloc_fast_reg_page_list(
610						cb->pd->device,
611						cb->page_list_len);
612			if (IS_ERR(cb->page_list)) {
613				DEBUG_LOG(cb, "recv_buf reg_mr failed\n");
614				ret = PTR_ERR(cb->page_list);
615				goto bail;
616			}
617			cb->fastreg_mr = ib_alloc_fast_reg_mr(cb->pd,
618					cb->page_list->max_page_list_len);
619			if (IS_ERR(cb->fastreg_mr)) {
620				DEBUG_LOG(cb, "recv_buf reg_mr failed\n");
621				ret = PTR_ERR(cb->fastreg_mr);
622				goto bail;
623			}
624			DEBUG_LOG(cb, "fastreg rkey 0x%x page_list %p"
625				" page_list_len %u\n", cb->fastreg_mr->rkey,
626				cb->page_list, cb->page_list_len);
627			break;
628		case MW:
629			cb->mw = ib_alloc_mw(cb->pd);
630			if (IS_ERR(cb->mw)) {
631				DEBUG_LOG(cb, "recv_buf alloc_mw failed\n");
632				ret = PTR_ERR(cb->mw);
633				goto bail;
634			}
635			DEBUG_LOG(cb, "mw rkey 0x%x\n", cb->mw->rkey);
636			/*FALLTHROUGH*/
637		case MR:
638			buf.addr = cb->rdma_dma_addr;
639			buf.size = cb->size;
640			iovbase = cb->rdma_dma_addr;
641			cb->rdma_mr = ib_reg_phys_mr(cb->pd, &buf, 1,
642					     IB_ACCESS_REMOTE_READ|
643					     IB_ACCESS_REMOTE_WRITE,
644					     &iovbase);
645			if (IS_ERR(cb->rdma_mr)) {
646				DEBUG_LOG(cb, "rdma_buf reg_mr failed\n");
647				ret = PTR_ERR(cb->rdma_mr);
648				goto bail;
649			}
650			DEBUG_LOG(cb, "rdma buf dma_addr %llx size %d mr rkey 0x%x\n",
651				buf.addr, (int)buf.size, cb->rdma_mr->rkey);
652			break;
653		default:
654			ret = -EINVAL;
655			goto bail;
656			break;
657		}
658	}
659
660	if (!cb->server || cb->wlat || cb->rlat || cb->bw) {
661
662		cb->start_buf = kmalloc(cb->size, GFP_KERNEL);
663		if (!cb->start_buf) {
664			DEBUG_LOG(cb, "start_buf malloc failed\n");
665			ret = -ENOMEM;
666			goto bail;
667		}
668
669		cb->start_dma_addr = dma_map_single(cb->pd->device->dma_device,
670						   cb->start_buf, cb->size,
671						   DMA_BIDIRECTIONAL);
672		pci_unmap_addr_set(cb, start_mapping, cb->start_dma_addr);
673
674		if (cb->mem == MR || cb->mem == MW) {
675			unsigned flags = IB_ACCESS_REMOTE_READ;
676
677			if (cb->wlat || cb->rlat || cb->bw)
678				flags |= IB_ACCESS_REMOTE_WRITE;
679
680			buf.addr = cb->start_dma_addr;
681			buf.size = cb->size;
682			DEBUG_LOG(cb, "start buf dma_addr %llx size %d\n",
683				buf.addr, (int)buf.size);
684			iovbase = cb->start_dma_addr;
685			cb->start_mr = ib_reg_phys_mr(cb->pd, &buf, 1,
686					     flags,
687					     &iovbase);
688
689			if (IS_ERR(cb->start_mr)) {
690				DEBUG_LOG(cb, "start_buf reg_mr failed\n");
691				ret = PTR_ERR(cb->start_mr);
692				goto bail;
693			}
694		}
695	}
696
697	krping_setup_wr(cb);
698	DEBUG_LOG(cb, "allocated & registered buffers...\n");
699	return 0;
700bail:
701	if (cb->fastreg_mr && !IS_ERR(cb->fastreg_mr))
702		ib_dereg_mr(cb->fastreg_mr);
703	if (cb->mw && !IS_ERR(cb->mw))
704		ib_dealloc_mw(cb->mw);
705	if (cb->rdma_mr && !IS_ERR(cb->rdma_mr))
706		ib_dereg_mr(cb->rdma_mr);
707	if (cb->page_list && !IS_ERR(cb->page_list))
708		ib_free_fast_reg_page_list(cb->page_list);
709	if (cb->dma_mr && !IS_ERR(cb->dma_mr))
710		ib_dereg_mr(cb->dma_mr);
711	if (cb->recv_mr && !IS_ERR(cb->recv_mr))
712		ib_dereg_mr(cb->recv_mr);
713	if (cb->send_mr && !IS_ERR(cb->send_mr))
714		ib_dereg_mr(cb->send_mr);
715	if (cb->rdma_buf)
716		kfree(cb->rdma_buf);
717	if (cb->start_buf)
718		kfree(cb->start_buf);
719	return ret;
720}
721
722static void krping_free_buffers(struct krping_cb *cb)
723{
724	DEBUG_LOG(cb, "krping_free_buffers called on cb %p\n", cb);
725
726	if (cb->dma_mr)
727		ib_dereg_mr(cb->dma_mr);
728	if (cb->send_mr)
729		ib_dereg_mr(cb->send_mr);
730	if (cb->recv_mr)
731		ib_dereg_mr(cb->recv_mr);
732	if (cb->rdma_mr)
733		ib_dereg_mr(cb->rdma_mr);
734	if (cb->start_mr)
735		ib_dereg_mr(cb->start_mr);
736	if (cb->fastreg_mr)
737		ib_dereg_mr(cb->fastreg_mr);
738	if (cb->mw)
739		ib_dealloc_mw(cb->mw);
740
741	dma_unmap_single(cb->pd->device->dma_device,
742			 pci_unmap_addr(cb, recv_mapping),
743			 sizeof(cb->recv_buf), DMA_BIDIRECTIONAL);
744	dma_unmap_single(cb->pd->device->dma_device,
745			 pci_unmap_addr(cb, send_mapping),
746			 sizeof(cb->send_buf), DMA_BIDIRECTIONAL);
747	dma_unmap_single(cb->pd->device->dma_device,
748			 pci_unmap_addr(cb, rdma_mapping),
749			 cb->size, DMA_BIDIRECTIONAL);
750	kfree(cb->rdma_buf);
751	if (cb->start_buf) {
752		dma_unmap_single(cb->pd->device->dma_device,
753			 pci_unmap_addr(cb, start_mapping),
754			 cb->size, DMA_BIDIRECTIONAL);
755		kfree(cb->start_buf);
756	}
757}
758
759static int krping_create_qp(struct krping_cb *cb)
760{
761	struct ib_qp_init_attr init_attr;
762	int ret;
763
764	memset(&init_attr, 0, sizeof(init_attr));
765	init_attr.cap.max_send_wr = cb->txdepth;
766	init_attr.cap.max_recv_wr = 2;
767	init_attr.cap.max_recv_sge = 1;
768	init_attr.cap.max_send_sge = 1;
769	init_attr.qp_type = IB_QPT_RC;
770	init_attr.send_cq = cb->cq;
771	init_attr.recv_cq = cb->cq;
772	init_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
773
774	if (cb->server) {
775		ret = rdma_create_qp(cb->child_cm_id, cb->pd, &init_attr);
776		if (!ret)
777			cb->qp = cb->child_cm_id->qp;
778	} else {
779		ret = rdma_create_qp(cb->cm_id, cb->pd, &init_attr);
780		if (!ret)
781			cb->qp = cb->cm_id->qp;
782	}
783
784	return ret;
785}
786
787static void krping_free_qp(struct krping_cb *cb)
788{
789	ib_destroy_qp(cb->qp);
790	ib_destroy_cq(cb->cq);
791	ib_dealloc_pd(cb->pd);
792}
793
794static int krping_setup_qp(struct krping_cb *cb, struct rdma_cm_id *cm_id)
795{
796	int ret;
797	cb->pd = ib_alloc_pd(cm_id->device);
798	if (IS_ERR(cb->pd)) {
799		PRINTF(cb, "ib_alloc_pd failed\n");
800		return PTR_ERR(cb->pd);
801	}
802	DEBUG_LOG(cb, "created pd %p\n", cb->pd);
803
804	strlcpy(cb->stats.name, cb->pd->device->name, sizeof(cb->stats.name));
805
806	cb->cq = ib_create_cq(cm_id->device, krping_cq_event_handler, NULL,
807			      cb, cb->txdepth * 2, 0);
808	if (IS_ERR(cb->cq)) {
809		PRINTF(cb, "ib_create_cq failed\n");
810		ret = PTR_ERR(cb->cq);
811		goto err1;
812	}
813	DEBUG_LOG(cb, "created cq %p\n", cb->cq);
814
815	if (!cb->wlat && !cb->rlat && !cb->bw && !cb->frtest) {
816		ret = ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
817		if (ret) {
818			PRINTF(cb, "ib_create_cq failed\n");
819			goto err2;
820		}
821	}
822
823	ret = krping_create_qp(cb);
824	if (ret) {
825		PRINTF(cb, "krping_create_qp failed: %d\n", ret);
826		goto err2;
827	}
828	DEBUG_LOG(cb, "created qp %p\n", cb->qp);
829	return 0;
830err2:
831	ib_destroy_cq(cb->cq);
832err1:
833	ib_dealloc_pd(cb->pd);
834	return ret;
835}
836
837/*
838 * return the (possibly rebound) rkey for the rdma buffer.
839 * FASTREG mode: invalidate and rebind via fastreg wr.
840 * MW mode: rebind the MW.
841 * other modes: just return the mr rkey.
842 */
843static u32 krping_rdma_rkey(struct krping_cb *cb, u64 buf, int post_inv)
844{
845	u32 rkey = 0xffffffff;
846	u64 p;
847	struct ib_send_wr *bad_wr;
848	int i;
849	int ret;
850
851	switch (cb->mem) {
852	case FASTREG:
853		cb->invalidate_wr.ex.invalidate_rkey = cb->fastreg_mr->rkey;
854
855		/*
856		 * Update the fastreg key.
857		 */
858		ib_update_fast_reg_key(cb->fastreg_mr, ++cb->key);
859		cb->fastreg_wr.wr.fast_reg.rkey = cb->fastreg_mr->rkey;
860
861		/*
862		 * Update the fastreg WR with new buf info.
863		 */
864		if (buf == (u64)cb->start_dma_addr)
865			cb->fastreg_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_READ;
866		else
867			cb->fastreg_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
868		cb->fastreg_wr.wr.fast_reg.iova_start = buf;
869		p = (u64)(buf & PAGE_MASK);
870		for (i=0; i < cb->fastreg_wr.wr.fast_reg.page_list_len;
871		     i++, p += PAGE_SIZE) {
872			cb->page_list->page_list[i] = p;
873			DEBUG_LOG(cb, "page_list[%d] 0x%llx\n", i, p);
874		}
875
876		DEBUG_LOG(cb, "post_inv = %d, fastreg new rkey 0x%x shift %u len %u"
877			" iova_start %llx page_list_len %u\n",
878			post_inv,
879			cb->fastreg_wr.wr.fast_reg.rkey,
880			cb->fastreg_wr.wr.fast_reg.page_shift,
881			cb->fastreg_wr.wr.fast_reg.length,
882			cb->fastreg_wr.wr.fast_reg.iova_start,
883			cb->fastreg_wr.wr.fast_reg.page_list_len);
884
885		if (post_inv)
886			ret = ib_post_send(cb->qp, &cb->invalidate_wr, &bad_wr);
887		else
888			ret = ib_post_send(cb->qp, &cb->fastreg_wr, &bad_wr);
889		if (ret) {
890			PRINTF(cb, "post send error %d\n", ret);
891			cb->state = ERROR;
892		}
893		rkey = cb->fastreg_mr->rkey;
894		break;
895	case MW:
896		/*
897		 * Update the MW with new buf info.
898		 */
899		if (buf == (u64)cb->start_dma_addr) {
900			cb->bind_attr.mw_access_flags = IB_ACCESS_REMOTE_READ;
901			cb->bind_attr.mr = cb->start_mr;
902		} else {
903			cb->bind_attr.mw_access_flags = IB_ACCESS_REMOTE_WRITE;
904			cb->bind_attr.mr = cb->rdma_mr;
905		}
906		cb->bind_attr.addr = buf;
907		DEBUG_LOG(cb, "binding mw rkey 0x%x to buf %llx mr rkey 0x%x\n",
908			cb->mw->rkey, buf, cb->bind_attr.mr->rkey);
909		ret = ib_bind_mw(cb->qp, cb->mw, &cb->bind_attr);
910		if (ret) {
911			PRINTF(cb, "bind mw error %d\n", ret);
912			cb->state = ERROR;
913		} else
914			rkey = cb->mw->rkey;
915		break;
916	case MR:
917		if (buf == (u64)cb->start_dma_addr)
918			rkey = cb->start_mr->rkey;
919		else
920			rkey = cb->rdma_mr->rkey;
921		break;
922	case DMA:
923		rkey = cb->dma_mr->rkey;
924		break;
925	default:
926		PRINTF(cb, "%s:%d case ERROR\n", __func__, __LINE__);
927		cb->state = ERROR;
928		break;
929	}
930	return rkey;
931}
932
933static void krping_format_send(struct krping_cb *cb, u64 buf)
934{
935	struct krping_rdma_info *info = &cb->send_buf;
936	u32 rkey;
937
938	/*
939	 * Client side will do fastreg or mw bind before
940	 * advertising the rdma buffer.  Server side
941	 * sends have no data.
942	 */
943	if (!cb->server || cb->wlat || cb->rlat || cb->bw) {
944		rkey = krping_rdma_rkey(cb, buf, !cb->server_invalidate);
945		info->buf = htonll(buf);
946		info->rkey = htonl(rkey);
947		info->size = htonl(cb->size);
948		DEBUG_LOG(cb, "RDMA addr %llx rkey %x len %d\n",
949			  (unsigned long long)buf, rkey, cb->size);
950	}
951}
952
953static void krping_test_server(struct krping_cb *cb)
954{
955	struct ib_send_wr *bad_wr, inv;
956	int ret;
957
958	while (1) {
959		/* Wait for client's Start STAG/TO/Len */
960		wait_event_interruptible(cb->sem, cb->state >= RDMA_READ_ADV);
961		if (cb->state != RDMA_READ_ADV) {
962			PRINTF(cb, "wait for RDMA_READ_ADV state %d\n",
963				cb->state);
964			break;
965		}
966
967		DEBUG_LOG(cb, "server received sink adv\n");
968
969		cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
970		cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
971		cb->rdma_sq_wr.sg_list->length = cb->remote_len;
972		cb->rdma_sgl.lkey = krping_rdma_rkey(cb, cb->rdma_dma_addr, 1);
973		cb->rdma_sq_wr.next = NULL;
974
975		/* Issue RDMA Read. */
976		if (cb->read_inv)
977			cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ_WITH_INV;
978		else {
979
980			cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ;
981			if (cb->mem == FASTREG) {
982				/*
983				 * Immediately follow the read with a
984				 * fenced LOCAL_INV.
985				 */
986				cb->rdma_sq_wr.next = &inv;
987				memset(&inv, 0, sizeof inv);
988				inv.opcode = IB_WR_LOCAL_INV;
989				inv.ex.invalidate_rkey = cb->fastreg_mr->rkey;
990				inv.send_flags = IB_SEND_FENCE;
991			}
992		}
993
994		ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
995		if (ret) {
996			PRINTF(cb, "post send error %d\n", ret);
997			break;
998		}
999		cb->rdma_sq_wr.next = NULL;
1000
1001		DEBUG_LOG(cb, "server posted rdma read req \n");
1002
1003		/* Wait for read completion */
1004		wait_event_interruptible(cb->sem,
1005					 cb->state >= RDMA_READ_COMPLETE);
1006		if (cb->state != RDMA_READ_COMPLETE) {
1007			PRINTF(cb,
1008			       "wait for RDMA_READ_COMPLETE state %d\n",
1009			       cb->state);
1010			break;
1011		}
1012		DEBUG_LOG(cb, "server received read complete\n");
1013
1014		/* Display data in recv buf */
1015		if (cb->verbose)
1016			PRINTF(cb, "server ping data: %s\n",
1017				cb->rdma_buf);
1018
1019		/* Tell client to continue */
1020		if (cb->server && cb->server_invalidate) {
1021			cb->sq_wr.ex.invalidate_rkey = cb->remote_rkey;
1022			cb->sq_wr.opcode = IB_WR_SEND_WITH_INV;
1023			DEBUG_LOG(cb, "send-w-inv rkey 0x%x\n", cb->remote_rkey);
1024		}
1025		ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1026		if (ret) {
1027			PRINTF(cb, "post send error %d\n", ret);
1028			break;
1029		}
1030		DEBUG_LOG(cb, "server posted go ahead\n");
1031
1032		/* Wait for client's RDMA STAG/TO/Len */
1033		wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_ADV);
1034		if (cb->state != RDMA_WRITE_ADV) {
1035			PRINTF(cb,
1036			       "wait for RDMA_WRITE_ADV state %d\n",
1037			       cb->state);
1038			break;
1039		}
1040		DEBUG_LOG(cb, "server received sink adv\n");
1041
1042		/* RDMA Write echo data */
1043		cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
1044		cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1045		cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1046		cb->rdma_sq_wr.sg_list->length = strlen(cb->rdma_buf) + 1;
1047		if (cb->local_dma_lkey)
1048			cb->rdma_sgl.lkey = cb->qp->device->local_dma_lkey;
1049		else
1050			cb->rdma_sgl.lkey = krping_rdma_rkey(cb, cb->rdma_dma_addr, 0);
1051
1052		DEBUG_LOG(cb, "rdma write from lkey %x laddr %llx len %d\n",
1053			  cb->rdma_sq_wr.sg_list->lkey,
1054			  (unsigned long long)cb->rdma_sq_wr.sg_list->addr,
1055			  cb->rdma_sq_wr.sg_list->length);
1056
1057		ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
1058		if (ret) {
1059			PRINTF(cb, "post send error %d\n", ret);
1060			break;
1061		}
1062
1063		/* Wait for completion */
1064		ret = wait_event_interruptible(cb->sem, cb->state >=
1065							 RDMA_WRITE_COMPLETE);
1066		if (cb->state != RDMA_WRITE_COMPLETE) {
1067			PRINTF(cb,
1068			       "wait for RDMA_WRITE_COMPLETE state %d\n",
1069			       cb->state);
1070			break;
1071		}
1072		DEBUG_LOG(cb, "server rdma write complete \n");
1073
1074		cb->state = CONNECTED;
1075
1076		/* Tell client to begin again */
1077		if (cb->server && cb->server_invalidate) {
1078			cb->sq_wr.ex.invalidate_rkey = cb->remote_rkey;
1079			cb->sq_wr.opcode = IB_WR_SEND_WITH_INV;
1080			DEBUG_LOG(cb, "send-w-inv rkey 0x%x\n", cb->remote_rkey);
1081		}
1082		ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1083		if (ret) {
1084			PRINTF(cb, "post send error %d\n", ret);
1085			break;
1086		}
1087		DEBUG_LOG(cb, "server posted go ahead\n");
1088	}
1089}
1090
1091static void rlat_test(struct krping_cb *cb)
1092{
1093	int scnt;
1094	int iters = cb->count;
1095	struct timeval start_tv, stop_tv;
1096	int ret;
1097	struct ib_wc wc;
1098	struct ib_send_wr *bad_wr;
1099	int ne;
1100
1101	scnt = 0;
1102	cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ;
1103	cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1104	cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1105	cb->rdma_sq_wr.sg_list->length = cb->size;
1106
1107	microtime(&start_tv);
1108	if (!cb->poll) {
1109		cb->state = RDMA_READ_ADV;
1110		ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
1111	}
1112	while (scnt < iters) {
1113
1114		cb->state = RDMA_READ_ADV;
1115		ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
1116		if (ret) {
1117			PRINTF(cb,
1118				"Couldn't post send: ret=%d scnt %d\n",
1119				ret, scnt);
1120			return;
1121		}
1122
1123		do {
1124			if (!cb->poll) {
1125				wait_event_interruptible(cb->sem,
1126					cb->state != RDMA_READ_ADV);
1127				if (cb->state == RDMA_READ_COMPLETE) {
1128					ne = 1;
1129					ib_req_notify_cq(cb->cq,
1130						IB_CQ_NEXT_COMP);
1131				} else {
1132					ne = -1;
1133				}
1134			} else
1135				ne = ib_poll_cq(cb->cq, 1, &wc);
1136			if (cb->state == ERROR) {
1137				PRINTF(cb,
1138					"state == ERROR...bailing scnt %d\n",
1139					scnt);
1140				return;
1141			}
1142		} while (ne == 0);
1143
1144		if (ne < 0) {
1145			PRINTF(cb, "poll CQ failed %d\n", ne);
1146			return;
1147		}
1148		if (cb->poll && wc.status != IB_WC_SUCCESS) {
1149			PRINTF(cb, "Completion wth error at %s:\n",
1150				cb->server ? "server" : "client");
1151			PRINTF(cb, "Failed status %d: wr_id %d\n",
1152				wc.status, (int) wc.wr_id);
1153			return;
1154		}
1155		++scnt;
1156	}
1157	microtime(&stop_tv);
1158
1159        if (stop_tv.tv_usec < start_tv.tv_usec) {
1160                stop_tv.tv_usec += 1000000;
1161                stop_tv.tv_sec  -= 1;
1162        }
1163
1164	PRINTF(cb, "delta sec %lu delta usec %lu iter %d size %d\n",
1165		stop_tv.tv_sec - start_tv.tv_sec,
1166		stop_tv.tv_usec - start_tv.tv_usec,
1167		scnt, cb->size);
1168}
1169
1170static void wlat_test(struct krping_cb *cb)
1171{
1172	int ccnt, scnt, rcnt;
1173	int iters=cb->count;
1174	volatile char *poll_buf = (char *) cb->start_buf;
1175	char *buf = (char *)cb->rdma_buf;
1176	struct timeval start_tv, stop_tv;
1177	cycles_t *post_cycles_start, *post_cycles_stop;
1178	cycles_t *poll_cycles_start, *poll_cycles_stop;
1179	cycles_t *last_poll_cycles_start;
1180	cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0;
1181	int i;
1182	int cycle_iters = 1000;
1183
1184	ccnt = 0;
1185	scnt = 0;
1186	rcnt = 0;
1187
1188	post_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1189	if (!post_cycles_start) {
1190		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1191		return;
1192	}
1193	post_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1194	if (!post_cycles_stop) {
1195		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1196		return;
1197	}
1198	poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1199	if (!poll_cycles_start) {
1200		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1201		return;
1202	}
1203	poll_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1204	if (!poll_cycles_stop) {
1205		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1206		return;
1207	}
1208	last_poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t),
1209		GFP_KERNEL);
1210	if (!last_poll_cycles_start) {
1211		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1212		return;
1213	}
1214	cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
1215	cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1216	cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1217	cb->rdma_sq_wr.sg_list->length = cb->size;
1218
1219	if (cycle_iters > iters)
1220		cycle_iters = iters;
1221	microtime(&start_tv);
1222	while (scnt < iters || ccnt < iters || rcnt < iters) {
1223
1224		/* Wait till buffer changes. */
1225		if (rcnt < iters && !(scnt < 1 && !cb->server)) {
1226			++rcnt;
1227			while (*poll_buf != (char)rcnt) {
1228				if (cb->state == ERROR) {
1229					PRINTF(cb,
1230						"state = ERROR, bailing\n");
1231					return;
1232				}
1233			}
1234		}
1235
1236		if (scnt < iters) {
1237			struct ib_send_wr *bad_wr;
1238
1239			*buf = (char)scnt+1;
1240			if (scnt < cycle_iters)
1241				post_cycles_start[scnt] = get_cycles();
1242			if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
1243				PRINTF(cb,
1244					"Couldn't post send: scnt=%d\n",
1245					scnt);
1246				return;
1247			}
1248			if (scnt < cycle_iters)
1249				post_cycles_stop[scnt] = get_cycles();
1250			scnt++;
1251		}
1252
1253		if (ccnt < iters) {
1254			struct ib_wc wc;
1255			int ne;
1256
1257			if (ccnt < cycle_iters)
1258				poll_cycles_start[ccnt] = get_cycles();
1259			do {
1260				if (ccnt < cycle_iters)
1261					last_poll_cycles_start[ccnt] =
1262						get_cycles();
1263				ne = ib_poll_cq(cb->cq, 1, &wc);
1264			} while (ne == 0);
1265			if (ccnt < cycle_iters)
1266				poll_cycles_stop[ccnt] = get_cycles();
1267			++ccnt;
1268
1269			if (ne < 0) {
1270				PRINTF(cb, "poll CQ failed %d\n", ne);
1271				return;
1272			}
1273			if (wc.status != IB_WC_SUCCESS) {
1274				PRINTF(cb,
1275					"Completion wth error at %s:\n",
1276					cb->server ? "server" : "client");
1277				PRINTF(cb,
1278					"Failed status %d: wr_id %d\n",
1279					wc.status, (int) wc.wr_id);
1280				PRINTF(cb,
1281					"scnt=%d, rcnt=%d, ccnt=%d\n",
1282					scnt, rcnt, ccnt);
1283				return;
1284			}
1285		}
1286	}
1287	microtime(&stop_tv);
1288
1289        if (stop_tv.tv_usec < start_tv.tv_usec) {
1290                stop_tv.tv_usec += 1000000;
1291                stop_tv.tv_sec  -= 1;
1292        }
1293
1294	for (i=0; i < cycle_iters; i++) {
1295		sum_post += post_cycles_stop[i] - post_cycles_start[i];
1296		sum_poll += poll_cycles_stop[i] - poll_cycles_start[i];
1297		sum_last_poll += poll_cycles_stop[i]-last_poll_cycles_start[i];
1298	}
1299	PRINTF(cb,
1300		"delta sec %lu delta usec %lu iter %d size %d cycle_iters %d"
1301		" sum_post %llu sum_poll %llu sum_last_poll %llu\n",
1302		stop_tv.tv_sec - start_tv.tv_sec,
1303		stop_tv.tv_usec - start_tv.tv_usec,
1304		scnt, cb->size, cycle_iters,
1305		(unsigned long long)sum_post, (unsigned long long)sum_poll,
1306		(unsigned long long)sum_last_poll);
1307	kfree(post_cycles_start);
1308	kfree(post_cycles_stop);
1309	kfree(poll_cycles_start);
1310	kfree(poll_cycles_stop);
1311	kfree(last_poll_cycles_start);
1312}
1313
1314static void bw_test(struct krping_cb *cb)
1315{
1316	int ccnt, scnt, rcnt;
1317	int iters=cb->count;
1318	struct timeval start_tv, stop_tv;
1319	cycles_t *post_cycles_start, *post_cycles_stop;
1320	cycles_t *poll_cycles_start, *poll_cycles_stop;
1321	cycles_t *last_poll_cycles_start;
1322	cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0;
1323	int i;
1324	int cycle_iters = 1000;
1325
1326	ccnt = 0;
1327	scnt = 0;
1328	rcnt = 0;
1329
1330	post_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1331	if (!post_cycles_start) {
1332		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1333		return;
1334	}
1335	post_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1336	if (!post_cycles_stop) {
1337		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1338		return;
1339	}
1340	poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1341	if (!poll_cycles_start) {
1342		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1343		return;
1344	}
1345	poll_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1346	if (!poll_cycles_stop) {
1347		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1348		return;
1349	}
1350	last_poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t),
1351		GFP_KERNEL);
1352	if (!last_poll_cycles_start) {
1353		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1354		return;
1355	}
1356	cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
1357	cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1358	cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1359	cb->rdma_sq_wr.sg_list->length = cb->size;
1360
1361	if (cycle_iters > iters)
1362		cycle_iters = iters;
1363	microtime(&start_tv);
1364	while (scnt < iters || ccnt < iters) {
1365
1366		while (scnt < iters && scnt - ccnt < cb->txdepth) {
1367			struct ib_send_wr *bad_wr;
1368
1369			if (scnt < cycle_iters)
1370				post_cycles_start[scnt] = get_cycles();
1371			if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
1372				PRINTF(cb,
1373					"Couldn't post send: scnt=%d\n",
1374					scnt);
1375				return;
1376			}
1377			if (scnt < cycle_iters)
1378				post_cycles_stop[scnt] = get_cycles();
1379			++scnt;
1380		}
1381
1382		if (ccnt < iters) {
1383			int ne;
1384			struct ib_wc wc;
1385
1386			if (ccnt < cycle_iters)
1387				poll_cycles_start[ccnt] = get_cycles();
1388			do {
1389				if (ccnt < cycle_iters)
1390					last_poll_cycles_start[ccnt] =
1391						get_cycles();
1392				ne = ib_poll_cq(cb->cq, 1, &wc);
1393			} while (ne == 0);
1394			if (ccnt < cycle_iters)
1395				poll_cycles_stop[ccnt] = get_cycles();
1396			ccnt += 1;
1397
1398			if (ne < 0) {
1399				PRINTF(cb, "poll CQ failed %d\n", ne);
1400				return;
1401			}
1402			if (wc.status != IB_WC_SUCCESS) {
1403				PRINTF(cb,
1404					"Completion wth error at %s:\n",
1405					cb->server ? "server" : "client");
1406				PRINTF(cb,
1407					"Failed status %d: wr_id %d\n",
1408					wc.status, (int) wc.wr_id);
1409				return;
1410			}
1411		}
1412	}
1413	microtime(&stop_tv);
1414
1415        if (stop_tv.tv_usec < start_tv.tv_usec) {
1416                stop_tv.tv_usec += 1000000;
1417                stop_tv.tv_sec  -= 1;
1418        }
1419
1420	for (i=0; i < cycle_iters; i++) {
1421		sum_post += post_cycles_stop[i] - post_cycles_start[i];
1422		sum_poll += poll_cycles_stop[i] - poll_cycles_start[i];
1423		sum_last_poll += poll_cycles_stop[i]-last_poll_cycles_start[i];
1424	}
1425	PRINTF(cb,
1426		"delta sec %lu delta usec %lu iter %d size %d cycle_iters %d"
1427		" sum_post %llu sum_poll %llu sum_last_poll %llu\n",
1428		stop_tv.tv_sec - start_tv.tv_sec,
1429		stop_tv.tv_usec - start_tv.tv_usec,
1430		scnt, cb->size, cycle_iters,
1431		(unsigned long long)sum_post, (unsigned long long)sum_poll,
1432		(unsigned long long)sum_last_poll);
1433	kfree(post_cycles_start);
1434	kfree(post_cycles_stop);
1435	kfree(poll_cycles_start);
1436	kfree(poll_cycles_stop);
1437	kfree(last_poll_cycles_start);
1438}
1439
1440static void krping_rlat_test_server(struct krping_cb *cb)
1441{
1442	struct ib_send_wr *bad_wr;
1443	struct ib_wc wc;
1444	int ret;
1445
1446	/* Spin waiting for client's Start STAG/TO/Len */
1447	while (cb->state < RDMA_READ_ADV) {
1448		krping_cq_event_handler(cb->cq, cb);
1449	}
1450
1451	/* Send STAG/TO/Len to client */
1452	krping_format_send(cb, cb->start_dma_addr);
1453	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1454	if (ret) {
1455		PRINTF(cb, "post send error %d\n", ret);
1456		return;
1457	}
1458
1459	/* Spin waiting for send completion */
1460	while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1461	if (ret < 0) {
1462		PRINTF(cb, "poll error %d\n", ret);
1463		return;
1464	}
1465	if (wc.status) {
1466		PRINTF(cb, "send completiong error %d\n", wc.status);
1467		return;
1468	}
1469
1470	wait_event_interruptible(cb->sem, cb->state == ERROR);
1471}
1472
1473static void krping_wlat_test_server(struct krping_cb *cb)
1474{
1475	struct ib_send_wr *bad_wr;
1476	struct ib_wc wc;
1477	int ret;
1478
1479	/* Spin waiting for client's Start STAG/TO/Len */
1480	while (cb->state < RDMA_READ_ADV) {
1481		krping_cq_event_handler(cb->cq, cb);
1482	}
1483
1484	/* Send STAG/TO/Len to client */
1485	krping_format_send(cb, cb->start_dma_addr);
1486	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1487	if (ret) {
1488		PRINTF(cb, "post send error %d\n", ret);
1489		return;
1490	}
1491
1492	/* Spin waiting for send completion */
1493	while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1494	if (ret < 0) {
1495		PRINTF(cb, "poll error %d\n", ret);
1496		return;
1497	}
1498	if (wc.status) {
1499		PRINTF(cb, "send completiong error %d\n", wc.status);
1500		return;
1501	}
1502
1503	wlat_test(cb);
1504	wait_event_interruptible(cb->sem, cb->state == ERROR);
1505}
1506
1507static void krping_bw_test_server(struct krping_cb *cb)
1508{
1509	struct ib_send_wr *bad_wr;
1510	struct ib_wc wc;
1511	int ret;
1512
1513	/* Spin waiting for client's Start STAG/TO/Len */
1514	while (cb->state < RDMA_READ_ADV) {
1515		krping_cq_event_handler(cb->cq, cb);
1516	}
1517
1518	/* Send STAG/TO/Len to client */
1519	krping_format_send(cb, cb->start_dma_addr);
1520	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1521	if (ret) {
1522		PRINTF(cb, "post send error %d\n", ret);
1523		return;
1524	}
1525
1526	/* Spin waiting for send completion */
1527	while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1528	if (ret < 0) {
1529		PRINTF(cb, "poll error %d\n", ret);
1530		return;
1531	}
1532	if (wc.status) {
1533		PRINTF(cb, "send completiong error %d\n", wc.status);
1534		return;
1535	}
1536
1537	if (cb->duplex)
1538		bw_test(cb);
1539	wait_event_interruptible(cb->sem, cb->state == ERROR);
1540}
1541
1542static int fastreg_supported(struct krping_cb *cb)
1543{
1544	struct ib_device *dev = cb->child_cm_id->device;
1545	struct ib_device_attr attr;
1546	int ret;
1547
1548	ret = ib_query_device(dev, &attr);
1549	if (ret) {
1550		PRINTF(cb, "ib_query_device failed ret %d\n", ret);
1551		return 0;
1552	}
1553	if (!(attr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) {
1554		PRINTF(cb, "Fastreg not supported - device_cap_flags 0x%x\n",
1555		    attr.device_cap_flags);
1556		return 0;
1557	}
1558	DEBUG_LOG(cb, "Fastreg supported - device_cap_flags 0x%x\n",
1559		attr.device_cap_flags);
1560	return 1;
1561}
1562
1563static int krping_bind_server(struct krping_cb *cb)
1564{
1565	struct sockaddr_in sin;
1566	int ret;
1567
1568	memset(&sin, 0, sizeof(sin));
1569	sin.sin_len = sizeof sin;
1570	sin.sin_family = AF_INET;
1571	sin.sin_addr.s_addr = cb->addr.s_addr;
1572	sin.sin_port = cb->port;
1573
1574	ret = rdma_bind_addr(cb->cm_id, (struct sockaddr *) &sin);
1575	if (ret) {
1576		PRINTF(cb, "rdma_bind_addr error %d\n", ret);
1577		return ret;
1578	}
1579	DEBUG_LOG(cb, "rdma_bind_addr successful\n");
1580
1581	DEBUG_LOG(cb, "rdma_listen\n");
1582	ret = rdma_listen(cb->cm_id, 3);
1583	if (ret) {
1584		PRINTF(cb, "rdma_listen failed: %d\n", ret);
1585		return ret;
1586	}
1587
1588	wait_event_interruptible(cb->sem, cb->state >= CONNECT_REQUEST);
1589	if (cb->state != CONNECT_REQUEST) {
1590		PRINTF(cb, "wait for CONNECT_REQUEST state %d\n",
1591			cb->state);
1592		return -1;
1593	}
1594
1595	if (cb->mem == FASTREG && !fastreg_supported(cb))
1596		return -EINVAL;
1597
1598	return 0;
1599}
1600
1601static void krping_run_server(struct krping_cb *cb)
1602{
1603	struct ib_recv_wr *bad_wr;
1604	int ret;
1605
1606	ret = krping_bind_server(cb);
1607	if (ret)
1608		return;
1609
1610	ret = krping_setup_qp(cb, cb->child_cm_id);
1611	if (ret) {
1612		PRINTF(cb, "setup_qp failed: %d\n", ret);
1613		goto err0;
1614	}
1615
1616	ret = krping_setup_buffers(cb);
1617	if (ret) {
1618		PRINTF(cb, "krping_setup_buffers failed: %d\n", ret);
1619		goto err1;
1620	}
1621
1622	ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
1623	if (ret) {
1624		PRINTF(cb, "ib_post_recv failed: %d\n", ret);
1625		goto err2;
1626	}
1627
1628	ret = krping_accept(cb);
1629	if (ret) {
1630		PRINTF(cb, "connect error %d\n", ret);
1631		goto err2;
1632	}
1633
1634	if (cb->wlat)
1635		krping_wlat_test_server(cb);
1636	else if (cb->rlat)
1637		krping_rlat_test_server(cb);
1638	else if (cb->bw)
1639		krping_bw_test_server(cb);
1640	else
1641		krping_test_server(cb);
1642	rdma_disconnect(cb->child_cm_id);
1643err2:
1644	krping_free_buffers(cb);
1645err1:
1646	krping_free_qp(cb);
1647err0:
1648	rdma_destroy_id(cb->child_cm_id);
1649}
1650
1651static void krping_test_client(struct krping_cb *cb)
1652{
1653	int ping, start, cc, i, ret;
1654	struct ib_send_wr *bad_wr;
1655	unsigned char c;
1656
1657	start = 65;
1658	for (ping = 0; !cb->count || ping < cb->count; ping++) {
1659		cb->state = RDMA_READ_ADV;
1660
1661		/* Put some ascii text in the buffer. */
1662		cc = sprintf(cb->start_buf, "rdma-ping-%d: ", ping);
1663		for (i = cc, c = start; i < cb->size; i++) {
1664			cb->start_buf[i] = c;
1665			c++;
1666			if (c > 122)
1667				c = 65;
1668		}
1669		start++;
1670		if (start > 122)
1671			start = 65;
1672		cb->start_buf[cb->size - 1] = 0;
1673
1674		krping_format_send(cb, cb->start_dma_addr);
1675		if (cb->state == ERROR) {
1676			PRINTF(cb, "krping_format_send failed\n");
1677			break;
1678		}
1679		ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1680		if (ret) {
1681			PRINTF(cb, "post send error %d\n", ret);
1682			break;
1683		}
1684
1685		/* Wait for server to ACK */
1686		wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_ADV);
1687		if (cb->state != RDMA_WRITE_ADV) {
1688			PRINTF(cb,
1689			       "wait for RDMA_WRITE_ADV state %d\n",
1690			       cb->state);
1691			break;
1692		}
1693
1694		krping_format_send(cb, cb->rdma_dma_addr);
1695		ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1696		if (ret) {
1697			PRINTF(cb, "post send error %d\n", ret);
1698			break;
1699		}
1700
1701		/* Wait for the server to say the RDMA Write is complete. */
1702		wait_event_interruptible(cb->sem,
1703					 cb->state >= RDMA_WRITE_COMPLETE);
1704		if (cb->state != RDMA_WRITE_COMPLETE) {
1705			PRINTF(cb,
1706			       "wait for RDMA_WRITE_COMPLETE state %d\n",
1707			       cb->state);
1708			break;
1709		}
1710
1711		if (cb->validate)
1712			if (memcmp(cb->start_buf, cb->rdma_buf, cb->size)) {
1713				PRINTF(cb, "data mismatch!\n");
1714				break;
1715			}
1716
1717		if (cb->verbose)
1718			PRINTF(cb, "ping data: %s\n", cb->rdma_buf);
1719#ifdef SLOW_KRPING
1720		wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
1721#endif
1722	}
1723}
1724
1725static void krping_rlat_test_client(struct krping_cb *cb)
1726{
1727	struct ib_send_wr *bad_wr;
1728	struct ib_wc wc;
1729	int ret;
1730
1731	cb->state = RDMA_READ_ADV;
1732
1733	/* Send STAG/TO/Len to client */
1734	krping_format_send(cb, cb->start_dma_addr);
1735	if (cb->state == ERROR) {
1736		PRINTF(cb, "krping_format_send failed\n");
1737		return;
1738	}
1739	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1740	if (ret) {
1741		PRINTF(cb, "post send error %d\n", ret);
1742		return;
1743	}
1744
1745	/* Spin waiting for send completion */
1746	while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1747	if (ret < 0) {
1748		PRINTF(cb, "poll error %d\n", ret);
1749		return;
1750	}
1751	if (wc.status) {
1752		PRINTF(cb, "send completion error %d\n", wc.status);
1753		return;
1754	}
1755
1756	/* Spin waiting for server's Start STAG/TO/Len */
1757	while (cb->state < RDMA_WRITE_ADV) {
1758		krping_cq_event_handler(cb->cq, cb);
1759	}
1760
1761#if 0
1762{
1763	int i;
1764	struct timeval start, stop;
1765	time_t sec;
1766	suseconds_t usec;
1767	unsigned long long elapsed;
1768	struct ib_wc wc;
1769	struct ib_send_wr *bad_wr;
1770	int ne;
1771
1772	cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
1773	cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1774	cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1775	cb->rdma_sq_wr.sg_list->length = 0;
1776	cb->rdma_sq_wr.num_sge = 0;
1777
1778	microtime(&start);
1779	for (i=0; i < 100000; i++) {
1780		if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
1781			PRINTF(cb, "Couldn't post send\n");
1782			return;
1783		}
1784		do {
1785			ne = ib_poll_cq(cb->cq, 1, &wc);
1786		} while (ne == 0);
1787		if (ne < 0) {
1788			PRINTF(cb, "poll CQ failed %d\n", ne);
1789			return;
1790		}
1791		if (wc.status != IB_WC_SUCCESS) {
1792			PRINTF(cb, "Completion wth error at %s:\n",
1793				cb->server ? "server" : "client");
1794			PRINTF(cb, "Failed status %d: wr_id %d\n",
1795				wc.status, (int) wc.wr_id);
1796			return;
1797		}
1798	}
1799	microtime(&stop);
1800
1801	if (stop.tv_usec < start.tv_usec) {
1802		stop.tv_usec += 1000000;
1803		stop.tv_sec  -= 1;
1804	}
1805	sec     = stop.tv_sec - start.tv_sec;
1806	usec    = stop.tv_usec - start.tv_usec;
1807	elapsed = sec * 1000000 + usec;
1808	PRINTF(cb, "0B-write-lat iters 100000 usec %llu\n", elapsed);
1809}
1810#endif
1811
1812	rlat_test(cb);
1813}
1814
1815static void krping_wlat_test_client(struct krping_cb *cb)
1816{
1817	struct ib_send_wr *bad_wr;
1818	struct ib_wc wc;
1819	int ret;
1820
1821	cb->state = RDMA_READ_ADV;
1822
1823	/* Send STAG/TO/Len to client */
1824	krping_format_send(cb, cb->start_dma_addr);
1825	if (cb->state == ERROR) {
1826		PRINTF(cb, "krping_format_send failed\n");
1827		return;
1828	}
1829	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1830	if (ret) {
1831		PRINTF(cb, "post send error %d\n", ret);
1832		return;
1833	}
1834
1835	/* Spin waiting for send completion */
1836	while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1837	if (ret < 0) {
1838		PRINTF(cb, "poll error %d\n", ret);
1839		return;
1840	}
1841	if (wc.status) {
1842		PRINTF(cb, "send completion error %d\n", wc.status);
1843		return;
1844	}
1845
1846	/* Spin waiting for server's Start STAG/TO/Len */
1847	while (cb->state < RDMA_WRITE_ADV) {
1848		krping_cq_event_handler(cb->cq, cb);
1849	}
1850
1851	wlat_test(cb);
1852}
1853
1854static void krping_bw_test_client(struct krping_cb *cb)
1855{
1856	struct ib_send_wr *bad_wr;
1857	struct ib_wc wc;
1858	int ret;
1859
1860	cb->state = RDMA_READ_ADV;
1861
1862	/* Send STAG/TO/Len to client */
1863	krping_format_send(cb, cb->start_dma_addr);
1864	if (cb->state == ERROR) {
1865		PRINTF(cb, "krping_format_send failed\n");
1866		return;
1867	}
1868	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1869	if (ret) {
1870		PRINTF(cb, "post send error %d\n", ret);
1871		return;
1872	}
1873
1874	/* Spin waiting for send completion */
1875	while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1876	if (ret < 0) {
1877		PRINTF(cb, "poll error %d\n", ret);
1878		return;
1879	}
1880	if (wc.status) {
1881		PRINTF(cb, "send completion error %d\n", wc.status);
1882		return;
1883	}
1884
1885	/* Spin waiting for server's Start STAG/TO/Len */
1886	while (cb->state < RDMA_WRITE_ADV) {
1887		krping_cq_event_handler(cb->cq, cb);
1888	}
1889
1890	bw_test(cb);
1891}
1892
1893static void krping_fr_test(struct krping_cb *cb)
1894{
1895	struct ib_fast_reg_page_list *pl;
1896	struct ib_send_wr fr, inv, *bad;
1897	struct ib_wc wc;
1898	u8 key = 0;
1899	struct ib_mr *mr;
1900	int i;
1901	int ret;
1902	int size = cb->size;
1903	int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT;
1904	time_t start;
1905	int count = 0;
1906	int scnt = 0;
1907
1908	pl = ib_alloc_fast_reg_page_list(cb->qp->device, plen);
1909	if (IS_ERR(pl)) {
1910		PRINTF(cb, "ib_alloc_fast_reg_page_list failed %ld\n", PTR_ERR(pl));
1911		return;
1912	}
1913
1914	mr = ib_alloc_fast_reg_mr(cb->pd, plen);
1915	if (IS_ERR(mr)) {
1916		PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(pl));
1917		goto err1;
1918	}
1919
1920	for (i=0; i<plen; i++)
1921		pl->page_list[i] = 0xcafebabe | i;
1922
1923	memset(&fr, 0, sizeof fr);
1924	fr.opcode = IB_WR_FAST_REG_MR;
1925	fr.wr.fast_reg.page_shift = PAGE_SHIFT;
1926	fr.wr.fast_reg.length = size;
1927	fr.wr.fast_reg.page_list = pl;
1928	fr.wr.fast_reg.page_list_len = plen;
1929	fr.wr.fast_reg.iova_start = 0;
1930	fr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
1931	fr.next = &inv;
1932	memset(&inv, 0, sizeof inv);
1933	inv.opcode = IB_WR_LOCAL_INV;
1934	inv.send_flags = IB_SEND_SIGNALED;
1935
1936	DEBUG_LOG(cb, "fr_test: stag index 0x%x plen %u size %u depth %u\n", mr->rkey >> 8, plen, cb->size, cb->txdepth);
1937	start = time_uptime;
1938	while (1) {
1939		if ((time_uptime - start) >= 9) {
1940			DEBUG_LOG(cb, "fr_test: pausing 1 second! count %u latest size %u plen %u\n", count, size, plen);
1941			wait_event_interruptible(cb->sem, cb->state == ERROR);
1942			if (cb->state == ERROR)
1943				break;
1944			start = time_uptime;
1945		}
1946		while (scnt < (cb->txdepth>>1)) {
1947			ib_update_fast_reg_key(mr, ++key);
1948			fr.wr.fast_reg.rkey = mr->rkey;
1949			inv.ex.invalidate_rkey = mr->rkey;
1950			size = arc4random() % cb->size;
1951			if (size == 0)
1952				size = cb->size;
1953			plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT;
1954			fr.wr.fast_reg.length = size;
1955			fr.wr.fast_reg.page_list_len = plen;
1956			ret = ib_post_send(cb->qp, &fr, &bad);
1957			if (ret) {
1958				PRINTF(cb, "ib_post_send failed %d\n", ret);
1959				goto err2;
1960			}
1961			scnt++;
1962		}
1963
1964		do {
1965			ret = ib_poll_cq(cb->cq, 1, &wc);
1966			if (ret < 0) {
1967				PRINTF(cb, "ib_poll_cq failed %d\n", ret);
1968				goto err2;
1969			}
1970			if (ret == 1) {
1971				if (wc.status) {
1972					PRINTF(cb, "completion error %u\n", wc.status);
1973					goto err2;
1974				}
1975				count++;
1976				scnt--;
1977			}
1978			else if (krping_sigpending()) {
1979				PRINTF(cb, "signal!\n");
1980				goto err2;
1981			}
1982		} while (ret == 1);
1983	}
1984err2:
1985#if 0
1986	DEBUG_LOG(cb, "sleeping 1 second\n");
1987	wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
1988#endif
1989	DEBUG_LOG(cb, "draining the cq...\n");
1990	do {
1991		ret = ib_poll_cq(cb->cq, 1, &wc);
1992		if (ret < 0) {
1993			PRINTF(cb, "ib_poll_cq failed %d\n", ret);
1994			break;
1995		}
1996		if (ret == 1) {
1997			if (wc.status) {
1998				PRINTF(cb, "completion error %u opcode %u\n", wc.status, wc.opcode);
1999			}
2000		}
2001	} while (ret == 1);
2002	DEBUG_LOG(cb, "fr_test: done!\n");
2003	ib_dereg_mr(mr);
2004err1:
2005	ib_free_fast_reg_page_list(pl);
2006}
2007
2008static int krping_connect_client(struct krping_cb *cb)
2009{
2010	struct rdma_conn_param conn_param;
2011	int ret;
2012
2013	memset(&conn_param, 0, sizeof conn_param);
2014	conn_param.responder_resources = 1;
2015	conn_param.initiator_depth = 1;
2016	conn_param.retry_count = 10;
2017
2018	ret = rdma_connect(cb->cm_id, &conn_param);
2019	if (ret) {
2020		PRINTF(cb, "rdma_connect error %d\n", ret);
2021		return ret;
2022	}
2023
2024	wait_event_interruptible(cb->sem, cb->state >= CONNECTED);
2025	if (cb->state == ERROR) {
2026		PRINTF(cb, "wait for CONNECTED state %d\n", cb->state);
2027		return -1;
2028	}
2029
2030	DEBUG_LOG(cb, "rdma_connect successful\n");
2031	return 0;
2032}
2033
2034static int krping_bind_client(struct krping_cb *cb)
2035{
2036	struct sockaddr_in sin;
2037	int ret;
2038
2039	memset(&sin, 0, sizeof(sin));
2040	sin.sin_len = sizeof sin;
2041	sin.sin_family = AF_INET;
2042	sin.sin_addr.s_addr = cb->addr.s_addr;
2043	sin.sin_port = cb->port;
2044
2045	ret = rdma_resolve_addr(cb->cm_id, NULL, (struct sockaddr *) &sin,
2046				2000);
2047	if (ret) {
2048		PRINTF(cb, "rdma_resolve_addr error %d\n", ret);
2049		return ret;
2050	}
2051
2052	wait_event_interruptible(cb->sem, cb->state >= ROUTE_RESOLVED);
2053	if (cb->state != ROUTE_RESOLVED) {
2054		PRINTF(cb,
2055		       "addr/route resolution did not resolve: state %d\n",
2056		       cb->state);
2057		return -EINTR;
2058	}
2059
2060	if (cb->mem == FASTREG && !fastreg_supported(cb))
2061		return -EINVAL;
2062
2063	DEBUG_LOG(cb, "rdma_resolve_addr - rdma_resolve_route successful\n");
2064	return 0;
2065}
2066
2067static void krping_run_client(struct krping_cb *cb)
2068{
2069	struct ib_recv_wr *bad_wr;
2070	int ret;
2071
2072	ret = krping_bind_client(cb);
2073	if (ret)
2074		return;
2075
2076	ret = krping_setup_qp(cb, cb->cm_id);
2077	if (ret) {
2078		PRINTF(cb, "setup_qp failed: %d\n", ret);
2079		return;
2080	}
2081
2082	ret = krping_setup_buffers(cb);
2083	if (ret) {
2084		PRINTF(cb, "krping_setup_buffers failed: %d\n", ret);
2085		goto err1;
2086	}
2087
2088	ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
2089	if (ret) {
2090		PRINTF(cb, "ib_post_recv failed: %d\n", ret);
2091		goto err2;
2092	}
2093
2094	ret = krping_connect_client(cb);
2095	if (ret) {
2096		PRINTF(cb, "connect error %d\n", ret);
2097		goto err2;
2098	}
2099
2100	if (cb->wlat)
2101		krping_wlat_test_client(cb);
2102	else if (cb->rlat)
2103		krping_rlat_test_client(cb);
2104	else if (cb->bw)
2105		krping_bw_test_client(cb);
2106	else if (cb->frtest)
2107		krping_fr_test(cb);
2108	else
2109		krping_test_client(cb);
2110	rdma_disconnect(cb->cm_id);
2111err2:
2112	krping_free_buffers(cb);
2113err1:
2114	krping_free_qp(cb);
2115}
2116
2117int krping_doit(char *cmd, void *cookie)
2118{
2119	struct krping_cb *cb;
2120	int op;
2121	int ret = 0;
2122	char *optarg;
2123	unsigned long optint;
2124
2125	cb = kzalloc(sizeof(*cb), GFP_KERNEL);
2126	if (!cb)
2127		return -ENOMEM;
2128
2129	mutex_lock(&krping_mutex);
2130	list_add_tail(&cb->list, &krping_cbs);
2131	mutex_unlock(&krping_mutex);
2132
2133	cb->cookie = cookie;
2134	cb->server = -1;
2135	cb->state = IDLE;
2136	cb->size = 64;
2137	cb->txdepth = RPING_SQ_DEPTH;
2138	cb->mem = DMA;
2139	init_waitqueue_head(&cb->sem);
2140
2141	while ((op = krping_getopt("krping", &cmd, krping_opts, NULL, &optarg,
2142			      &optint)) != 0) {
2143		switch (op) {
2144		case 'a':
2145			cb->addr_str = optarg;
2146			DEBUG_LOG(cb, "ipaddr (%s)\n", optarg);
2147			if (!inet_aton(optarg, &cb->addr)) {
2148				PRINTF(cb, "bad addr string %s\n",
2149				    optarg);
2150				ret = EINVAL;
2151			}
2152			break;
2153		case 'p':
2154			cb->port = htons(optint);
2155			DEBUG_LOG(cb, "port %d\n", (int)optint);
2156			break;
2157		case 'P':
2158			cb->poll = 1;
2159			DEBUG_LOG(cb, "server\n");
2160			break;
2161		case 's':
2162			cb->server = 1;
2163			DEBUG_LOG(cb, "server\n");
2164			break;
2165		case 'c':
2166			cb->server = 0;
2167			DEBUG_LOG(cb, "client\n");
2168			break;
2169		case 'S':
2170			cb->size = optint;
2171			if ((cb->size < 1) ||
2172			    (cb->size > RPING_BUFSIZE)) {
2173				PRINTF(cb, "Invalid size %d "
2174				       "(valid range is 1 to %d)\n",
2175				       cb->size, RPING_BUFSIZE);
2176				ret = EINVAL;
2177			} else
2178				DEBUG_LOG(cb, "size %d\n", (int)optint);
2179			break;
2180		case 'C':
2181			cb->count = optint;
2182			if (cb->count < 0) {
2183				PRINTF(cb, "Invalid count %d\n",
2184					cb->count);
2185				ret = EINVAL;
2186			} else
2187				DEBUG_LOG(cb, "count %d\n", (int) cb->count);
2188			break;
2189		case 'v':
2190			cb->verbose++;
2191			DEBUG_LOG(cb, "verbose\n");
2192			break;
2193		case 'V':
2194			cb->validate++;
2195			DEBUG_LOG(cb, "validate data\n");
2196			break;
2197		case 'l':
2198			cb->wlat++;
2199			break;
2200		case 'L':
2201			cb->rlat++;
2202			break;
2203		case 'B':
2204			cb->bw++;
2205			break;
2206		case 'd':
2207			cb->duplex++;
2208			break;
2209		case 'm':
2210			if (!strncmp(optarg, "dma", 3))
2211				cb->mem = DMA;
2212			else if (!strncmp(optarg, "fastreg", 7))
2213				cb->mem = FASTREG;
2214			else if (!strncmp(optarg, "mw", 2))
2215				cb->mem = MW;
2216			else if (!strncmp(optarg, "mr", 2))
2217				cb->mem = MR;
2218			else {
2219				PRINTF(cb, "unknown mem mode %s.  "
2220					"Must be dma, fastreg, mw, or mr\n",
2221					optarg);
2222				ret = -EINVAL;
2223				break;
2224			}
2225			break;
2226		case 'I':
2227			cb->server_invalidate = 1;
2228			break;
2229		case 'T':
2230			cb->txdepth = optint;
2231			DEBUG_LOG(cb, "txdepth %d\n", (int) cb->txdepth);
2232			break;
2233		case 'Z':
2234			cb->local_dma_lkey = 1;
2235			DEBUG_LOG(cb, "using local dma lkey\n");
2236			break;
2237		case 'R':
2238			cb->read_inv = 1;
2239			DEBUG_LOG(cb, "using read-with-inv\n");
2240			break;
2241		case 'f':
2242			cb->frtest = 1;
2243			DEBUG_LOG(cb, "fast-reg test!\n");
2244			break;
2245		default:
2246			PRINTF(cb, "unknown opt %s\n", optarg);
2247			ret = -EINVAL;
2248			break;
2249		}
2250	}
2251	if (ret)
2252		goto out;
2253
2254	if (cb->server == -1) {
2255		PRINTF(cb, "must be either client or server\n");
2256		ret = -EINVAL;
2257		goto out;
2258	}
2259
2260	if (cb->server && cb->frtest) {
2261		PRINTF(cb, "must be client to run frtest\n");
2262		ret = -EINVAL;
2263		goto out;
2264	}
2265
2266	if ((cb->frtest + cb->bw + cb->rlat + cb->wlat) > 1) {
2267		PRINTF(cb, "Pick only one test: fr, bw, rlat, wlat\n");
2268		ret = -EINVAL;
2269		goto out;
2270	}
2271
2272	if (cb->server_invalidate && cb->mem != FASTREG) {
2273		PRINTF(cb, "server_invalidate only valid with fastreg mem_mode\n");
2274		ret = -EINVAL;
2275		goto out;
2276	}
2277
2278	if (cb->read_inv && cb->mem != FASTREG) {
2279		PRINTF(cb, "read_inv only valid with fastreg mem_mode\n");
2280		ret = -EINVAL;
2281		goto out;
2282	}
2283
2284	if (cb->mem != MR && (cb->wlat || cb->rlat || cb->bw)) {
2285		PRINTF(cb, "wlat, rlat, and bw tests only support mem_mode MR\n");
2286		ret = -EINVAL;
2287		goto out;
2288	}
2289
2290	cb->cm_id = rdma_create_id(krping_cma_event_handler, cb, RDMA_PS_TCP);
2291	if (IS_ERR(cb->cm_id)) {
2292		ret = PTR_ERR(cb->cm_id);
2293		PRINTF(cb, "rdma_create_id error %d\n", ret);
2294		goto out;
2295	}
2296	DEBUG_LOG(cb, "created cm_id %p\n", cb->cm_id);
2297
2298	if (cb->server)
2299		krping_run_server(cb);
2300	else
2301		krping_run_client(cb);
2302
2303	DEBUG_LOG(cb, "destroy cm_id %p\n", cb->cm_id);
2304	rdma_destroy_id(cb->cm_id);
2305out:
2306	mutex_lock(&krping_mutex);
2307	list_del(&cb->list);
2308	mutex_unlock(&krping_mutex);
2309	kfree(cb);
2310	return ret;
2311}
2312
2313void
2314krping_walk_cb_list(void (*f)(struct krping_stats *, void *), void *arg)
2315{
2316	struct krping_cb *cb;
2317
2318	mutex_lock(&krping_mutex);
2319	list_for_each_entry(cb, &krping_cbs, list)
2320	    (*f)(cb->pd ? &cb->stats : NULL, arg);
2321	mutex_unlock(&krping_mutex);
2322}
2323
2324void krping_init(void)
2325{
2326
2327	mutex_init(&krping_mutex);
2328}
2329