krping.c revision 256829
1/*
2 * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
3 * Copyright (c) 2006-2009 Open Grid Computing, Inc. All rights reserved.
4 *
5 * This software is available to you under a choice of one of two
6 * licenses.  You may choose to be licensed under the terms of the GNU
7 * General Public License (GPL) Version 2, available from the file
8 * COPYING in the main directory of this source tree, or the
9 * OpenIB.org BSD license below:
10 *
11 *     Redistribution and use in source and binary forms, with or
12 *     without modification, are permitted provided that the following
13 *     conditions are met:
14 *
15 *      - Redistributions of source code must retain the above
16 *        copyright notice, this list of conditions and the following
17 *        disclaimer.
18 *
19 *      - Redistributions in binary form must reproduce the above
20 *        copyright notice, this list of conditions and the following
21 *        disclaimer in the documentation and/or other materials
22 *        provided with the distribution.
23 *
24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31 * SOFTWARE.
32 */
33
34#include <sys/cdefs.h>
35__FBSDID("$FreeBSD: stable/10/sys/contrib/rdma/krping/krping.c 256829 2013-10-21 06:31:56Z np $");
36
37#include <linux/module.h>
38#include <linux/moduleparam.h>
39#include <linux/init.h>
40#include <linux/slab.h>
41#include <linux/err.h>
42#include <linux/string.h>
43#include <linux/inet.h>
44#include <linux/list.h>
45#include <linux/in.h>
46#include <linux/device.h>
47#include <linux/pci.h>
48#include <linux/sched.h>
49#include <asm/system.h>
50
51#include <asm/atomic.h>
52
53#include <rdma/ib_verbs.h>
54#include <rdma/rdma_cm.h>
55
56#include "krping.h"
57#include "getopt.h"
58
59extern int krping_debug;
60#define DEBUG_LOG(cb, x...) if (krping_debug) krping_printf((cb)->cookie, x)
61#define PRINTF(cb, x...) krping_printf((cb)->cookie, x)
62
63MODULE_AUTHOR("Steve Wise");
64MODULE_DESCRIPTION("RDMA ping client/server");
65MODULE_LICENSE("Dual BSD/GPL");
66
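/*
 * Note: get_cycles() below reads the x86 TSC directly via rdtsc, so the
 * raw cycle counts reported by the wlat/bw tests are only meaningful on
 * x86 and are not normalized to any fixed clock rate.
 */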
67static __inline uint64_t
68get_cycles(void)
69{
70	uint32_t low, high;
71	__asm __volatile("rdtsc" : "=a" (low), "=d" (high));
72	return (low | ((u_int64_t)high << 32));
73}
74
75typedef uint64_t cycles_t;
76
77enum mem_type {
78	DMA = 1,
79	FASTREG = 2,
80	MW = 3,
81	MR = 4
82};
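
/*
 * Rough guide to the registration modes above (selected with the mem_mode
 * option; the exact option strings accepted are parsed elsewhere in this
 * file and not shown in this excerpt):
 *	DMA     - a single ib_get_dma_mr() MR covers all buffers.
 *	FASTREG - fast-register WRs rebind the rdma buffer with a fresh
 *		  key each iteration.
 *	MW      - a memory window is bound over a per-buffer MR.
 *	MR      - plain ib_reg_phys_mr() registrations, one per buffer.
 */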
83
84static const struct krping_option krping_opts[] = {
85	{"count", OPT_INT, 'C'},
86	{"size", OPT_INT, 'S'},
87	{"addr", OPT_STRING, 'a'},
88	{"port", OPT_INT, 'p'},
89	{"verbose", OPT_NOPARAM, 'v'},
90	{"validate", OPT_NOPARAM, 'V'},
91	{"server", OPT_NOPARAM, 's'},
92	{"client", OPT_NOPARAM, 'c'},
93	{"mem_mode", OPT_STRING, 'm'},
94	{"server_inv", OPT_NOPARAM, 'I'},
95	{"wlat", OPT_NOPARAM, 'l'},
96	{"rlat", OPT_NOPARAM, 'L'},
97	{"bw", OPT_NOPARAM, 'B'},
98	{"duplex", OPT_NOPARAM, 'd'},
99	{"txdepth", OPT_INT, 'T'},
100	{"poll", OPT_NOPARAM, 'P'},
101	{"local_dma_lkey", OPT_NOPARAM, 'Z'},
102	{"read_inv", OPT_NOPARAM, 'R'},
103	{"fr", OPT_NOPARAM, 'f'},
104	{NULL, 0, 0}
105};
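
/*
 * The options above are given as a single comma-separated string of
 * "name" or "name=value" tokens.  A purely illustrative client string
 * might look like:
 *
 *	"client,addr=192.168.1.10,port=9999,count=100,size=65536,verbose"
 *
 * How that string reaches this module (for example via a character
 * device node) is platform glue outside this file.
 */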
106
107#define htonll(x) cpu_to_be64((x))
108#define ntohll(x) cpu_to_be64((x))
109
110static struct mutex krping_mutex;
111
112/*
113 * List of running krping threads.
114 */
115static LIST_HEAD(krping_cbs);
116
117/*
118 * krping "ping/pong" loop:
119 * 	client sends source rkey/addr/len
120 *	server receives source rkey/addr/len
121 *	server rdma reads "ping" data from source
122 * 	server sends "go ahead" on rdma read completion
123 *	client sends sink rkey/addr/len
124 * 	server receives sink rkey/addr/len
125 * 	server rdma writes "pong" data to sink
126 * 	server sends "go ahead" on rdma write completion
127 * 	<repeat loop>
128 */
129
130/*
131 * These states are used to signal events between the completion handler
132 * and the main client or server thread.
133 *
134 * Once CONNECTED, they cycle through RDMA_READ_ADV, RDMA_WRITE_ADV,
135 * and RDMA_WRITE_COMPLETE for each ping.
136 */
137enum test_state {
138	IDLE = 1,
139	CONNECT_REQUEST,
140	ADDR_RESOLVED,
141	ROUTE_RESOLVED,
142	CONNECTED,
143	RDMA_READ_ADV,
144	RDMA_READ_COMPLETE,
145	RDMA_WRITE_ADV,
146	RDMA_WRITE_COMPLETE,
147	ERROR
148};
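
/*
 * Sketch of one ping iteration in terms of these states (derived from
 * krping_test_client/krping_test_server below):
 *
 *	client: RDMA_READ_ADV, send source rkey/addr/len
 *	server: recv -> RDMA_READ_ADV
 *	server: rdma read of source, completion -> RDMA_READ_COMPLETE
 *	server: send "go ahead"
 *	client: recv -> RDMA_WRITE_ADV, send sink rkey/addr/len
 *	server: recv -> RDMA_WRITE_ADV
 *	server: rdma write to sink, completion -> RDMA_WRITE_COMPLETE
 *	server: send "go ahead", state back to CONNECTED
 *	client: recv -> RDMA_WRITE_COMPLETE, start next iteration
 */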
149
150struct krping_rdma_info {
151	uint64_t buf;
152	uint32_t rkey;
153	uint32_t size;
154};
155
156/*
157 * Default max buffer size for IO...
158 */
159#define RPING_BUFSIZE 128*1024
160#define RPING_SQ_DEPTH 64
161
162/*
163 * Control block struct.
164 */
165struct krping_cb {
166	void *cookie;
167	int server;			/* 0 iff client */
168	struct ib_cq *cq;
169	struct ib_pd *pd;
170	struct ib_qp *qp;
171
172	enum mem_type mem;
173	struct ib_mr *dma_mr;
174
175	struct ib_fast_reg_page_list *page_list;
176	int page_list_len;
177	struct ib_send_wr fastreg_wr;
178	struct ib_send_wr invalidate_wr;
179	struct ib_mr *fastreg_mr;
180	int server_invalidate;
181	int read_inv;
182	u8 key;
183
184	struct ib_mw *mw;
185	struct ib_mw_bind bind_attr;
186
187	struct ib_recv_wr rq_wr;	/* recv work request record */
188	struct ib_sge recv_sgl;		/* recv single SGE */
189	struct krping_rdma_info recv_buf;/* malloc'd buffer */
190	u64 recv_dma_addr;
191	DECLARE_PCI_UNMAP_ADDR(recv_mapping)
192	struct ib_mr *recv_mr;
193
194	struct ib_send_wr sq_wr;	/* send work request record */
195	struct ib_sge send_sgl;
196	struct krping_rdma_info send_buf;/* single send buf */
197	u64 send_dma_addr;
198	DECLARE_PCI_UNMAP_ADDR(send_mapping)
199	struct ib_mr *send_mr;
200
201	struct ib_send_wr rdma_sq_wr;	/* rdma work request record */
202	struct ib_sge rdma_sgl;		/* rdma single SGE */
203	char *rdma_buf;			/* used as rdma sink */
204	u64  rdma_dma_addr;
205	DECLARE_PCI_UNMAP_ADDR(rdma_mapping)
206	struct ib_mr *rdma_mr;
207
208	uint32_t remote_rkey;		/* remote guy's RKEY */
209	uint64_t remote_addr;		/* remote guy's TO */
210	uint32_t remote_len;		/* remote guy's LEN */
211
212	char *start_buf;		/* rdma read src */
213	u64  start_dma_addr;
214	DECLARE_PCI_UNMAP_ADDR(start_mapping)
215	struct ib_mr *start_mr;
216
217	enum test_state state;		/* used for cond/signalling */
218	wait_queue_head_t sem;
219	struct krping_stats stats;
220
221	uint16_t port;			/* dst port in NBO */
222	struct in_addr addr;		/* dst addr in NBO */
223	char *addr_str;			/* dst addr string */
224	int verbose;			/* verbose logging */
225	int count;			/* ping count */
226	int size;			/* ping data size */
227	int validate;			/* validate ping data */
228	int wlat;			/* run wlat test */
229	int rlat;			/* run rlat test */
230	int bw;				/* run bw test */
231	int duplex;			/* run bw full duplex test */
232	int poll;			/* poll or block for rlat test */
233	int txdepth;			/* SQ depth */
234	int local_dma_lkey;		/* use 0 for lkey */
235	int frtest;			/* fastreg test */
236
237	/* CM stuff */
238	struct rdma_cm_id *cm_id;	/* connection on client side,*/
239					/* listener on server side. */
240	struct rdma_cm_id *child_cm_id;	/* connection on server side */
241	struct list_head list;
242};
243
244static int krping_cma_event_handler(struct rdma_cm_id *cma_id,
245				   struct rdma_cm_event *event)
246{
247	int ret;
248	struct krping_cb *cb = cma_id->context;
249
250	DEBUG_LOG(cb, "cma_event type %d cma_id %p (%s)\n", event->event,
251	    cma_id, (cma_id == cb->cm_id) ? "parent" : "child");
252
253	switch (event->event) {
254	case RDMA_CM_EVENT_ADDR_RESOLVED:
255		cb->state = ADDR_RESOLVED;
256		ret = rdma_resolve_route(cma_id, 2000);
257		if (ret) {
258			PRINTF(cb, "rdma_resolve_route error %d\n", ret);
259			wake_up_interruptible(&cb->sem);
260		}
261		break;
262
263	case RDMA_CM_EVENT_ROUTE_RESOLVED:
264		cb->state = ROUTE_RESOLVED;
265		wake_up_interruptible(&cb->sem);
266		break;
267
268	case RDMA_CM_EVENT_CONNECT_REQUEST:
269		cb->state = CONNECT_REQUEST;
270		cb->child_cm_id = cma_id;
271		DEBUG_LOG(cb, "child cma %p\n", cb->child_cm_id);
272		wake_up_interruptible(&cb->sem);
273		break;
274
275	case RDMA_CM_EVENT_ESTABLISHED:
276		DEBUG_LOG(cb, "ESTABLISHED\n");
277		if (!cb->server) {
278			cb->state = CONNECTED;
279		}
280		wake_up_interruptible(&cb->sem);
281		break;
282
283	case RDMA_CM_EVENT_ADDR_ERROR:
284	case RDMA_CM_EVENT_ROUTE_ERROR:
285	case RDMA_CM_EVENT_CONNECT_ERROR:
286	case RDMA_CM_EVENT_UNREACHABLE:
287	case RDMA_CM_EVENT_REJECTED:
288		PRINTF(cb, "cma event %d, error %d\n", event->event,
289		       event->status);
290		cb->state = ERROR;
291		wake_up_interruptible(&cb->sem);
292		break;
293
294	case RDMA_CM_EVENT_DISCONNECTED:
295		PRINTF(cb, "DISCONNECT EVENT...\n");
296		cb->state = ERROR;
297		wake_up_interruptible(&cb->sem);
298		break;
299
300	case RDMA_CM_EVENT_DEVICE_REMOVAL:
301		PRINTF(cb, "cma detected device removal!!!!\n");
302		break;
303
304	default:
305		PRINTF(cb, "oof bad type!\n");
306		wake_up_interruptible(&cb->sem);
307		break;
308	}
309	return 0;
310}
311
312static int server_recv(struct krping_cb *cb, struct ib_wc *wc)
313{
314	if (wc->byte_len != sizeof(cb->recv_buf)) {
315		PRINTF(cb, "Received bogus data, size %d\n",
316		       wc->byte_len);
317		return -1;
318	}
319
320	cb->remote_rkey = ntohl(cb->recv_buf.rkey);
321	cb->remote_addr = ntohll(cb->recv_buf.buf);
322	cb->remote_len  = ntohl(cb->recv_buf.size);
323	DEBUG_LOG(cb, "Received rkey %x addr %llx len %d from peer\n",
324		  cb->remote_rkey, (unsigned long long)cb->remote_addr,
325		  cb->remote_len);
326
327	if (cb->state <= CONNECTED || cb->state == RDMA_WRITE_COMPLETE)
328		cb->state = RDMA_READ_ADV;
329	else
330		cb->state = RDMA_WRITE_ADV;
331
332	return 0;
333}
334
335static int client_recv(struct krping_cb *cb, struct ib_wc *wc)
336{
337	if (wc->byte_len != sizeof(cb->recv_buf)) {
338		PRINTF(cb, "Received bogus data, size %d\n",
339		       wc->byte_len);
340		return -1;
341	}
342
343	if (cb->state == RDMA_READ_ADV)
344		cb->state = RDMA_WRITE_ADV;
345	else
346		cb->state = RDMA_WRITE_COMPLETE;
347
348	return 0;
349}
350
351static void krping_cq_event_handler(struct ib_cq *cq, void *ctx)
352{
353	struct krping_cb *cb = ctx;
354	struct ib_wc wc;
355	struct ib_recv_wr *bad_wr;
356	int ret;
357
358	BUG_ON(cb->cq != cq);
359	if (cb->state == ERROR) {
360		PRINTF(cb, "cq completion in ERROR state\n");
361		return;
362	}
363	if (cb->frtest) {
364		PRINTF(cb, "cq completion event in frtest!\n");
365		return;
366	}
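	/*
	 * The ping/pong path runs the CQ in event-driven mode and re-arms
	 * notification here; the wlat/rlat/bw tests manage CQ polling and
	 * re-arming from their own loops instead.
	 */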
367	if (!cb->wlat && !cb->rlat && !cb->bw)
368		ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
369	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 1) {
370		if (wc.status) {
371			if (wc.status == IB_WC_WR_FLUSH_ERR) {
372				DEBUG_LOG(cb, "cq flushed\n");
373				continue;
374			} else {
375				PRINTF(cb, "cq completion failed with "
376				       "wr_id %Lx status %d opcode %d vendor_err %x\n",
377					wc.wr_id, wc.status, wc.opcode, wc.vendor_err);
378				goto error;
379			}
380		}
381
382		switch (wc.opcode) {
383		case IB_WC_SEND:
384			DEBUG_LOG(cb, "send completion\n");
385			cb->stats.send_bytes += cb->send_sgl.length;
386			cb->stats.send_msgs++;
387			break;
388
389		case IB_WC_RDMA_WRITE:
390			DEBUG_LOG(cb, "rdma write completion\n");
391			cb->stats.write_bytes += cb->rdma_sq_wr.sg_list->length;
392			cb->stats.write_msgs++;
393			cb->state = RDMA_WRITE_COMPLETE;
394			wake_up_interruptible(&cb->sem);
395			break;
396
397		case IB_WC_RDMA_READ:
398			DEBUG_LOG(cb, "rdma read completion\n");
399			cb->stats.read_bytes += cb->rdma_sq_wr.sg_list->length;
400			cb->stats.read_msgs++;
401			cb->state = RDMA_READ_COMPLETE;
402			wake_up_interruptible(&cb->sem);
403			break;
404
405		case IB_WC_RECV:
406			DEBUG_LOG(cb, "recv completion\n");
407			cb->stats.recv_bytes += sizeof(cb->recv_buf);
408			cb->stats.recv_msgs++;
409			if (cb->wlat || cb->rlat || cb->bw)
410				ret = server_recv(cb, &wc);
411			else
412				ret = cb->server ? server_recv(cb, &wc) :
413						   client_recv(cb, &wc);
414			if (ret) {
415				PRINTF(cb, "recv wc error: %d\n", ret);
416				goto error;
417			}
418
419			ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
420			if (ret) {
421				PRINTF(cb, "post recv error: %d\n",
422				       ret);
423				goto error;
424			}
425			wake_up_interruptible(&cb->sem);
426			break;
427
428		default:
429			PRINTF(cb,
430			       "%s:%d Unexpected opcode %d, Shutting down\n",
431			       __func__, __LINE__, wc.opcode);
432			goto error;
433		}
434	}
435	if (ret) {
436		PRINTF(cb, "poll error %d\n", ret);
437		goto error;
438	}
439	return;
440error:
441	cb->state = ERROR;
442	wake_up_interruptible(&cb->sem);
443}
444
445static int krping_accept(struct krping_cb *cb)
446{
447	struct rdma_conn_param conn_param;
448	int ret;
449
450	DEBUG_LOG(cb, "accepting client connection request\n");
451
452	memset(&conn_param, 0, sizeof conn_param);
453	conn_param.responder_resources = 1;
454	conn_param.initiator_depth = 1;
455
456	ret = rdma_accept(cb->child_cm_id, &conn_param);
457	if (ret) {
458		PRINTF(cb, "rdma_accept error: %d\n", ret);
459		return ret;
460	}
461
462	if (!cb->wlat && !cb->rlat && !cb->bw) {
463		wait_event_interruptible(cb->sem, cb->state >= CONNECTED);
464		if (cb->state == ERROR) {
465			PRINTF(cb, "wait for CONNECTED state %d\n",
466				cb->state);
467			return -1;
468		}
469	}
470	return 0;
471}
472
473static void krping_setup_wr(struct krping_cb *cb)
474{
475	cb->recv_sgl.addr = cb->recv_dma_addr;
476	cb->recv_sgl.length = sizeof cb->recv_buf;
477	if (cb->local_dma_lkey)
478		cb->recv_sgl.lkey = cb->qp->device->local_dma_lkey;
479	else if (cb->mem == DMA)
480		cb->recv_sgl.lkey = cb->dma_mr->lkey;
481	else
482		cb->recv_sgl.lkey = cb->recv_mr->lkey;
483	cb->rq_wr.sg_list = &cb->recv_sgl;
484	cb->rq_wr.num_sge = 1;
485
486	cb->send_sgl.addr = cb->send_dma_addr;
487	cb->send_sgl.length = sizeof cb->send_buf;
488	if (cb->local_dma_lkey)
489		cb->send_sgl.lkey = cb->qp->device->local_dma_lkey;
490	else if (cb->mem == DMA)
491		cb->send_sgl.lkey = cb->dma_mr->lkey;
492	else
493		cb->send_sgl.lkey = cb->send_mr->lkey;
494
495	cb->sq_wr.opcode = IB_WR_SEND;
496	cb->sq_wr.send_flags = IB_SEND_SIGNALED;
497	cb->sq_wr.sg_list = &cb->send_sgl;
498	cb->sq_wr.num_sge = 1;
499
500	if (cb->server || cb->wlat || cb->rlat || cb->bw) {
501		cb->rdma_sgl.addr = cb->rdma_dma_addr;
502		if (cb->mem == MR)
503			cb->rdma_sgl.lkey = cb->rdma_mr->lkey;
504		cb->rdma_sq_wr.send_flags = IB_SEND_SIGNALED;
505		cb->rdma_sq_wr.sg_list = &cb->rdma_sgl;
506		cb->rdma_sq_wr.num_sge = 1;
507	}
508
509	switch(cb->mem) {
510	case FASTREG:
511
512		/*
513		 * A chain of 2 WRs, INVALIDATE_MR + FAST_REG_MR,
514		 * both unsignaled.  The client uses them to reregister
515		 * the rdma buffers with a new key each iteration.
516		 */
517		cb->fastreg_wr.opcode = IB_WR_FAST_REG_MR;
518		cb->fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
519		cb->fastreg_wr.wr.fast_reg.length = cb->size;
520		cb->fastreg_wr.wr.fast_reg.page_list = cb->page_list;
521		cb->fastreg_wr.wr.fast_reg.page_list_len = cb->page_list_len;
522
523		cb->invalidate_wr.next = &cb->fastreg_wr;
524		cb->invalidate_wr.opcode = IB_WR_LOCAL_INV;
525		break;
526	case MW:
527		cb->bind_attr.wr_id = 0xabbaabba;
528		cb->bind_attr.send_flags = 0; /* unsignaled */
529		cb->bind_attr.length = cb->size;
530		break;
531	default:
532		break;
533	}
534}
535
536static int krping_setup_buffers(struct krping_cb *cb)
537{
538	int ret;
539	struct ib_phys_buf buf;
540	u64 iovbase;
541
542	DEBUG_LOG(cb, "krping_setup_buffers called on cb %p\n", cb);
543
544	cb->recv_dma_addr = dma_map_single(cb->pd->device->dma_device,
545				   &cb->recv_buf,
546				   sizeof(cb->recv_buf), DMA_BIDIRECTIONAL);
547	pci_unmap_addr_set(cb, recv_mapping, cb->recv_dma_addr);
548	cb->send_dma_addr = dma_map_single(cb->pd->device->dma_device,
549					   &cb->send_buf, sizeof(cb->send_buf),
550					   DMA_BIDIRECTIONAL);
551	pci_unmap_addr_set(cb, send_mapping, cb->send_dma_addr);
552
553	if (cb->mem == DMA) {
554		cb->dma_mr = ib_get_dma_mr(cb->pd, IB_ACCESS_LOCAL_WRITE|
555					   IB_ACCESS_REMOTE_READ|
556				           IB_ACCESS_REMOTE_WRITE);
557		if (IS_ERR(cb->dma_mr)) {
558			DEBUG_LOG(cb, "reg_dmamr failed\n");
559			ret = PTR_ERR(cb->dma_mr);
560			goto bail;
561		}
562	} else {
563		if (!cb->local_dma_lkey) {
564			buf.addr = cb->recv_dma_addr;
565			buf.size = sizeof cb->recv_buf;
566			DEBUG_LOG(cb, "recv buf dma_addr %llx size %d\n", buf.addr,
567				(int)buf.size);
568			iovbase = cb->recv_dma_addr;
569			cb->recv_mr = ib_reg_phys_mr(cb->pd, &buf, 1,
570						     IB_ACCESS_LOCAL_WRITE,
571						     &iovbase);
572
573			if (IS_ERR(cb->recv_mr)) {
574				DEBUG_LOG(cb, "recv_buf reg_mr failed\n");
575				ret = PTR_ERR(cb->recv_mr);
576				goto bail;
577			}
578
579			buf.addr = cb->send_dma_addr;
580			buf.size = sizeof cb->send_buf;
581			DEBUG_LOG(cb, "send buf dma_addr %llx size %d\n", buf.addr,
582				(int)buf.size);
583			iovbase = cb->send_dma_addr;
584			cb->send_mr = ib_reg_phys_mr(cb->pd, &buf, 1,
585						     0, &iovbase);
586
587			if (IS_ERR(cb->send_mr)) {
588				DEBUG_LOG(cb, "send_buf reg_mr failed\n");
589				ret = PTR_ERR(cb->send_mr);
590				goto bail;
591			}
592		}
593	}
594
595	cb->rdma_buf = kmalloc(cb->size, GFP_KERNEL);
596	if (!cb->rdma_buf) {
597		DEBUG_LOG(cb, "rdma_buf malloc failed\n");
598		ret = -ENOMEM;
599		goto bail;
600	}
601
602	cb->rdma_dma_addr = dma_map_single(cb->pd->device->dma_device,
603			       cb->rdma_buf, cb->size,
604			       DMA_BIDIRECTIONAL);
605	pci_unmap_addr_set(cb, rdma_mapping, cb->rdma_dma_addr);
606	if (cb->mem != DMA) {
607		switch (cb->mem) {
608		case FASTREG:
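			/* number of pages needed to cover cb->size bytes */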
609			cb->page_list_len = (((cb->size - 1) & PAGE_MASK) +
610				PAGE_SIZE) >> PAGE_SHIFT;
611			cb->page_list = ib_alloc_fast_reg_page_list(
612						cb->pd->device,
613						cb->page_list_len);
614			if (IS_ERR(cb->page_list)) {
615				DEBUG_LOG(cb, "ib_alloc_fast_reg_page_list failed\n");
616				ret = PTR_ERR(cb->page_list);
617				goto bail;
618			}
619			cb->fastreg_mr = ib_alloc_fast_reg_mr(cb->pd,
620					cb->page_list->max_page_list_len);
621			if (IS_ERR(cb->fastreg_mr)) {
622				DEBUG_LOG(cb, "ib_alloc_fast_reg_mr failed\n");
623				ret = PTR_ERR(cb->fastreg_mr);
624				goto bail;
625			}
626			DEBUG_LOG(cb, "fastreg rkey 0x%x page_list %p"
627				" page_list_len %u\n", cb->fastreg_mr->rkey,
628				cb->page_list, cb->page_list_len);
629			break;
630		case MW:
631			cb->mw = ib_alloc_mw(cb->pd);
632			if (IS_ERR(cb->mw)) {
633				DEBUG_LOG(cb, "ib_alloc_mw failed\n");
634				ret = PTR_ERR(cb->mw);
635				goto bail;
636			}
637			DEBUG_LOG(cb, "mw rkey 0x%x\n", cb->mw->rkey);
638			/*FALLTHROUGH*/
639		case MR:
640			buf.addr = cb->rdma_dma_addr;
641			buf.size = cb->size;
642			iovbase = cb->rdma_dma_addr;
643			cb->rdma_mr = ib_reg_phys_mr(cb->pd, &buf, 1,
644					     IB_ACCESS_REMOTE_READ|
645					     IB_ACCESS_REMOTE_WRITE,
646					     &iovbase);
647			if (IS_ERR(cb->rdma_mr)) {
648				DEBUG_LOG(cb, "rdma_buf reg_mr failed\n");
649				ret = PTR_ERR(cb->rdma_mr);
650				goto bail;
651			}
652			DEBUG_LOG(cb, "rdma buf dma_addr %llx size %d mr rkey 0x%x\n",
653				buf.addr, (int)buf.size, cb->rdma_mr->rkey);
654			break;
655		default:
656			ret = -EINVAL;
657			goto bail;
658			break;
659		}
660	}
661
662	if (!cb->server || cb->wlat || cb->rlat || cb->bw) {
663
664		cb->start_buf = kmalloc(cb->size, GFP_KERNEL);
665		if (!cb->start_buf) {
666			DEBUG_LOG(cb, "start_buf malloc failed\n");
667			ret = -ENOMEM;
668			goto bail;
669		}
670
671		cb->start_dma_addr = dma_map_single(cb->pd->device->dma_device,
672						   cb->start_buf, cb->size,
673						   DMA_BIDIRECTIONAL);
674		pci_unmap_addr_set(cb, start_mapping, cb->start_dma_addr);
675
676		if (cb->mem == MR || cb->mem == MW) {
677			unsigned flags = IB_ACCESS_REMOTE_READ;
678
679			if (cb->wlat || cb->rlat || cb->bw)
680				flags |= IB_ACCESS_REMOTE_WRITE;
681
682			buf.addr = cb->start_dma_addr;
683			buf.size = cb->size;
684			DEBUG_LOG(cb, "start buf dma_addr %llx size %d\n",
685				buf.addr, (int)buf.size);
686			iovbase = cb->start_dma_addr;
687			cb->start_mr = ib_reg_phys_mr(cb->pd, &buf, 1,
688					     flags,
689					     &iovbase);
690
691			if (IS_ERR(cb->start_mr)) {
692				DEBUG_LOG(cb, "start_buf reg_mr failed\n");
693				ret = PTR_ERR(cb->start_mr);
694				goto bail;
695			}
696		}
697	}
698
699	krping_setup_wr(cb);
700	DEBUG_LOG(cb, "allocated & registered buffers...\n");
701	return 0;
702bail:
703	if (cb->fastreg_mr && !IS_ERR(cb->fastreg_mr))
704		ib_dereg_mr(cb->fastreg_mr);
705	if (cb->mw && !IS_ERR(cb->mw))
706		ib_dealloc_mw(cb->mw);
707	if (cb->rdma_mr && !IS_ERR(cb->rdma_mr))
708		ib_dereg_mr(cb->rdma_mr);
709	if (cb->page_list && !IS_ERR(cb->page_list))
710		ib_free_fast_reg_page_list(cb->page_list);
711	if (cb->dma_mr && !IS_ERR(cb->dma_mr))
712		ib_dereg_mr(cb->dma_mr);
713	if (cb->recv_mr && !IS_ERR(cb->recv_mr))
714		ib_dereg_mr(cb->recv_mr);
715	if (cb->send_mr && !IS_ERR(cb->send_mr))
716		ib_dereg_mr(cb->send_mr);
717	if (cb->rdma_buf)
718		kfree(cb->rdma_buf);
719	if (cb->start_buf)
720		kfree(cb->start_buf);
721	return ret;
722}
723
724static void krping_free_buffers(struct krping_cb *cb)
725{
726	DEBUG_LOG(cb, "krping_free_buffers called on cb %p\n", cb);
727
728	if (cb->dma_mr)
729		ib_dereg_mr(cb->dma_mr);
730	if (cb->send_mr)
731		ib_dereg_mr(cb->send_mr);
732	if (cb->recv_mr)
733		ib_dereg_mr(cb->recv_mr);
734	if (cb->rdma_mr)
735		ib_dereg_mr(cb->rdma_mr);
736	if (cb->start_mr)
737		ib_dereg_mr(cb->start_mr);
738	if (cb->fastreg_mr)
739		ib_dereg_mr(cb->fastreg_mr);
740	if (cb->mw)
741		ib_dealloc_mw(cb->mw);
742
743	dma_unmap_single(cb->pd->device->dma_device,
744			 pci_unmap_addr(cb, recv_mapping),
745			 sizeof(cb->recv_buf), DMA_BIDIRECTIONAL);
746	dma_unmap_single(cb->pd->device->dma_device,
747			 pci_unmap_addr(cb, send_mapping),
748			 sizeof(cb->send_buf), DMA_BIDIRECTIONAL);
749	dma_unmap_single(cb->pd->device->dma_device,
750			 pci_unmap_addr(cb, rdma_mapping),
751			 cb->size, DMA_BIDIRECTIONAL);
752	kfree(cb->rdma_buf);
753	if (cb->start_buf) {
754		dma_unmap_single(cb->pd->device->dma_device,
755			 pci_unmap_addr(cb, start_mapping),
756			 cb->size, DMA_BIDIRECTIONAL);
757		kfree(cb->start_buf);
758	}
759}
760
761static int krping_create_qp(struct krping_cb *cb)
762{
763	struct ib_qp_init_attr init_attr;
764	int ret;
765
766	memset(&init_attr, 0, sizeof(init_attr));
767	init_attr.cap.max_send_wr = cb->txdepth;
768	init_attr.cap.max_recv_wr = 2;
769	init_attr.cap.max_recv_sge = 1;
770	init_attr.cap.max_send_sge = 1;
771	init_attr.qp_type = IB_QPT_RC;
772	init_attr.send_cq = cb->cq;
773	init_attr.recv_cq = cb->cq;
774	init_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
775
776	if (cb->server) {
777		ret = rdma_create_qp(cb->child_cm_id, cb->pd, &init_attr);
778		if (!ret)
779			cb->qp = cb->child_cm_id->qp;
780	} else {
781		ret = rdma_create_qp(cb->cm_id, cb->pd, &init_attr);
782		if (!ret)
783			cb->qp = cb->cm_id->qp;
784	}
785
786	return ret;
787}
788
789static void krping_free_qp(struct krping_cb *cb)
790{
791	ib_destroy_qp(cb->qp);
792	ib_destroy_cq(cb->cq);
793	ib_dealloc_pd(cb->pd);
794}
795
796static int krping_setup_qp(struct krping_cb *cb, struct rdma_cm_id *cm_id)
797{
798	int ret;
799	cb->pd = ib_alloc_pd(cm_id->device);
800	if (IS_ERR(cb->pd)) {
801		PRINTF(cb, "ib_alloc_pd failed\n");
802		return PTR_ERR(cb->pd);
803	}
804	DEBUG_LOG(cb, "created pd %p\n", cb->pd);
805
806	strlcpy(cb->stats.name, cb->pd->device->name, sizeof(cb->stats.name));
807
808	cb->cq = ib_create_cq(cm_id->device, krping_cq_event_handler, NULL,
809			      cb, cb->txdepth * 2, 0);
810	if (IS_ERR(cb->cq)) {
811		PRINTF(cb, "ib_create_cq failed\n");
812		ret = PTR_ERR(cb->cq);
813		goto err1;
814	}
815	DEBUG_LOG(cb, "created cq %p\n", cb->cq);
816
817	if (!cb->wlat && !cb->rlat && !cb->bw && !cb->frtest) {
818		ret = ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
819		if (ret) {
820			PRINTF(cb, "ib_req_notify_cq failed\n");
821			goto err2;
822		}
823	}
824
825	ret = krping_create_qp(cb);
826	if (ret) {
827		PRINTF(cb, "krping_create_qp failed: %d\n", ret);
828		goto err2;
829	}
830	DEBUG_LOG(cb, "created qp %p\n", cb->qp);
831	return 0;
832err2:
833	ib_destroy_cq(cb->cq);
834err1:
835	ib_dealloc_pd(cb->pd);
836	return ret;
837}
838
839/*
840 * return the (possibly rebound) rkey for the rdma buffer.
841 * FASTREG mode: invalidate and rebind via fastreg wr.
842 * MW mode: rebind the MW.
843 * other modes: just return the mr rkey.
844 */
845static u32 krping_rdma_rkey(struct krping_cb *cb, u64 buf, int post_inv)
846{
847	u32 rkey = 0xffffffff;
848	u64 p;
849	struct ib_send_wr *bad_wr;
850	int i;
851	int ret;
852
853	switch (cb->mem) {
854	case FASTREG:
855		cb->invalidate_wr.ex.invalidate_rkey = cb->fastreg_mr->rkey;
856
857		/*
858		 * Update the fastreg key.
859		 */
860		ib_update_fast_reg_key(cb->fastreg_mr, ++cb->key);
861		cb->fastreg_wr.wr.fast_reg.rkey = cb->fastreg_mr->rkey;
862
863		/*
864		 * Update the fastreg WR with new buf info.
865		 */
866		if (buf == (u64)cb->start_dma_addr)
867			cb->fastreg_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_READ;
868		else
869			cb->fastreg_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
870		cb->fastreg_wr.wr.fast_reg.iova_start = buf;
871		p = (u64)(buf & PAGE_MASK);
872		for (i=0; i < cb->fastreg_wr.wr.fast_reg.page_list_len;
873		     i++, p += PAGE_SIZE) {
874			cb->page_list->page_list[i] = p;
875			DEBUG_LOG(cb, "page_list[%d] 0x%llx\n", i, p);
876		}
877
878		DEBUG_LOG(cb, "post_inv = %d, fastreg new rkey 0x%x shift %u len %u"
879			" iova_start %llx page_list_len %u\n",
880			post_inv,
881			cb->fastreg_wr.wr.fast_reg.rkey,
882			cb->fastreg_wr.wr.fast_reg.page_shift,
883			cb->fastreg_wr.wr.fast_reg.length,
884			cb->fastreg_wr.wr.fast_reg.iova_start,
885			cb->fastreg_wr.wr.fast_reg.page_list_len);
886
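		/*
		 * With post_inv set, the LOCAL_INV WR is posted first and
		 * the FAST_REG_MR WR follows it via invalidate_wr.next;
		 * otherwise only the FAST_REG_MR is posted and the old rkey
		 * is left for the peer to invalidate (send-with-inv).
		 */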
887		if (post_inv)
888			ret = ib_post_send(cb->qp, &cb->invalidate_wr, &bad_wr);
889		else
890			ret = ib_post_send(cb->qp, &cb->fastreg_wr, &bad_wr);
891		if (ret) {
892			PRINTF(cb, "post send error %d\n", ret);
893			cb->state = ERROR;
894		}
895		rkey = cb->fastreg_mr->rkey;
896		break;
897	case MW:
898		/*
899		 * Update the MW with new buf info.
900		 */
901		if (buf == (u64)cb->start_dma_addr) {
902			cb->bind_attr.mw_access_flags = IB_ACCESS_REMOTE_READ;
903			cb->bind_attr.mr = cb->start_mr;
904		} else {
905			cb->bind_attr.mw_access_flags = IB_ACCESS_REMOTE_WRITE;
906			cb->bind_attr.mr = cb->rdma_mr;
907		}
908		cb->bind_attr.addr = buf;
909		DEBUG_LOG(cb, "binding mw rkey 0x%x to buf %llx mr rkey 0x%x\n",
910			cb->mw->rkey, buf, cb->bind_attr.mr->rkey);
911		ret = ib_bind_mw(cb->qp, cb->mw, &cb->bind_attr);
912		if (ret) {
913			PRINTF(cb, "bind mw error %d\n", ret);
914			cb->state = ERROR;
915		} else
916			rkey = cb->mw->rkey;
917		break;
918	case MR:
919		if (buf == (u64)cb->start_dma_addr)
920			rkey = cb->start_mr->rkey;
921		else
922			rkey = cb->rdma_mr->rkey;
923		break;
924	case DMA:
925		rkey = cb->dma_mr->rkey;
926		break;
927	default:
928		PRINTF(cb, "%s:%d case ERROR\n", __func__, __LINE__);
929		cb->state = ERROR;
930		break;
931	}
932	return rkey;
933}
934
935static void krping_format_send(struct krping_cb *cb, u64 buf)
936{
937	struct krping_rdma_info *info = &cb->send_buf;
938	u32 rkey;
939
940	/*
941	 * Client side will do fastreg or mw bind before
942	 * advertising the rdma buffer.  Server side
943	 * sends have no data.
944	 */
945	if (!cb->server || cb->wlat || cb->rlat || cb->bw) {
946		rkey = krping_rdma_rkey(cb, buf, !cb->server_invalidate);
947		info->buf = htonll(buf);
948		info->rkey = htonl(rkey);
949		info->size = htonl(cb->size);
950		DEBUG_LOG(cb, "RDMA addr %llx rkey %x len %d\n",
951			  (unsigned long long)buf, rkey, cb->size);
952	}
953}
954
955static void krping_test_server(struct krping_cb *cb)
956{
957	struct ib_send_wr *bad_wr, inv;
958	int ret;
959
960	while (1) {
961		/* Wait for client's Start STAG/TO/Len */
962		wait_event_interruptible(cb->sem, cb->state >= RDMA_READ_ADV);
963		if (cb->state != RDMA_READ_ADV) {
964			PRINTF(cb, "wait for RDMA_READ_ADV state %d\n",
965				cb->state);
966			break;
967		}
968
969		DEBUG_LOG(cb, "server received sink adv\n");
970
971		cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
972		cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
973		cb->rdma_sq_wr.sg_list->length = cb->remote_len;
974		cb->rdma_sgl.lkey = krping_rdma_rkey(cb, cb->rdma_dma_addr, 1);
975		cb->rdma_sq_wr.next = NULL;
976
977		/* Issue RDMA Read. */
978		if (cb->read_inv)
979			cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ_WITH_INV;
980		else {
981
982			cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ;
983			if (cb->mem == FASTREG) {
984				/*
985				 * Immediately follow the read with a
986				 * fenced LOCAL_INV.
987				 */
988				cb->rdma_sq_wr.next = &inv;
989				memset(&inv, 0, sizeof inv);
990				inv.opcode = IB_WR_LOCAL_INV;
991				inv.ex.invalidate_rkey = cb->fastreg_mr->rkey;
992				inv.send_flags = IB_SEND_FENCE;
993			}
994		}
995
996		ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
997		if (ret) {
998			PRINTF(cb, "post send error %d\n", ret);
999			break;
1000		}
1001		cb->rdma_sq_wr.next = NULL;
1002
1003		DEBUG_LOG(cb, "server posted rdma read req \n");
1004
1005		/* Wait for read completion */
1006		wait_event_interruptible(cb->sem,
1007					 cb->state >= RDMA_READ_COMPLETE);
1008		if (cb->state != RDMA_READ_COMPLETE) {
1009			PRINTF(cb,
1010			       "wait for RDMA_READ_COMPLETE state %d\n",
1011			       cb->state);
1012			break;
1013		}
1014		DEBUG_LOG(cb, "server received read complete\n");
1015
1016		/* Display data in recv buf */
1017		if (cb->verbose)
1018			PRINTF(cb, "server ping data: %s\n",
1019				cb->rdma_buf);
1020
1021		/* Tell client to continue */
1022		if (cb->server && cb->server_invalidate) {
1023			cb->sq_wr.ex.invalidate_rkey = cb->remote_rkey;
1024			cb->sq_wr.opcode = IB_WR_SEND_WITH_INV;
1025			DEBUG_LOG(cb, "send-w-inv rkey 0x%x\n", cb->remote_rkey);
1026		}
1027		ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1028		if (ret) {
1029			PRINTF(cb, "post send error %d\n", ret);
1030			break;
1031		}
1032		DEBUG_LOG(cb, "server posted go ahead\n");
1033
1034		/* Wait for client's RDMA STAG/TO/Len */
1035		wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_ADV);
1036		if (cb->state != RDMA_WRITE_ADV) {
1037			PRINTF(cb,
1038			       "wait for RDMA_WRITE_ADV state %d\n",
1039			       cb->state);
1040			break;
1041		}
1042		DEBUG_LOG(cb, "server received sink adv\n");
1043
1044		/* RDMA Write echo data */
1045		cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
1046		cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1047		cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1048		cb->rdma_sq_wr.sg_list->length = strlen(cb->rdma_buf) + 1;
1049		if (cb->local_dma_lkey)
1050			cb->rdma_sgl.lkey = cb->qp->device->local_dma_lkey;
1051		else
1052			cb->rdma_sgl.lkey = krping_rdma_rkey(cb, cb->rdma_dma_addr, 0);
1053
1054		DEBUG_LOG(cb, "rdma write from lkey %x laddr %llx len %d\n",
1055			  cb->rdma_sq_wr.sg_list->lkey,
1056			  (unsigned long long)cb->rdma_sq_wr.sg_list->addr,
1057			  cb->rdma_sq_wr.sg_list->length);
1058
1059		ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
1060		if (ret) {
1061			PRINTF(cb, "post send error %d\n", ret);
1062			break;
1063		}
1064
1065		/* Wait for completion */
1066		ret = wait_event_interruptible(cb->sem, cb->state >=
1067							 RDMA_WRITE_COMPLETE);
1068		if (cb->state != RDMA_WRITE_COMPLETE) {
1069			PRINTF(cb,
1070			       "wait for RDMA_WRITE_COMPLETE state %d\n",
1071			       cb->state);
1072			break;
1073		}
1074		DEBUG_LOG(cb, "server rdma write complete \n");
1075
1076		cb->state = CONNECTED;
1077
1078		/* Tell client to begin again */
1079		if (cb->server && cb->server_invalidate) {
1080			cb->sq_wr.ex.invalidate_rkey = cb->remote_rkey;
1081			cb->sq_wr.opcode = IB_WR_SEND_WITH_INV;
1082			DEBUG_LOG(cb, "send-w-inv rkey 0x%x\n", cb->remote_rkey);
1083		}
1084		ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1085		if (ret) {
1086			PRINTF(cb, "post send error %d\n", ret);
1087			break;
1088		}
1089		DEBUG_LOG(cb, "server posted go ahead\n");
1090	}
1091}
1092
1093static void rlat_test(struct krping_cb *cb)
1094{
1095	int scnt;
1096	int iters = cb->count;
1097	struct timeval start_tv, stop_tv;
1098	int ret;
1099	struct ib_wc wc;
1100	struct ib_send_wr *bad_wr;
1101	int ne;
1102
1103	scnt = 0;
1104	cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ;
1105	cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1106	cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1107	cb->rdma_sq_wr.sg_list->length = cb->size;
1108
1109	microtime(&start_tv);
1110	if (!cb->poll) {
1111		cb->state = RDMA_READ_ADV;
1112		ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
1113	}
1114	while (scnt < iters) {
1115
1116		cb->state = RDMA_READ_ADV;
1117		ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
1118		if (ret) {
1119			PRINTF(cb,
1120				"Couldn't post send: ret=%d scnt %d\n",
1121				ret, scnt);
1122			return;
1123		}
1124
1125		do {
1126			if (!cb->poll) {
1127				wait_event_interruptible(cb->sem,
1128					cb->state != RDMA_READ_ADV);
1129				if (cb->state == RDMA_READ_COMPLETE) {
1130					ne = 1;
1131					ib_req_notify_cq(cb->cq,
1132						IB_CQ_NEXT_COMP);
1133				} else {
1134					ne = -1;
1135				}
1136			} else
1137				ne = ib_poll_cq(cb->cq, 1, &wc);
1138			if (cb->state == ERROR) {
1139				PRINTF(cb,
1140					"state == ERROR...bailing scnt %d\n",
1141					scnt);
1142				return;
1143			}
1144		} while (ne == 0);
1145
1146		if (ne < 0) {
1147			PRINTF(cb, "poll CQ failed %d\n", ne);
1148			return;
1149		}
1150		if (cb->poll && wc.status != IB_WC_SUCCESS) {
1151			PRINTF(cb, "Completion with error at %s:\n",
1152				cb->server ? "server" : "client");
1153			PRINTF(cb, "Failed status %d: wr_id %d\n",
1154				wc.status, (int) wc.wr_id);
1155			return;
1156		}
1157		++scnt;
1158	}
1159	microtime(&stop_tv);
1160
1161        if (stop_tv.tv_usec < start_tv.tv_usec) {
1162                stop_tv.tv_usec += 1000000;
1163                stop_tv.tv_sec  -= 1;
1164        }
1165
1166	PRINTF(cb, "delta sec %lu delta usec %lu iter %d size %d\n",
1167		stop_tv.tv_sec - start_tv.tv_sec,
1168		stop_tv.tv_usec - start_tv.tv_usec,
1169		scnt, cb->size);
1170}
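
/*
 * The summary line above reports total wall time; average per-operation
 * read latency is (delta sec * 1000000 + delta usec) / iter microseconds.
 */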
1171
1172static void wlat_test(struct krping_cb *cb)
1173{
1174	int ccnt, scnt, rcnt;
1175	int iters=cb->count;
1176	volatile char *poll_buf = (char *) cb->start_buf;
1177	char *buf = (char *)cb->rdma_buf;
1178	struct timeval start_tv, stop_tv;
1179	cycles_t *post_cycles_start, *post_cycles_stop;
1180	cycles_t *poll_cycles_start, *poll_cycles_stop;
1181	cycles_t *last_poll_cycles_start;
1182	cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0;
1183	int i;
1184	int cycle_iters = 1000;
1185
1186	ccnt = 0;
1187	scnt = 0;
1188	rcnt = 0;
1189
1190	post_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1191	if (!post_cycles_start) {
1192		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1193		return;
1194	}
1195	post_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1196	if (!post_cycles_stop) {
1197		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1198		return;
1199	}
1200	poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1201	if (!poll_cycles_start) {
1202		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1203		return;
1204	}
1205	poll_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1206	if (!poll_cycles_stop) {
1207		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1208		return;
1209	}
1210	last_poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t),
1211		GFP_KERNEL);
1212	if (!last_poll_cycles_start) {
1213		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1214		return;
1215	}
1216	cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
1217	cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1218	cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1219	cb->rdma_sq_wr.sg_list->length = cb->size;
1220
1221	if (cycle_iters > iters)
1222		cycle_iters = iters;
1223	microtime(&start_tv);
1224	while (scnt < iters || ccnt < iters || rcnt < iters) {
1225
1226		/* Wait till buffer changes. */
1227		if (rcnt < iters && !(scnt < 1 && !cb->server)) {
1228			++rcnt;
1229			while (*poll_buf != (char)rcnt) {
1230				if (cb->state == ERROR) {
1231					PRINTF(cb,
1232						"state = ERROR, bailing\n");
1233					return;
1234				}
1235			}
1236		}
1237
1238		if (scnt < iters) {
1239			struct ib_send_wr *bad_wr;
1240
1241			*buf = (char)scnt+1;
1242			if (scnt < cycle_iters)
1243				post_cycles_start[scnt] = get_cycles();
1244			if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
1245				PRINTF(cb,
1246					"Couldn't post send: scnt=%d\n",
1247					scnt);
1248				return;
1249			}
1250			if (scnt < cycle_iters)
1251				post_cycles_stop[scnt] = get_cycles();
1252			scnt++;
1253		}
1254
1255		if (ccnt < iters) {
1256			struct ib_wc wc;
1257			int ne;
1258
1259			if (ccnt < cycle_iters)
1260				poll_cycles_start[ccnt] = get_cycles();
1261			do {
1262				if (ccnt < cycle_iters)
1263					last_poll_cycles_start[ccnt] =
1264						get_cycles();
1265				ne = ib_poll_cq(cb->cq, 1, &wc);
1266			} while (ne == 0);
1267			if (ccnt < cycle_iters)
1268				poll_cycles_stop[ccnt] = get_cycles();
1269			++ccnt;
1270
1271			if (ne < 0) {
1272				PRINTF(cb, "poll CQ failed %d\n", ne);
1273				return;
1274			}
1275			if (wc.status != IB_WC_SUCCESS) {
1276				PRINTF(cb,
1277					"Completion with error at %s:\n",
1278					cb->server ? "server" : "client");
1279				PRINTF(cb,
1280					"Failed status %d: wr_id %d\n",
1281					wc.status, (int) wc.wr_id);
1282				PRINTF(cb,
1283					"scnt=%d, rcnt=%d, ccnt=%d\n",
1284					scnt, rcnt, ccnt);
1285				return;
1286			}
1287		}
1288	}
1289	microtime(&stop_tv);
1290
1291        if (stop_tv.tv_usec < start_tv.tv_usec) {
1292                stop_tv.tv_usec += 1000000;
1293                stop_tv.tv_sec  -= 1;
1294        }
1295
1296	for (i=0; i < cycle_iters; i++) {
1297		sum_post += post_cycles_stop[i] - post_cycles_start[i];
1298		sum_poll += poll_cycles_stop[i] - poll_cycles_start[i];
1299		sum_last_poll += poll_cycles_stop[i]-last_poll_cycles_start[i];
1300	}
1301	PRINTF(cb,
1302		"delta sec %lu delta usec %lu iter %d size %d cycle_iters %d"
1303		" sum_post %llu sum_poll %llu sum_last_poll %llu\n",
1304		stop_tv.tv_sec - start_tv.tv_sec,
1305		stop_tv.tv_usec - start_tv.tv_usec,
1306		scnt, cb->size, cycle_iters,
1307		(unsigned long long)sum_post, (unsigned long long)sum_poll,
1308		(unsigned long long)sum_last_poll);
1309	kfree(post_cycles_start);
1310	kfree(post_cycles_stop);
1311	kfree(poll_cycles_start);
1312	kfree(poll_cycles_stop);
1313	kfree(last_poll_cycles_start);
1314}
1315
1316static void bw_test(struct krping_cb *cb)
1317{
1318	int ccnt, scnt, rcnt;
1319	int iters=cb->count;
1320	struct timeval start_tv, stop_tv;
1321	cycles_t *post_cycles_start, *post_cycles_stop;
1322	cycles_t *poll_cycles_start, *poll_cycles_stop;
1323	cycles_t *last_poll_cycles_start;
1324	cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0;
1325	int i;
1326	int cycle_iters = 1000;
1327
1328	ccnt = 0;
1329	scnt = 0;
1330	rcnt = 0;
1331
1332	post_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1333	if (!post_cycles_start) {
1334		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1335		return;
1336	}
1337	post_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1338	if (!post_cycles_stop) {
1339		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1340		return;
1341	}
1342	poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1343	if (!poll_cycles_start) {
1344		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1345		return;
1346	}
1347	poll_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1348	if (!poll_cycles_stop) {
1349		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1350		return;
1351	}
1352	last_poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t),
1353		GFP_KERNEL);
1354	if (!last_poll_cycles_start) {
1355		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1356		return;
1357	}
1358	cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
1359	cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1360	cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1361	cb->rdma_sq_wr.sg_list->length = cb->size;
1362
1363	if (cycle_iters > iters)
1364		cycle_iters = iters;
1365	microtime(&start_tv);
1366	while (scnt < iters || ccnt < iters) {
1367
1368		while (scnt < iters && scnt - ccnt < cb->txdepth) {
1369			struct ib_send_wr *bad_wr;
1370
1371			if (scnt < cycle_iters)
1372				post_cycles_start[scnt] = get_cycles();
1373			if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
1374				PRINTF(cb,
1375					"Couldn't post send: scnt=%d\n",
1376					scnt);
1377				return;
1378			}
1379			if (scnt < cycle_iters)
1380				post_cycles_stop[scnt] = get_cycles();
1381			++scnt;
1382		}
1383
1384		if (ccnt < iters) {
1385			int ne;
1386			struct ib_wc wc;
1387
1388			if (ccnt < cycle_iters)
1389				poll_cycles_start[ccnt] = get_cycles();
1390			do {
1391				if (ccnt < cycle_iters)
1392					last_poll_cycles_start[ccnt] =
1393						get_cycles();
1394				ne = ib_poll_cq(cb->cq, 1, &wc);
1395			} while (ne == 0);
1396			if (ccnt < cycle_iters)
1397				poll_cycles_stop[ccnt] = get_cycles();
1398			ccnt += 1;
1399
1400			if (ne < 0) {
1401				PRINTF(cb, "poll CQ failed %d\n", ne);
1402				return;
1403			}
1404			if (wc.status != IB_WC_SUCCESS) {
1405				PRINTF(cb,
1406					"Completion with error at %s:\n",
1407					cb->server ? "server" : "client");
1408				PRINTF(cb,
1409					"Failed status %d: wr_id %d\n",
1410					wc.status, (int) wc.wr_id);
1411				return;
1412			}
1413		}
1414	}
1415	microtime(&stop_tv);
1416
1417        if (stop_tv.tv_usec < start_tv.tv_usec) {
1418                stop_tv.tv_usec += 1000000;
1419                stop_tv.tv_sec  -= 1;
1420        }
1421
1422	for (i=0; i < cycle_iters; i++) {
1423		sum_post += post_cycles_stop[i] - post_cycles_start[i];
1424		sum_poll += poll_cycles_stop[i] - poll_cycles_start[i];
1425		sum_last_poll += poll_cycles_stop[i]-last_poll_cycles_start[i];
1426	}
1427	PRINTF(cb,
1428		"delta sec %lu delta usec %lu iter %d size %d cycle_iters %d"
1429		" sum_post %llu sum_poll %llu sum_last_poll %llu\n",
1430		stop_tv.tv_sec - start_tv.tv_sec,
1431		stop_tv.tv_usec - start_tv.tv_usec,
1432		scnt, cb->size, cycle_iters,
1433		(unsigned long long)sum_post, (unsigned long long)sum_poll,
1434		(unsigned long long)sum_last_poll);
1435	kfree(post_cycles_start);
1436	kfree(post_cycles_stop);
1437	kfree(poll_cycles_start);
1438	kfree(poll_cycles_stop);
1439	kfree(last_poll_cycles_start);
1440}
1441
1442static void krping_rlat_test_server(struct krping_cb *cb)
1443{
1444	struct ib_send_wr *bad_wr;
1445	struct ib_wc wc;
1446	int ret;
1447
1448	/* Spin waiting for client's Start STAG/TO/Len */
1449	while (cb->state < RDMA_READ_ADV) {
1450		krping_cq_event_handler(cb->cq, cb);
1451	}
1452
1453	/* Send STAG/TO/Len to client */
1454	krping_format_send(cb, cb->start_dma_addr);
1455	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1456	if (ret) {
1457		PRINTF(cb, "post send error %d\n", ret);
1458		return;
1459	}
1460
1461	/* Spin waiting for send completion */
1462	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0);
1463	if (ret < 0) {
1464		PRINTF(cb, "poll error %d\n", ret);
1465		return;
1466	}
1467	if (wc.status) {
1468		PRINTF(cb, "send completion error %d\n", wc.status);
1469		return;
1470	}
1471
1472	wait_event_interruptible(cb->sem, cb->state == ERROR);
1473}
1474
1475static void krping_wlat_test_server(struct krping_cb *cb)
1476{
1477	struct ib_send_wr *bad_wr;
1478	struct ib_wc wc;
1479	int ret;
1480
1481	/* Spin waiting for client's Start STAG/TO/Len */
1482	while (cb->state < RDMA_READ_ADV) {
1483		krping_cq_event_handler(cb->cq, cb);
1484	}
1485
1486	/* Send STAG/TO/Len to client */
1487	krping_format_send(cb, cb->start_dma_addr);
1488	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1489	if (ret) {
1490		PRINTF(cb, "post send error %d\n", ret);
1491		return;
1492	}
1493
1494	/* Spin waiting for send completion */
1495	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0);
1496	if (ret < 0) {
1497		PRINTF(cb, "poll error %d\n", ret);
1498		return;
1499	}
1500	if (wc.status) {
1501		PRINTF(cb, "send completion error %d\n", wc.status);
1502		return;
1503	}
1504
1505	wlat_test(cb);
1506	wait_event_interruptible(cb->sem, cb->state == ERROR);
1507}
1508
1509static void krping_bw_test_server(struct krping_cb *cb)
1510{
1511	struct ib_send_wr *bad_wr;
1512	struct ib_wc wc;
1513	int ret;
1514
1515	/* Spin waiting for client's Start STAG/TO/Len */
1516	while (cb->state < RDMA_READ_ADV) {
1517		krping_cq_event_handler(cb->cq, cb);
1518	}
1519
1520	/* Send STAG/TO/Len to client */
1521	krping_format_send(cb, cb->start_dma_addr);
1522	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1523	if (ret) {
1524		PRINTF(cb, "post send error %d\n", ret);
1525		return;
1526	}
1527
1528	/* Spin waiting for send completion */
1529	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0);
1530	if (ret < 0) {
1531		PRINTF(cb, "poll error %d\n", ret);
1532		return;
1533	}
1534	if (wc.status) {
1535		PRINTF(cb, "send completion error %d\n", wc.status);
1536		return;
1537	}
1538
1539	if (cb->duplex)
1540		bw_test(cb);
1541	wait_event_interruptible(cb->sem, cb->state == ERROR);
1542}
1543
1544static int fastreg_supported(struct krping_cb *cb)
1545{
1546	struct ib_device *dev = cb->child_cm_id->device;
1547	struct ib_device_attr attr;
1548	int ret;
1549
1550	ret = ib_query_device(dev, &attr);
1551	if (ret) {
1552		PRINTF(cb, "ib_query_device failed ret %d\n", ret);
1553		return 0;
1554	}
1555	if (!(attr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) {
1556		PRINTF(cb, "Fastreg not supported - device_cap_flags 0x%x\n",
1557		    attr.device_cap_flags);
1558		return 0;
1559	}
1560	DEBUG_LOG(cb, "Fastreg supported - device_cap_flags 0x%x\n",
1561		attr.device_cap_flags);
1562	return 1;
1563}
1564
1565static int krping_bind_server(struct krping_cb *cb)
1566{
1567	struct sockaddr_in sin;
1568	int ret;
1569
1570	memset(&sin, 0, sizeof(sin));
1571	sin.sin_len = sizeof sin;
1572	sin.sin_family = AF_INET;
1573	sin.sin_addr.s_addr = cb->addr.s_addr;
1574	sin.sin_port = cb->port;
1575
1576	ret = rdma_bind_addr(cb->cm_id, (struct sockaddr *) &sin);
1577	if (ret) {
1578		PRINTF(cb, "rdma_bind_addr error %d\n", ret);
1579		return ret;
1580	}
1581	DEBUG_LOG(cb, "rdma_bind_addr successful\n");
1582
1583	DEBUG_LOG(cb, "rdma_listen\n");
1584	ret = rdma_listen(cb->cm_id, 3);
1585	if (ret) {
1586		PRINTF(cb, "rdma_listen failed: %d\n", ret);
1587		return ret;
1588	}
1589
1590	wait_event_interruptible(cb->sem, cb->state >= CONNECT_REQUEST);
1591	if (cb->state != CONNECT_REQUEST) {
1592		PRINTF(cb, "wait for CONNECT_REQUEST state %d\n",
1593			cb->state);
1594		return -1;
1595	}
1596
1597	if (cb->mem == FASTREG && !fastreg_supported(cb))
1598		return -EINVAL;
1599
1600	return 0;
1601}
1602
1603static void krping_run_server(struct krping_cb *cb)
1604{
1605	struct ib_recv_wr *bad_wr;
1606	int ret;
1607
1608	ret = krping_bind_server(cb);
1609	if (ret)
1610		return;
1611
1612	ret = krping_setup_qp(cb, cb->child_cm_id);
1613	if (ret) {
1614		PRINTF(cb, "setup_qp failed: %d\n", ret);
1615		goto err0;
1616	}
1617
1618	ret = krping_setup_buffers(cb);
1619	if (ret) {
1620		PRINTF(cb, "krping_setup_buffers failed: %d\n", ret);
1621		goto err1;
1622	}
1623
1624	ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
1625	if (ret) {
1626		PRINTF(cb, "ib_post_recv failed: %d\n", ret);
1627		goto err2;
1628	}
1629
1630	ret = krping_accept(cb);
1631	if (ret) {
1632		PRINTF(cb, "connect error %d\n", ret);
1633		goto err2;
1634	}
1635
1636	if (cb->wlat)
1637		krping_wlat_test_server(cb);
1638	else if (cb->rlat)
1639		krping_rlat_test_server(cb);
1640	else if (cb->bw)
1641		krping_bw_test_server(cb);
1642	else
1643		krping_test_server(cb);
1644	rdma_disconnect(cb->child_cm_id);
1645err2:
1646	krping_free_buffers(cb);
1647err1:
1648	krping_free_qp(cb);
1649err0:
1650	rdma_destroy_id(cb->child_cm_id);
1651}
1652
1653static void krping_test_client(struct krping_cb *cb)
1654{
1655	int ping, start, cc, i, ret;
1656	struct ib_send_wr *bad_wr;
1657	unsigned char c;
1658
1659	start = 65;
1660	for (ping = 0; !cb->count || ping < cb->count; ping++) {
1661		cb->state = RDMA_READ_ADV;
1662
1663		/* Put some ascii text in the buffer. */
1664		cc = sprintf(cb->start_buf, "rdma-ping-%d: ", ping);
1665		for (i = cc, c = start; i < cb->size; i++) {
1666			cb->start_buf[i] = c;
1667			c++;
1668			if (c > 122)
1669				c = 65;
1670		}
1671		start++;
1672		if (start > 122)
1673			start = 65;
1674		cb->start_buf[cb->size - 1] = 0;
1675
1676		krping_format_send(cb, cb->start_dma_addr);
1677		if (cb->state == ERROR) {
1678			PRINTF(cb, "krping_format_send failed\n");
1679			break;
1680		}
1681		ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1682		if (ret) {
1683			PRINTF(cb, "post send error %d\n", ret);
1684			break;
1685		}
1686
1687		/* Wait for server to ACK */
1688		wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_ADV);
1689		if (cb->state != RDMA_WRITE_ADV) {
1690			PRINTF(cb,
1691			       "wait for RDMA_WRITE_ADV state %d\n",
1692			       cb->state);
1693			break;
1694		}
1695
1696		krping_format_send(cb, cb->rdma_dma_addr);
1697		ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1698		if (ret) {
1699			PRINTF(cb, "post send error %d\n", ret);
1700			break;
1701		}
1702
1703		/* Wait for the server to say the RDMA Write is complete. */
1704		wait_event_interruptible(cb->sem,
1705					 cb->state >= RDMA_WRITE_COMPLETE);
1706		if (cb->state != RDMA_WRITE_COMPLETE) {
1707			PRINTF(cb,
1708			       "wait for RDMA_WRITE_COMPLETE state %d\n",
1709			       cb->state);
1710			break;
1711		}
1712
1713		if (cb->validate)
1714			if (memcmp(cb->start_buf, cb->rdma_buf, cb->size)) {
1715				PRINTF(cb, "data mismatch!\n");
1716				break;
1717			}
1718
1719		if (cb->verbose)
1720			PRINTF(cb, "ping data: %s\n", cb->rdma_buf);
1721#ifdef SLOW_KRPING
1722		wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
1723#endif
1724	}
1725}
1726
1727static void krping_rlat_test_client(struct krping_cb *cb)
1728{
1729	struct ib_send_wr *bad_wr;
1730	struct ib_wc wc;
1731	int ret;
1732
1733	cb->state = RDMA_READ_ADV;
1734
1735	/* Send STAG/TO/Len to client */
1736	krping_format_send(cb, cb->start_dma_addr);
1737	if (cb->state == ERROR) {
1738		PRINTF(cb, "krping_format_send failed\n");
1739		return;
1740	}
1741	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1742	if (ret) {
1743		PRINTF(cb, "post send error %d\n", ret);
1744		return;
1745	}
1746
1747	/* Spin waiting for send completion */
1748	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0);
1749	if (ret < 0) {
1750		PRINTF(cb, "poll error %d\n", ret);
1751		return;
1752	}
1753	if (wc.status) {
1754		PRINTF(cb, "send completion error %d\n", wc.status);
1755		return;
1756	}
1757
1758	/* Spin waiting for server's Start STAG/TO/Len */
1759	while (cb->state < RDMA_WRITE_ADV) {
1760		krping_cq_event_handler(cb->cq, cb);
1761	}
1762
1763#if 0
1764{
1765	int i;
1766	struct timeval start, stop;
1767	time_t sec;
1768	suseconds_t usec;
1769	unsigned long long elapsed;
1770	struct ib_wc wc;
1771	struct ib_send_wr *bad_wr;
1772	int ne;
1773
1774	cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
1775	cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1776	cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1777	cb->rdma_sq_wr.sg_list->length = 0;
1778	cb->rdma_sq_wr.num_sge = 0;
1779
1780	microtime(&start);
1781	for (i=0; i < 100000; i++) {
1782		if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
1783			PRINTF(cb, "Couldn't post send\n");
1784			return;
1785		}
1786		do {
1787			ne = ib_poll_cq(cb->cq, 1, &wc);
1788		} while (ne == 0);
1789		if (ne < 0) {
1790			PRINTF(cb, "poll CQ failed %d\n", ne);
1791			return;
1792		}
1793		if (wc.status != IB_WC_SUCCESS) {
1794			PRINTF(cb, "Completion with error at %s:\n",
1795				cb->server ? "server" : "client");
1796			PRINTF(cb, "Failed status %d: wr_id %d\n",
1797				wc.status, (int) wc.wr_id);
1798			return;
1799		}
1800	}
1801	microtime(&stop);
1802
1803	if (stop.tv_usec < start.tv_usec) {
1804		stop.tv_usec += 1000000;
1805		stop.tv_sec  -= 1;
1806	}
1807	sec     = stop.tv_sec - start.tv_sec;
1808	usec    = stop.tv_usec - start.tv_usec;
1809	elapsed = sec * 1000000 + usec;
1810	PRINTF(cb, "0B-write-lat iters 100000 usec %llu\n", elapsed);
1811}
1812#endif
1813
1814	rlat_test(cb);
1815}
1816
1817static void krping_wlat_test_client(struct krping_cb *cb)
1818{
1819	struct ib_send_wr *bad_wr;
1820	struct ib_wc wc;
1821	int ret;
1822
1823	cb->state = RDMA_READ_ADV;
1824
1825	/* Send STAG/TO/Len to client */
1826	krping_format_send(cb, cb->start_dma_addr);
1827	if (cb->state == ERROR) {
1828		PRINTF(cb, "krping_format_send failed\n");
1829		return;
1830	}
1831	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1832	if (ret) {
1833		PRINTF(cb, "post send error %d\n", ret);
1834		return;
1835	}
1836
1837	/* Spin waiting for send completion */
1838	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0);
1839	if (ret < 0) {
1840		PRINTF(cb, "poll error %d\n", ret);
1841		return;
1842	}
1843	if (wc.status) {
1844		PRINTF(cb, "send completion error %d\n", wc.status);
1845		return;
1846	}
1847
1848	/* Spin waiting for server's Start STAG/TO/Len */
1849	while (cb->state < RDMA_WRITE_ADV) {
1850		krping_cq_event_handler(cb->cq, cb);
1851	}
1852
1853	wlat_test(cb);
1854}
1855
1856static void krping_bw_test_client(struct krping_cb *cb)
1857{
1858	struct ib_send_wr *bad_wr;
1859	struct ib_wc wc;
1860	int ret;
1861
1862	cb->state = RDMA_READ_ADV;
1863
1864	/* Send STAG/TO/Len to client */
1865	krping_format_send(cb, cb->start_dma_addr);
1866	if (cb->state == ERROR) {
1867		PRINTF(cb, "krping_format_send failed\n");
1868		return;
1869	}
1870	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1871	if (ret) {
1872		PRINTF(cb, "post send error %d\n", ret);
1873		return;
1874	}
1875
1876	/* Spin waiting for send completion */
1877	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0);
1878	if (ret < 0) {
1879		PRINTF(cb, "poll error %d\n", ret);
1880		return;
1881	}
1882	if (wc.status) {
1883		PRINTF(cb, "send completion error %d\n", wc.status);
1884		return;
1885	}
1886
1887	/* Spin waiting for server's Start STAG/TO/Len */
1888	while (cb->state < RDMA_WRITE_ADV) {
1889		krping_cq_event_handler(cb->cq, cb);
1890	}
1891
1892	bw_test(cb);
1893}
1894
1895static void krping_fr_test(struct krping_cb *cb)
1896{
1897	struct ib_fast_reg_page_list *pl;
1898	struct ib_send_wr fr, inv, *bad;
1899	struct ib_wc wc;
1900	u8 key = 0;
1901	struct ib_mr *mr;
1902	int i;
1903	int ret;
1904	int size = cb->size;
1905	int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT;
1906	time_t start;
1907	int count = 0;
1908	int scnt = 0;
1909
1910	pl = ib_alloc_fast_reg_page_list(cb->qp->device, plen);
1911	if (IS_ERR(pl)) {
1912		PRINTF(cb, "ib_alloc_fast_reg_page_list failed %ld\n", PTR_ERR(pl));
1913		return;
1914	}
1915
1916	mr = ib_alloc_fast_reg_mr(cb->pd, plen);
1917	if (IS_ERR(mr)) {
1918		PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(pl));
1919		PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(mr));
1920	}
1921
1922	for (i=0; i<plen; i++)
1923		pl->page_list[i] = 0xcafebabe | i;
1924
1925	memset(&fr, 0, sizeof fr);
1926	fr.opcode = IB_WR_FAST_REG_MR;
1927	fr.wr.fast_reg.page_shift = PAGE_SHIFT;
1928	fr.wr.fast_reg.length = size;
1929	fr.wr.fast_reg.page_list = pl;
1930	fr.wr.fast_reg.page_list_len = plen;
1931	fr.wr.fast_reg.iova_start = 0;
1932	fr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
1933	fr.next = &inv;
1934	memset(&inv, 0, sizeof inv);
1935	inv.opcode = IB_WR_LOCAL_INV;
1936	inv.send_flags = IB_SEND_SIGNALED;
1937
1938	DEBUG_LOG(cb, "fr_test: stag index 0x%x plen %u size %u depth %u\n", mr->rkey >> 8, plen, cb->size, cb->txdepth);
1939	start = time_uptime;
1940	while (1) {
1941		if ((time_uptime - start) >= 9) {
1942			DEBUG_LOG(cb, "fr_test: pausing 1 second! count %u latest size %u plen %u\n", count, size, plen);
1943			wait_event_interruptible(cb->sem, cb->state == ERROR);
1944			if (cb->state == ERROR)
1945				break;
1946			start = time_uptime;
1947		}
1948		while (scnt < (cb->txdepth>>1)) {
1949			ib_update_fast_reg_key(mr, ++key);
1950			fr.wr.fast_reg.rkey = mr->rkey;
1951			inv.ex.invalidate_rkey = mr->rkey;
1952			size = arc4random() % cb->size;
1953			if (size == 0)
1954				size = cb->size;
1955			plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT;
1956			fr.wr.fast_reg.length = size;
1957			fr.wr.fast_reg.page_list_len = plen;
1958			ret = ib_post_send(cb->qp, &fr, &bad);
1959			if (ret) {
1960				PRINTF(cb, "ib_post_send failed %d\n", ret);
1961				goto err2;
1962			}
1963			scnt++;
1964		}
1965
1966		do {
1967			ret = ib_poll_cq(cb->cq, 1, &wc);
1968			if (ret < 0) {
1969				PRINTF(cb, "ib_poll_cq failed %d\n", ret);
1970				goto err2;
1971			}
1972			if (ret == 1) {
1973				if (wc.status) {
1974					PRINTF(cb, "completion error %u\n", wc.status);
1975					goto err2;
1976				}
1977				count++;
1978				scnt--;
1979			}
1980			else if (krping_sigpending()) {
1981				PRINTF(cb, "signal!\n");
1982				goto err2;
1983			}
1984		} while (ret == 1);
1985	}
1986err2:
1987#if 0
1988	DEBUG_LOG(cb, "sleeping 1 second\n");
1989	wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
1990#endif
1991	DEBUG_LOG(cb, "draining the cq...\n");
1992	do {
1993		ret = ib_poll_cq(cb->cq, 1, &wc);
1994		if (ret < 0) {
1995			PRINTF(cb, "ib_poll_cq failed %d\n", ret);
1996			break;
1997		}
1998		if (ret == 1) {
1999			if (wc.status) {
2000				PRINTF(cb, "completion error %u opcode %u\n", wc.status, wc.opcode);
2001			}
2002		}
2003	} while (ret == 1);
2004	DEBUG_LOG(cb, "fr_test: done!\n");
2005	ib_dereg_mr(mr);
2006err1:
2007	ib_free_fast_reg_page_list(pl);
2008}
2009
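/*
 * Active-side connection setup: issue rdma_connect() and sleep until
 * the CM event handler advances cb->state to CONNECTED (or flags an
 * error).
 */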
2010static int krping_connect_client(struct krping_cb *cb)
2011{
2012	struct rdma_conn_param conn_param;
2013	int ret;
2014
2015	memset(&conn_param, 0, sizeof conn_param);
2016	conn_param.responder_resources = 1;
2017	conn_param.initiator_depth = 1;
2018	conn_param.retry_count = 10;
2019
2020	ret = rdma_connect(cb->cm_id, &conn_param);
2021	if (ret) {
2022		PRINTF(cb, "rdma_connect error %d\n", ret);
2023		return ret;
2024	}
2025
2026	wait_event_interruptible(cb->sem, cb->state >= CONNECTED);
2027	if (cb->state == ERROR) {
2028		PRINTF(cb, "wait for CONNECTED state %d\n", cb->state);
2029		return -1;
2030	}
2031
2032	DEBUG_LOG(cb, "rdma_connect successful\n");
2033	return 0;
2034}
2035
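/*
 * Resolve the server's address and wait until the CM state machine
 * reaches ROUTE_RESOLVED; also verify that the device supports fastreg
 * when that memory mode was requested.
 */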
2036static int krping_bind_client(struct krping_cb *cb)
2037{
2038	struct sockaddr_in sin;
2039	int ret;
2040
2041	memset(&sin, 0, sizeof(sin));
2042	sin.sin_len = sizeof sin;
2043	sin.sin_family = AF_INET;
2044	sin.sin_addr.s_addr = cb->addr.s_addr;
2045	sin.sin_port = cb->port;
2046
2047	ret = rdma_resolve_addr(cb->cm_id, NULL, (struct sockaddr *) &sin,
2048				2000);
2049	if (ret) {
2050		PRINTF(cb, "rdma_resolve_addr error %d\n", ret);
2051		return ret;
2052	}
2053
2054	wait_event_interruptible(cb->sem, cb->state >= ROUTE_RESOLVED);
2055	if (cb->state != ROUTE_RESOLVED) {
2056		PRINTF(cb,
2057		       "addr/route resolution did not complete: state %d\n",
2058		       cb->state);
2059		return -EINTR;
2060	}
2061
2062	if (cb->mem == FASTREG && !fastreg_supported(cb))
2063		return -EINVAL;
2064
2065	DEBUG_LOG(cb, "rdma_resolve_addr - rdma_resolve_route successful\n");
2066	return 0;
2067}
2068
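/*
 * Client-side driver: resolve the address/route, set up the QP and
 * buffers, pre-post the receive WR, connect, run the selected test
 * (wlat/rlat/bw/fastreg/default ping), then disconnect and clean up.
 */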
2069static void krping_run_client(struct krping_cb *cb)
2070{
2071	struct ib_recv_wr *bad_wr;
2072	int ret;
2073
2074	ret = krping_bind_client(cb);
2075	if (ret)
2076		return;
2077
2078	ret = krping_setup_qp(cb, cb->cm_id);
2079	if (ret) {
2080		PRINTF(cb, "setup_qp failed: %d\n", ret);
2081		return;
2082	}
2083
2084	ret = krping_setup_buffers(cb);
2085	if (ret) {
2086		PRINTF(cb, "krping_setup_buffers failed: %d\n", ret);
2087		goto err1;
2088	}
2089
2090	ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
2091	if (ret) {
2092		PRINTF(cb, "ib_post_recv failed: %d\n", ret);
2093		goto err2;
2094	}
2095
2096	ret = krping_connect_client(cb);
2097	if (ret) {
2098		PRINTF(cb, "connect error %d\n", ret);
2099		goto err2;
2100	}
2101
2102	if (cb->wlat)
2103		krping_wlat_test_client(cb);
2104	else if (cb->rlat)
2105		krping_rlat_test_client(cb);
2106	else if (cb->bw)
2107		krping_bw_test_client(cb);
2108	else if (cb->frtest)
2109		krping_fr_test(cb);
2110	else
2111		krping_test_client(cb);
2112	rdma_disconnect(cb->cm_id);
2113err2:
2114	krping_free_buffers(cb);
2115err1:
2116	krping_free_qp(cb);
2117}
2118
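/*
 * Entry point for one krping command.  Allocate a control block, link
 * it on the global list for the stats walker, parse the option string,
 * validate the option combination, create the RDMA CM ID, and run as
 * either server or client before tearing everything down.
 */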
2119int krping_doit(char *cmd, void *cookie)
2120{
2121	struct krping_cb *cb;
2122	int op;
2123	int ret = 0;
2124	char *optarg;
2125	unsigned long optint;
2126
2127	cb = kzalloc(sizeof(*cb), GFP_KERNEL);
2128	if (!cb)
2129		return -ENOMEM;
2130
2131	mutex_lock(&krping_mutex);
2132	list_add_tail(&cb->list, &krping_cbs);
2133	mutex_unlock(&krping_mutex);
2134
2135	cb->cookie = cookie;
2136	cb->server = -1;
2137	cb->state = IDLE;
2138	cb->size = 64;
2139	cb->txdepth = RPING_SQ_DEPTH;
2140	cb->mem = DMA;
2141	init_waitqueue_head(&cb->sem);
2142
2143	while ((op = krping_getopt("krping", &cmd, krping_opts, NULL, &optarg,
2144			      &optint)) != 0) {
2145		switch (op) {
2146		case 'a':
2147			cb->addr_str = optarg;
2148			DEBUG_LOG(cb, "ipaddr (%s)\n", optarg);
2149			if (!inet_aton(optarg, &cb->addr)) {
2150				PRINTF(cb, "bad addr string %s\n",
2151				    optarg);
2152				ret = EINVAL;
2153			}
2154			break;
2155		case 'p':
2156			cb->port = htons(optint);
2157			DEBUG_LOG(cb, "port %d\n", (int)optint);
2158			break;
2159		case 'P':
2160			cb->poll = 1;
2161			DEBUG_LOG(cb, "poll\n");
2162			break;
2163		case 's':
2164			cb->server = 1;
2165			DEBUG_LOG(cb, "server\n");
2166			break;
2167		case 'c':
2168			cb->server = 0;
2169			DEBUG_LOG(cb, "client\n");
2170			break;
2171		case 'S':
2172			cb->size = optint;
2173			if ((cb->size < 1) ||
2174			    (cb->size > RPING_BUFSIZE)) {
2175				PRINTF(cb, "Invalid size %d "
2176				       "(valid range is 1 to %d)\n",
2177				       cb->size, RPING_BUFSIZE);
2178				ret = EINVAL;
2179			} else
2180				DEBUG_LOG(cb, "size %d\n", (int)optint);
2181			break;
2182		case 'C':
2183			cb->count = optint;
2184			if (cb->count < 0) {
2185				PRINTF(cb, "Invalid count %d\n",
2186					cb->count);
2187				ret = EINVAL;
2188			} else
2189				DEBUG_LOG(cb, "count %d\n", (int) cb->count);
2190			break;
2191		case 'v':
2192			cb->verbose++;
2193			DEBUG_LOG(cb, "verbose\n");
2194			break;
2195		case 'V':
2196			cb->validate++;
2197			DEBUG_LOG(cb, "validate data\n");
2198			break;
2199		case 'l':
2200			cb->wlat++;
2201			break;
2202		case 'L':
2203			cb->rlat++;
2204			break;
2205		case 'B':
2206			cb->bw++;
2207			break;
2208		case 'd':
2209			cb->duplex++;
2210			break;
2211		case 'm':
2212			if (!strncmp(optarg, "dma", 3))
2213				cb->mem = DMA;
2214			else if (!strncmp(optarg, "fastreg", 7))
2215				cb->mem = FASTREG;
2216			else if (!strncmp(optarg, "mw", 2))
2217				cb->mem = MW;
2218			else if (!strncmp(optarg, "mr", 2))
2219				cb->mem = MR;
2220			else {
2221				PRINTF(cb, "unknown mem mode %s.  "
2222					"Must be dma, fastreg, mw, or mr\n",
2223					optarg);
2224				ret = -EINVAL;
2225				break;
2226			}
2227			break;
2228		case 'I':
2229			cb->server_invalidate = 1;
2230			break;
2231		case 'T':
2232			cb->txdepth = optint;
2233			DEBUG_LOG(cb, "txdepth %d\n", (int) cb->txdepth);
2234			break;
2235		case 'Z':
2236			cb->local_dma_lkey = 1;
2237			DEBUG_LOG(cb, "using local dma lkey\n");
2238			break;
2239		case 'R':
2240			cb->read_inv = 1;
2241			DEBUG_LOG(cb, "using read-with-inv\n");
2242			break;
2243		case 'f':
2244			cb->frtest = 1;
2245			DEBUG_LOG(cb, "fast-reg test!\n");
2246			break;
2247		default:
2248			PRINTF(cb, "unknown opt %s\n", optarg);
2249			ret = -EINVAL;
2250			break;
2251		}
2252	}
2253	if (ret)
2254		goto out;
2255
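	/* Cross-check the parsed options before creating the CM ID. */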
2256	if (cb->server == -1) {
2257		PRINTF(cb, "must be either client or server\n");
2258		ret = -EINVAL;
2259		goto out;
2260	}
2261
2262	if (cb->server && cb->frtest) {
2263		PRINTF(cb, "must be client to run frtest\n");
2264		ret = -EINVAL;
2265		goto out;
2266	}
2267
2268	if ((cb->frtest + cb->bw + cb->rlat + cb->wlat) > 1) {
2269		PRINTF(cb, "Pick only one test: fr, bw, rlat, wlat\n");
2270		ret = -EINVAL;
2271		goto out;
2272	}
2273
2274	if (cb->server_invalidate && cb->mem != FASTREG) {
2275		PRINTF(cb, "server_invalidate only valid with fastreg mem_mode\n");
2276		ret = -EINVAL;
2277		goto out;
2278	}
2279
2280	if (cb->read_inv && cb->mem != FASTREG) {
2281		PRINTF(cb, "read_inv only valid with fastreg mem_mode\n");
2282		ret = -EINVAL;
2283		goto out;
2284	}
2285
2286	if (cb->mem != MR && (cb->wlat || cb->rlat || cb->bw)) {
2287		PRINTF(cb, "wlat, rlat, and bw tests only support mem_mode MR\n");
2288		ret = -EINVAL;
2289		goto out;
2290	}
2291
2292	cb->cm_id = rdma_create_id(krping_cma_event_handler, cb, RDMA_PS_TCP);
2293	if (IS_ERR(cb->cm_id)) {
2294		ret = PTR_ERR(cb->cm_id);
2295		PRINTF(cb, "rdma_create_id error %d\n", ret);
2296		goto out;
2297	}
2298	DEBUG_LOG(cb, "created cm_id %p\n", cb->cm_id);
2299
2300	if (cb->server)
2301		krping_run_server(cb);
2302	else
2303		krping_run_client(cb);
2304
2305	DEBUG_LOG(cb, "destroy cm_id %p\n", cb->cm_id);
2306	rdma_destroy_id(cb->cm_id);
2307out:
2308	mutex_lock(&krping_mutex);
2309	list_del(&cb->list);
2310	mutex_unlock(&krping_mutex);
2311	kfree(cb);
2312	return ret;
2313}
2314
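/*
 * Invoke 'f' once per active control block, under the list mutex.
 * Control blocks that have not finished setup (no PD yet) are reported
 * with a NULL stats pointer.
 */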
2315void
2316krping_walk_cb_list(void (*f)(struct krping_stats *, void *), void *arg)
2317{
2318	struct krping_cb *cb;
2319
2320	mutex_lock(&krping_mutex);
2321	list_for_each_entry(cb, &krping_cbs, list)
2322	    (*f)(cb->pd ? &cb->stats : NULL, arg);
2323	mutex_unlock(&krping_mutex);
2324}
2325
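/*
 * One-time module initialization: set up the mutex that protects the
 * global list of control blocks.
 */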
2326void krping_init(void)
2327{
2328
2329	mutex_init(&krping_mutex);
2330}
2331