krping.c revision 273246
1/*
2 * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
3 * Copyright (c) 2006-2009 Open Grid Computing, Inc. All rights reserved.
4 *
5 * This software is available to you under a choice of one of two
6 * licenses.  You may choose to be licensed under the terms of the GNU
7 * General Public License (GPL) Version 2, available from the file
8 * COPYING in the main directory of this source tree, or the
9 * OpenIB.org BSD license below:
10 *
11 *     Redistribution and use in source and binary forms, with or
12 *     without modification, are permitted provided that the following
13 *     conditions are met:
14 *
15 *      - Redistributions of source code must retain the above
16 *        copyright notice, this list of conditions and the following
17 *        disclaimer.
18 *
19 *      - Redistributions in binary form must reproduce the above
20 *        copyright notice, this list of conditions and the following
21 *        disclaimer in the documentation and/or other materials
22 *        provided with the distribution.
23 *
24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31 * SOFTWARE.
32 */
33
34#include <sys/cdefs.h>
35__FBSDID("$FreeBSD: stable/10/sys/contrib/rdma/krping/krping.c 273246 2014-10-18 07:07:34Z hselasky $");
36
37#include <linux/module.h>
38#include <linux/moduleparam.h>
39#include <linux/slab.h>
40#include <linux/err.h>
41#include <linux/string.h>
42#include <linux/list.h>
43#include <linux/in.h>
44#include <linux/device.h>
45#include <linux/pci.h>
46#include <linux/sched.h>
47
48#include <asm/atomic.h>
49
50#include <rdma/ib_verbs.h>
51#include <rdma/rdma_cm.h>
52
53#include "krping.h"
54#include "getopt.h"
55
56extern int krping_debug;
57#define DEBUG_LOG(cb, x...) do { if (krping_debug) krping_printf((cb)->cookie, x); } while (0)
58#define PRINTF(cb, x...) krping_printf((cb)->cookie, x)
59
60MODULE_AUTHOR("Steve Wise");
61MODULE_DESCRIPTION("RDMA ping client/server");
62MODULE_LICENSE("Dual BSD/GPL");
63
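/*
 * Raw x86 cycle counter (RDTSC).  The wlat/bw tests below use it to
 * record per-iteration post and poll costs; it is not a serializing
 * instruction, so the numbers are only a rough per-WR overhead estimate.
 */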
64static __inline uint64_t
65get_cycles(void)
66{
67	uint32_t low, high;
68	__asm __volatile("rdtsc" : "=a" (low), "=d" (high));
69	return (low | ((u_int64_t)high << 32));
70}
71
72typedef uint64_t cycles_t;
73
74enum mem_type {
75	DMA = 1,
76	FASTREG = 2,
77	MW = 3,
78	MR = 4
79};
80
81static const struct krping_option krping_opts[] = {
82	{"count", OPT_INT, 'C'},
83	{"size", OPT_INT, 'S'},
84	{"addr", OPT_STRING, 'a'},
85	{"port", OPT_INT, 'p'},
86	{"verbose", OPT_NOPARAM, 'v'},
87	{"validate", OPT_NOPARAM, 'V'},
88	{"server", OPT_NOPARAM, 's'},
89	{"client", OPT_NOPARAM, 'c'},
90	{"mem_mode", OPT_STRING, 'm'},
91	{"server_inv", OPT_NOPARAM, 'I'},
92 	{"wlat", OPT_NOPARAM, 'l'},
93 	{"rlat", OPT_NOPARAM, 'L'},
94 	{"bw", OPT_NOPARAM, 'B'},
95 	{"duplex", OPT_NOPARAM, 'd'},
96 	{"txdepth", OPT_INT, 'T'},
97 	{"poll", OPT_NOPARAM, 'P'},
98 	{"local_dma_lkey", OPT_NOPARAM, 'Z'},
99 	{"read_inv", OPT_NOPARAM, 'R'},
100 	{"fr", OPT_NOPARAM, 'f'},
101	{NULL, 0, 0}
102};
103
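/*
 * 64-bit byte-order helpers for the rkey/addr/len exchange.  On a
 * little-endian host cpu_to_be64() and be64_to_cpu() are the same byte
 * swap, so either spelling produces the correct wire format.
 */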
104#define htonll(x) cpu_to_be64((x))
105#define ntohll(x) be64_to_cpu((x))
106
107static struct mutex krping_mutex;
108
109/*
110 * List of running krping threads.
111 */
112static LIST_HEAD(krping_cbs);
113
114/*
115 * krping "ping/pong" loop:
116 * 	client sends source rkey/addr/len
117 *	server receives source rkey/addr/len
118 *	server rdma reads "ping" data from source
119 * 	server sends "go ahead" on rdma read completion
120 *	client sends sink rkey/addr/len
121 * 	server receives sink rkey/addr/len
122 * 	server rdma writes "pong" data to sink
123 * 	server sends "go ahead" on rdma write completion
124 * 	<repeat loop>
125 */
126
127/*
128 * These states are used to signal events between the completion handler
129 * and the main client or server thread.
130 *
131 * Once CONNECTED, they cycle through RDMA_READ_ADV, RDMA_WRITE_ADV,
132 * and RDMA_WRITE_COMPLETE for each ping.
133 */
134enum test_state {
135	IDLE = 1,
136	CONNECT_REQUEST,
137	ADDR_RESOLVED,
138	ROUTE_RESOLVED,
139	CONNECTED,
140	RDMA_READ_ADV,
141	RDMA_READ_COMPLETE,
142	RDMA_WRITE_ADV,
143	RDMA_WRITE_COMPLETE,
144	ERROR
145};
146
147struct krping_rdma_info {
148	uint64_t buf;
149	uint32_t rkey;
150	uint32_t size;
151};
152
153/*
154 * Default max buffer size for IO...
155 */
156#define RPING_BUFSIZE 128*1024
157#define RPING_SQ_DEPTH 64
158
159/*
160 * Control block struct.
161 */
162struct krping_cb {
163	void *cookie;
164	int server;			/* 0 iff client */
165	struct ib_cq *cq;
166	struct ib_pd *pd;
167	struct ib_qp *qp;
168
169	enum mem_type mem;
170	struct ib_mr *dma_mr;
171
172	struct ib_fast_reg_page_list *page_list;
173	int page_list_len;
174	struct ib_send_wr fastreg_wr;
175	struct ib_send_wr invalidate_wr;
176	struct ib_mr *fastreg_mr;
177	int server_invalidate;
178	int read_inv;
179	u8 key;
180
181	struct ib_mw *mw;
182	struct ib_mw_bind bind_attr;
183
184	struct ib_recv_wr rq_wr;	/* recv work request record */
185	struct ib_sge recv_sgl;		/* recv single SGE */
186	struct krping_rdma_info recv_buf;/* malloc'd buffer */
187	u64 recv_dma_addr;
188	DECLARE_PCI_UNMAP_ADDR(recv_mapping)
189	struct ib_mr *recv_mr;
190
191	struct ib_send_wr sq_wr;	/* send work request record */
192	struct ib_sge send_sgl;
193	struct krping_rdma_info send_buf;/* single send buf */
194	u64 send_dma_addr;
195	DECLARE_PCI_UNMAP_ADDR(send_mapping)
196	struct ib_mr *send_mr;
197
198	struct ib_send_wr rdma_sq_wr;	/* rdma work request record */
199	struct ib_sge rdma_sgl;		/* rdma single SGE */
200	char *rdma_buf;			/* used as rdma sink */
201	u64  rdma_dma_addr;
202	DECLARE_PCI_UNMAP_ADDR(rdma_mapping)
203	struct ib_mr *rdma_mr;
204
205	uint32_t remote_rkey;		/* remote peer's RKEY */
206	uint64_t remote_addr;		/* remote peer's TO */
207	uint32_t remote_len;		/* remote peer's LEN */
208
209	char *start_buf;		/* rdma read src */
210	u64  start_dma_addr;
211	DECLARE_PCI_UNMAP_ADDR(start_mapping)
212	struct ib_mr *start_mr;
213
214	enum test_state state;		/* used for cond/signalling */
215	wait_queue_head_t sem;
216	struct krping_stats stats;
217
218	uint16_t port;			/* dst port in NBO */
219	struct in_addr addr;		/* dst addr in NBO */
220	char *addr_str;			/* dst addr string */
221	int verbose;			/* verbose logging */
222	int count;			/* ping count */
223	int size;			/* ping data size */
224	int validate;			/* validate ping data */
225	int wlat;			/* run wlat test */
226	int rlat;			/* run rlat test */
227	int bw;				/* run bw test */
228	int duplex;			/* run bw full duplex test */
229	int poll;			/* poll or block for rlat test */
230	int txdepth;			/* SQ depth */
231	int local_dma_lkey;		/* use 0 for lkey */
232	int frtest;			/* fastreg test */
233
234	/* CM stuff */
235	struct rdma_cm_id *cm_id;	/* connection on client side,*/
236					/* listener on server side. */
237	struct rdma_cm_id *child_cm_id;	/* connection on server side */
238	struct list_head list;
239};
240
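/*
 * RDMA CM event callback.  It runs in the CM callback context, so it
 * only records the resulting state in the control block (and, for a
 * connect request, the child cm_id) and wakes whoever is sleeping on
 * cb->sem; the client/server thread does the actual work.
 */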
241static int krping_cma_event_handler(struct rdma_cm_id *cma_id,
242				   struct rdma_cm_event *event)
243{
244	int ret;
245	struct krping_cb *cb = cma_id->context;
246
247	DEBUG_LOG(cb, "cma_event type %d cma_id %p (%s)\n", event->event,
248	    cma_id, (cma_id == cb->cm_id) ? "parent" : "child");
249
250	switch (event->event) {
251	case RDMA_CM_EVENT_ADDR_RESOLVED:
252		cb->state = ADDR_RESOLVED;
253		ret = rdma_resolve_route(cma_id, 2000);
254		if (ret) {
255			PRINTF(cb, "rdma_resolve_route error %d\n", ret);
256			wake_up_interruptible(&cb->sem);
257		}
258		break;
259
260	case RDMA_CM_EVENT_ROUTE_RESOLVED:
261		cb->state = ROUTE_RESOLVED;
262		wake_up_interruptible(&cb->sem);
263		break;
264
265	case RDMA_CM_EVENT_CONNECT_REQUEST:
266		cb->state = CONNECT_REQUEST;
267		cb->child_cm_id = cma_id;
268		DEBUG_LOG(cb, "child cma %p\n", cb->child_cm_id);
269		wake_up_interruptible(&cb->sem);
270		break;
271
272	case RDMA_CM_EVENT_ESTABLISHED:
273		DEBUG_LOG(cb, "ESTABLISHED\n");
274		if (!cb->server) {
275			cb->state = CONNECTED;
276		}
277		wake_up_interruptible(&cb->sem);
278		break;
279
280	case RDMA_CM_EVENT_ADDR_ERROR:
281	case RDMA_CM_EVENT_ROUTE_ERROR:
282	case RDMA_CM_EVENT_CONNECT_ERROR:
283	case RDMA_CM_EVENT_UNREACHABLE:
284	case RDMA_CM_EVENT_REJECTED:
285		PRINTF(cb, "cma event %d, error %d\n", event->event,
286		       event->status);
287		cb->state = ERROR;
288		wake_up_interruptible(&cb->sem);
289		break;
290
291	case RDMA_CM_EVENT_DISCONNECTED:
292		PRINTF(cb, "DISCONNECT EVENT...\n");
293		cb->state = ERROR;
294		wake_up_interruptible(&cb->sem);
295		break;
296
297	case RDMA_CM_EVENT_DEVICE_REMOVAL:
298		PRINTF(cb, "cma detected device removal!!!!\n");
299		break;
300
301	default:
302		PRINTF(cb, "oof bad type!\n");
303		wake_up_interruptible(&cb->sem);
304		break;
305	}
306	return 0;
307}
308
309static int server_recv(struct krping_cb *cb, struct ib_wc *wc)
310{
311	if (wc->byte_len != sizeof(cb->recv_buf)) {
312		PRINTF(cb, "Received bogus data, size %d\n",
313		       wc->byte_len);
314		return -1;
315	}
316
317	cb->remote_rkey = ntohl(cb->recv_buf.rkey);
318	cb->remote_addr = ntohll(cb->recv_buf.buf);
319	cb->remote_len  = ntohl(cb->recv_buf.size);
320	DEBUG_LOG(cb, "Received rkey %x addr %llx len %d from peer\n",
321		  cb->remote_rkey, (unsigned long long)cb->remote_addr,
322		  cb->remote_len);
323
324	if (cb->state <= CONNECTED || cb->state == RDMA_WRITE_COMPLETE)
325		cb->state = RDMA_READ_ADV;
326	else
327		cb->state = RDMA_WRITE_ADV;
328
329	return 0;
330}
331
332static int client_recv(struct krping_cb *cb, struct ib_wc *wc)
333{
334	if (wc->byte_len != sizeof(cb->recv_buf)) {
335		PRINTF(cb, "Received bogus data, size %d\n",
336		       wc->byte_len);
337		return -1;
338	}
339
340	if (cb->state == RDMA_READ_ADV)
341		cb->state = RDMA_WRITE_ADV;
342	else
343		cb->state = RDMA_WRITE_COMPLETE;
344
345	return 0;
346}
347
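/*
 * CQ callback and poll loop.  In ping/pong mode it re-arms the CQ,
 * drains all available completions, updates the byte/message counters
 * and the state machine, then wakes the waiting thread.  The
 * wlat/rlat/bw paths poll the CQ themselves, so re-arming is skipped
 * for them, and a completion event during the fastreg test is treated
 * as unexpected.
 */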
348static void krping_cq_event_handler(struct ib_cq *cq, void *ctx)
349{
350	struct krping_cb *cb = ctx;
351	struct ib_wc wc;
352	struct ib_recv_wr *bad_wr;
353	int ret;
354
355	BUG_ON(cb->cq != cq);
356	if (cb->state == ERROR) {
357		PRINTF(cb, "cq completion in ERROR state\n");
358		return;
359	}
360	if (cb->frtest) {
361		PRINTF(cb, "cq completion event in frtest!\n");
362		return;
363	}
364	if (!cb->wlat && !cb->rlat && !cb->bw)
365		ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
366	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 1) {
367		if (wc.status) {
368			if (wc.status == IB_WC_WR_FLUSH_ERR) {
369				DEBUG_LOG(cb, "cq flushed\n");
370				continue;
371			} else {
372				PRINTF(cb, "cq completion failed with "
373				       "wr_id %Lx status %d opcode %d vendor_err %x\n",
374					wc.wr_id, wc.status, wc.opcode, wc.vendor_err);
375				goto error;
376			}
377		}
378
379		switch (wc.opcode) {
380		case IB_WC_SEND:
381			DEBUG_LOG(cb, "send completion\n");
382			cb->stats.send_bytes += cb->send_sgl.length;
383			cb->stats.send_msgs++;
384			break;
385
386		case IB_WC_RDMA_WRITE:
387			DEBUG_LOG(cb, "rdma write completion\n");
388			cb->stats.write_bytes += cb->rdma_sq_wr.sg_list->length;
389			cb->stats.write_msgs++;
390			cb->state = RDMA_WRITE_COMPLETE;
391			wake_up_interruptible(&cb->sem);
392			break;
393
394		case IB_WC_RDMA_READ:
395			DEBUG_LOG(cb, "rdma read completion\n");
396			cb->stats.read_bytes += cb->rdma_sq_wr.sg_list->length;
397			cb->stats.read_msgs++;
398			cb->state = RDMA_READ_COMPLETE;
399			wake_up_interruptible(&cb->sem);
400			break;
401
402		case IB_WC_RECV:
403			DEBUG_LOG(cb, "recv completion\n");
404			cb->stats.recv_bytes += sizeof(cb->recv_buf);
405			cb->stats.recv_msgs++;
406			if (cb->wlat || cb->rlat || cb->bw)
407				ret = server_recv(cb, &wc);
408			else
409				ret = cb->server ? server_recv(cb, &wc) :
410						   client_recv(cb, &wc);
411			if (ret) {
412				PRINTF(cb, "recv wc error: %d\n", ret);
413				goto error;
414			}
415
416			ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
417			if (ret) {
418				PRINTF(cb, "post recv error: %d\n",
419				       ret);
420				goto error;
421			}
422			wake_up_interruptible(&cb->sem);
423			break;
424
425		default:
426			PRINTF(cb,
427			       "%s:%d Unexpected opcode %d, Shutting down\n",
428			       __func__, __LINE__, wc.opcode);
429			goto error;
430		}
431	}
432	if (ret) {
433		PRINTF(cb, "poll error %d\n", ret);
434		goto error;
435	}
436	return;
437error:
438	cb->state = ERROR;
439	wake_up_interruptible(&cb->sem);
440}
441
442static int krping_accept(struct krping_cb *cb)
443{
444	struct rdma_conn_param conn_param;
445	int ret;
446
447	DEBUG_LOG(cb, "accepting client connection request\n");
448
449	memset(&conn_param, 0, sizeof conn_param);
450	conn_param.responder_resources = 1;
451	conn_param.initiator_depth = 1;
452
453	ret = rdma_accept(cb->child_cm_id, &conn_param);
454	if (ret) {
455		PRINTF(cb, "rdma_accept error: %d\n", ret);
456		return ret;
457	}
458
459	if (!cb->wlat && !cb->rlat && !cb->bw) {
460		wait_event_interruptible(cb->sem, cb->state >= CONNECTED);
461		if (cb->state == ERROR) {
462			PRINTF(cb, "wait for CONNECTED state %d\n",
463				cb->state);
464			return -1;
465		}
466	}
467	return 0;
468}
469
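/*
 * Pre-build the static work requests: the single-SGE recv and send WRs,
 * the RDMA WR used for the read/write data path, and, depending on the
 * memory mode, either the reusable LOCAL_INV + FAST_REG_MR chain or the
 * memory-window bind attributes.
 */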
470static void krping_setup_wr(struct krping_cb *cb)
471{
472	cb->recv_sgl.addr = cb->recv_dma_addr;
473	cb->recv_sgl.length = sizeof cb->recv_buf;
474	if (cb->local_dma_lkey)
475		cb->recv_sgl.lkey = cb->qp->device->local_dma_lkey;
476	else if (cb->mem == DMA)
477		cb->recv_sgl.lkey = cb->dma_mr->lkey;
478	else
479		cb->recv_sgl.lkey = cb->recv_mr->lkey;
480	cb->rq_wr.sg_list = &cb->recv_sgl;
481	cb->rq_wr.num_sge = 1;
482
483	cb->send_sgl.addr = cb->send_dma_addr;
484	cb->send_sgl.length = sizeof cb->send_buf;
485	if (cb->local_dma_lkey)
486		cb->send_sgl.lkey = cb->qp->device->local_dma_lkey;
487	else if (cb->mem == DMA)
488		cb->send_sgl.lkey = cb->dma_mr->lkey;
489	else
490		cb->send_sgl.lkey = cb->send_mr->lkey;
491
492	cb->sq_wr.opcode = IB_WR_SEND;
493	cb->sq_wr.send_flags = IB_SEND_SIGNALED;
494	cb->sq_wr.sg_list = &cb->send_sgl;
495	cb->sq_wr.num_sge = 1;
496
497	if (cb->server || cb->wlat || cb->rlat || cb->bw) {
498		cb->rdma_sgl.addr = cb->rdma_dma_addr;
499		if (cb->mem == MR)
500			cb->rdma_sgl.lkey = cb->rdma_mr->lkey;
501		cb->rdma_sq_wr.send_flags = IB_SEND_SIGNALED;
502		cb->rdma_sq_wr.sg_list = &cb->rdma_sgl;
503		cb->rdma_sq_wr.num_sge = 1;
504	}
505
506	switch(cb->mem) {
507	case FASTREG:
508
509		/*
510		 * A chain of 2 WRs, INVALIDATE_MR + FAST_REG_MR,
511		 * both unsignaled.  The client uses them to reregister
512		 * the rdma buffers with a new key each iteration.
513		 */
514		cb->fastreg_wr.opcode = IB_WR_FAST_REG_MR;
515		cb->fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
516		cb->fastreg_wr.wr.fast_reg.length = cb->size;
517		cb->fastreg_wr.wr.fast_reg.page_list = cb->page_list;
518		cb->fastreg_wr.wr.fast_reg.page_list_len = cb->page_list_len;
519
520		cb->invalidate_wr.next = &cb->fastreg_wr;
521		cb->invalidate_wr.opcode = IB_WR_LOCAL_INV;
522		break;
523	case MW:
524		cb->bind_attr.wr_id = 0xabbaabba;
525		cb->bind_attr.send_flags = 0; /* unsignaled */
526		cb->bind_attr.length = cb->size;
527		break;
528	default:
529		break;
530	}
531}
532
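/*
 * DMA-map the send/recv control messages and allocate the rdma (sink)
 * and start (source) buffers, registering them according to the chosen
 * memory mode (dma mr, phys mr, fastreg mr or memory window).
 */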
533static int krping_setup_buffers(struct krping_cb *cb)
534{
535	int ret;
536	struct ib_phys_buf buf;
537	u64 iovbase;
538
539	DEBUG_LOG(cb, "krping_setup_buffers called on cb %p\n", cb);
540
541	cb->recv_dma_addr = dma_map_single(cb->pd->device->dma_device,
542				   &cb->recv_buf,
543				   sizeof(cb->recv_buf), DMA_BIDIRECTIONAL);
544	pci_unmap_addr_set(cb, recv_mapping, cb->recv_dma_addr);
545	cb->send_dma_addr = dma_map_single(cb->pd->device->dma_device,
546					   &cb->send_buf, sizeof(cb->send_buf),
547					   DMA_BIDIRECTIONAL);
548	pci_unmap_addr_set(cb, send_mapping, cb->send_dma_addr);
549
550	if (cb->mem == DMA) {
551		cb->dma_mr = ib_get_dma_mr(cb->pd, IB_ACCESS_LOCAL_WRITE|
552					   IB_ACCESS_REMOTE_READ|
553				           IB_ACCESS_REMOTE_WRITE);
554		if (IS_ERR(cb->dma_mr)) {
555			DEBUG_LOG(cb, "reg_dmamr failed\n");
556			ret = PTR_ERR(cb->dma_mr);
557			goto bail;
558		}
559	} else {
560		if (!cb->local_dma_lkey) {
561			buf.addr = cb->recv_dma_addr;
562			buf.size = sizeof cb->recv_buf;
563			DEBUG_LOG(cb, "recv buf dma_addr %llx size %d\n", buf.addr,
564				(int)buf.size);
565			iovbase = cb->recv_dma_addr;
566			cb->recv_mr = ib_reg_phys_mr(cb->pd, &buf, 1,
567						     IB_ACCESS_LOCAL_WRITE,
568						     &iovbase);
569
570			if (IS_ERR(cb->recv_mr)) {
571				DEBUG_LOG(cb, "recv_buf reg_mr failed\n");
572				ret = PTR_ERR(cb->recv_mr);
573				goto bail;
574			}
575
576			buf.addr = cb->send_dma_addr;
577			buf.size = sizeof cb->send_buf;
578			DEBUG_LOG(cb, "send buf dma_addr %llx size %d\n", buf.addr,
579				(int)buf.size);
580			iovbase = cb->send_dma_addr;
581			cb->send_mr = ib_reg_phys_mr(cb->pd, &buf, 1,
582						     0, &iovbase);
583
584			if (IS_ERR(cb->send_mr)) {
585				DEBUG_LOG(cb, "send_buf reg_mr failed\n");
586				ret = PTR_ERR(cb->send_mr);
587				goto bail;
588			}
589		}
590	}
591
592	cb->rdma_buf = kmalloc(cb->size, GFP_KERNEL);
593	if (!cb->rdma_buf) {
594		DEBUG_LOG(cb, "rdma_buf malloc failed\n");
595		ret = -ENOMEM;
596		goto bail;
597	}
598
599	cb->rdma_dma_addr = dma_map_single(cb->pd->device->dma_device,
600			       cb->rdma_buf, cb->size,
601			       DMA_BIDIRECTIONAL);
602	pci_unmap_addr_set(cb, rdma_mapping, cb->rdma_dma_addr);
603	if (cb->mem != DMA) {
604		switch (cb->mem) {
605		case FASTREG:
606			cb->page_list_len = (((cb->size - 1) & PAGE_MASK) +
607				PAGE_SIZE) >> PAGE_SHIFT;
608			cb->page_list = ib_alloc_fast_reg_page_list(
609						cb->pd->device,
610						cb->page_list_len);
611			if (IS_ERR(cb->page_list)) {
612				DEBUG_LOG(cb, "ib_alloc_fast_reg_page_list failed\n");
613				ret = PTR_ERR(cb->page_list);
614				goto bail;
615			}
616			cb->fastreg_mr = ib_alloc_fast_reg_mr(cb->pd,
617					cb->page_list->max_page_list_len);
618			if (IS_ERR(cb->fastreg_mr)) {
619				DEBUG_LOG(cb, "ib_alloc_fast_reg_mr failed\n");
620				ret = PTR_ERR(cb->fastreg_mr);
621				goto bail;
622			}
623			DEBUG_LOG(cb, "fastreg rkey 0x%x page_list %p"
624				" page_list_len %u\n", cb->fastreg_mr->rkey,
625				cb->page_list, cb->page_list_len);
626			break;
627		case MW:
628			cb->mw = ib_alloc_mw(cb->pd);
629			if (IS_ERR(cb->mw)) {
630				DEBUG_LOG(cb, "ib_alloc_mw failed\n");
631				ret = PTR_ERR(cb->mw);
632				goto bail;
633			}
634			DEBUG_LOG(cb, "mw rkey 0x%x\n", cb->mw->rkey);
635			/*FALLTHROUGH*/
636		case MR:
637			buf.addr = cb->rdma_dma_addr;
638			buf.size = cb->size;
639			iovbase = cb->rdma_dma_addr;
640			cb->rdma_mr = ib_reg_phys_mr(cb->pd, &buf, 1,
641					     IB_ACCESS_REMOTE_READ|
642					     IB_ACCESS_REMOTE_WRITE,
643					     &iovbase);
644			if (IS_ERR(cb->rdma_mr)) {
645				DEBUG_LOG(cb, "rdma_buf reg_mr failed\n");
646				ret = PTR_ERR(cb->rdma_mr);
647				goto bail;
648			}
649			DEBUG_LOG(cb, "rdma buf dma_addr %llx size %d mr rkey 0x%x\n",
650				buf.addr, (int)buf.size, cb->rdma_mr->rkey);
651			break;
652		default:
653			ret = -EINVAL;
654			goto bail;
655			break;
656		}
657	}
658
659	if (!cb->server || cb->wlat || cb->rlat || cb->bw) {
660
661		cb->start_buf = kmalloc(cb->size, GFP_KERNEL);
662		if (!cb->start_buf) {
663			DEBUG_LOG(cb, "start_buf malloc failed\n");
664			ret = -ENOMEM;
665			goto bail;
666		}
667
668		cb->start_dma_addr = dma_map_single(cb->pd->device->dma_device,
669						   cb->start_buf, cb->size,
670						   DMA_BIDIRECTIONAL);
671		pci_unmap_addr_set(cb, start_mapping, cb->start_dma_addr);
672
673		if (cb->mem == MR || cb->mem == MW) {
674			unsigned flags = IB_ACCESS_REMOTE_READ;
675
676			if (cb->wlat || cb->rlat || cb->bw)
677				flags |= IB_ACCESS_REMOTE_WRITE;
678
679			buf.addr = cb->start_dma_addr;
680			buf.size = cb->size;
681			DEBUG_LOG(cb, "start buf dma_addr %llx size %d\n",
682				buf.addr, (int)buf.size);
683			iovbase = cb->start_dma_addr;
684			cb->start_mr = ib_reg_phys_mr(cb->pd, &buf, 1,
685					     flags,
686					     &iovbase);
687
688			if (IS_ERR(cb->start_mr)) {
689				DEBUG_LOG(cb, "start_buf reg_mr failed\n");
690				ret = PTR_ERR(cb->start_mr);
691				goto bail;
692			}
693		}
694	}
695
696	krping_setup_wr(cb);
697	DEBUG_LOG(cb, "allocated & registered buffers...\n");
698	return 0;
699bail:
700	if (cb->fastreg_mr && !IS_ERR(cb->fastreg_mr))
701		ib_dereg_mr(cb->fastreg_mr);
702	if (cb->mw && !IS_ERR(cb->mw))
703		ib_dealloc_mw(cb->mw);
704	if (cb->rdma_mr && !IS_ERR(cb->rdma_mr))
705		ib_dereg_mr(cb->rdma_mr);
706	if (cb->page_list && !IS_ERR(cb->page_list))
707		ib_free_fast_reg_page_list(cb->page_list);
708	if (cb->dma_mr && !IS_ERR(cb->dma_mr))
709		ib_dereg_mr(cb->dma_mr);
710	if (cb->recv_mr && !IS_ERR(cb->recv_mr))
711		ib_dereg_mr(cb->recv_mr);
712	if (cb->send_mr && !IS_ERR(cb->send_mr))
713		ib_dereg_mr(cb->send_mr);
714	if (cb->rdma_buf)
715		kfree(cb->rdma_buf);
716	if (cb->start_buf)
717		kfree(cb->start_buf);
718	return ret;
719}
720
721static void krping_free_buffers(struct krping_cb *cb)
722{
723	DEBUG_LOG(cb, "krping_free_buffers called on cb %p\n", cb);
724
725	if (cb->dma_mr)
726		ib_dereg_mr(cb->dma_mr);
727	if (cb->send_mr)
728		ib_dereg_mr(cb->send_mr);
729	if (cb->recv_mr)
730		ib_dereg_mr(cb->recv_mr);
731	if (cb->rdma_mr)
732		ib_dereg_mr(cb->rdma_mr);
733	if (cb->start_mr)
734		ib_dereg_mr(cb->start_mr);
735	if (cb->fastreg_mr)
736		ib_dereg_mr(cb->fastreg_mr);
737	if (cb->mw)
738		ib_dealloc_mw(cb->mw);
739
740	dma_unmap_single(cb->pd->device->dma_device,
741			 pci_unmap_addr(cb, recv_mapping),
742			 sizeof(cb->recv_buf), DMA_BIDIRECTIONAL);
743	dma_unmap_single(cb->pd->device->dma_device,
744			 pci_unmap_addr(cb, send_mapping),
745			 sizeof(cb->send_buf), DMA_BIDIRECTIONAL);
746	dma_unmap_single(cb->pd->device->dma_device,
747			 pci_unmap_addr(cb, rdma_mapping),
748			 cb->size, DMA_BIDIRECTIONAL);
749	kfree(cb->rdma_buf);
750	if (cb->start_buf) {
751		dma_unmap_single(cb->pd->device->dma_device,
752			 pci_unmap_addr(cb, start_mapping),
753			 cb->size, DMA_BIDIRECTIONAL);
754		kfree(cb->start_buf);
755	}
756}
757
758static int krping_create_qp(struct krping_cb *cb)
759{
760	struct ib_qp_init_attr init_attr;
761	int ret;
762
763	memset(&init_attr, 0, sizeof(init_attr));
764	init_attr.cap.max_send_wr = cb->txdepth;
765	init_attr.cap.max_recv_wr = 2;
766	init_attr.cap.max_recv_sge = 1;
767	init_attr.cap.max_send_sge = 1;
768	init_attr.qp_type = IB_QPT_RC;
769	init_attr.send_cq = cb->cq;
770	init_attr.recv_cq = cb->cq;
771	init_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
772
773	if (cb->server) {
774		ret = rdma_create_qp(cb->child_cm_id, cb->pd, &init_attr);
775		if (!ret)
776			cb->qp = cb->child_cm_id->qp;
777	} else {
778		ret = rdma_create_qp(cb->cm_id, cb->pd, &init_attr);
779		if (!ret)
780			cb->qp = cb->cm_id->qp;
781	}
782
783	return ret;
784}
785
786static void krping_free_qp(struct krping_cb *cb)
787{
788	ib_destroy_qp(cb->qp);
789	ib_destroy_cq(cb->cq);
790	ib_dealloc_pd(cb->pd);
791}
792
793static int krping_setup_qp(struct krping_cb *cb, struct rdma_cm_id *cm_id)
794{
795	int ret;
796	cb->pd = ib_alloc_pd(cm_id->device);
797	if (IS_ERR(cb->pd)) {
798		PRINTF(cb, "ib_alloc_pd failed\n");
799		return PTR_ERR(cb->pd);
800	}
801	DEBUG_LOG(cb, "created pd %p\n", cb->pd);
802
803	strlcpy(cb->stats.name, cb->pd->device->name, sizeof(cb->stats.name));
804
805	cb->cq = ib_create_cq(cm_id->device, krping_cq_event_handler, NULL,
806			      cb, cb->txdepth * 2, 0);
807	if (IS_ERR(cb->cq)) {
808		PRINTF(cb, "ib_create_cq failed\n");
809		ret = PTR_ERR(cb->cq);
810		goto err1;
811	}
812	DEBUG_LOG(cb, "created cq %p\n", cb->cq);
813
814	if (!cb->wlat && !cb->rlat && !cb->bw && !cb->frtest) {
815		ret = ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
816		if (ret) {
817			PRINTF(cb, "ib_req_notify_cq failed\n");
818			goto err2;
819		}
820	}
821
822	ret = krping_create_qp(cb);
823	if (ret) {
824		PRINTF(cb, "krping_create_qp failed: %d\n", ret);
825		goto err2;
826	}
827	DEBUG_LOG(cb, "created qp %p\n", cb->qp);
828	return 0;
829err2:
830	ib_destroy_cq(cb->cq);
831err1:
832	ib_dealloc_pd(cb->pd);
833	return ret;
834}
835
836/*
837 * return the (possibly rebound) rkey for the rdma buffer.
838 * FASTREG mode: invalidate and rebind via fastreg wr.
839 * MW mode: rebind the MW.
840 * other modes: just return the mr rkey.
841 */
842static u32 krping_rdma_rkey(struct krping_cb *cb, u64 buf, int post_inv)
843{
844	u32 rkey = 0xffffffff;
845	u64 p;
846	struct ib_send_wr *bad_wr;
847	int i;
848	int ret;
849
850	switch (cb->mem) {
851	case FASTREG:
852		cb->invalidate_wr.ex.invalidate_rkey = cb->fastreg_mr->rkey;
853
854		/*
855		 * Update the fastreg key.
856		 */
857		ib_update_fast_reg_key(cb->fastreg_mr, ++cb->key);
858		cb->fastreg_wr.wr.fast_reg.rkey = cb->fastreg_mr->rkey;
859
860		/*
861		 * Update the fastreg WR with new buf info.
862		 */
863		if (buf == (u64)cb->start_dma_addr)
864			cb->fastreg_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_READ;
865		else
866			cb->fastreg_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
867		cb->fastreg_wr.wr.fast_reg.iova_start = buf;
868		p = (u64)(buf & PAGE_MASK);
869		for (i=0; i < cb->fastreg_wr.wr.fast_reg.page_list_len;
870		     i++, p += PAGE_SIZE) {
871			cb->page_list->page_list[i] = p;
872			DEBUG_LOG(cb, "page_list[%d] 0x%llx\n", i, p);
873		}
874
875		DEBUG_LOG(cb, "post_inv = %d, fastreg new rkey 0x%x shift %u len %u"
876			" iova_start %llx page_list_len %u\n",
877			post_inv,
878			cb->fastreg_wr.wr.fast_reg.rkey,
879			cb->fastreg_wr.wr.fast_reg.page_shift,
880			cb->fastreg_wr.wr.fast_reg.length,
881			cb->fastreg_wr.wr.fast_reg.iova_start,
882			cb->fastreg_wr.wr.fast_reg.page_list_len);
883
884		if (post_inv)
885			ret = ib_post_send(cb->qp, &cb->invalidate_wr, &bad_wr);
886		else
887			ret = ib_post_send(cb->qp, &cb->fastreg_wr, &bad_wr);
888		if (ret) {
889			PRINTF(cb, "post send error %d\n", ret);
890			cb->state = ERROR;
891		}
892		rkey = cb->fastreg_mr->rkey;
893		break;
894	case MW:
895		/*
896		 * Update the MW with new buf info.
897		 */
898		if (buf == (u64)cb->start_dma_addr) {
899			cb->bind_attr.mw_access_flags = IB_ACCESS_REMOTE_READ;
900			cb->bind_attr.mr = cb->start_mr;
901		} else {
902			cb->bind_attr.mw_access_flags = IB_ACCESS_REMOTE_WRITE;
903			cb->bind_attr.mr = cb->rdma_mr;
904		}
905		cb->bind_attr.addr = buf;
906		DEBUG_LOG(cb, "binding mw rkey 0x%x to buf %llx mr rkey 0x%x\n",
907			cb->mw->rkey, buf, cb->bind_attr.mr->rkey);
908		ret = ib_bind_mw(cb->qp, cb->mw, &cb->bind_attr);
909		if (ret) {
910			PRINTF(cb, "bind mw error %d\n", ret);
911			cb->state = ERROR;
912		} else
913			rkey = cb->mw->rkey;
914		break;
915	case MR:
916		if (buf == (u64)cb->start_dma_addr)
917			rkey = cb->start_mr->rkey;
918		else
919			rkey = cb->rdma_mr->rkey;
920		break;
921	case DMA:
922		rkey = cb->dma_mr->rkey;
923		break;
924	default:
925		PRINTF(cb, "%s:%d case ERROR\n", __func__, __LINE__);
926		cb->state = ERROR;
927		break;
928	}
929	return rkey;
930}
931
932static void krping_format_send(struct krping_cb *cb, u64 buf)
933{
934	struct krping_rdma_info *info = &cb->send_buf;
935	u32 rkey;
936
937	/*
938	 * Client side will do fastreg or mw bind before
939	 * advertising the rdma buffer.  Server side
940	 * sends have no data.
941	 */
942	if (!cb->server || cb->wlat || cb->rlat || cb->bw) {
943		rkey = krping_rdma_rkey(cb, buf, !cb->server_invalidate);
944		info->buf = htonll(buf);
945		info->rkey = htonl(rkey);
946		info->size = htonl(cb->size);
947		DEBUG_LOG(cb, "RDMA addr %llx rkey %x len %d\n",
948			  (unsigned long long)buf, rkey, cb->size);
949	}
950}
951
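/*
 * Server side of the ping/pong loop described at the top of the file:
 * wait for the client's source advertisement, RDMA READ the ping data,
 * send a go-ahead, wait for the sink advertisement, RDMA WRITE the data
 * back and send another go-ahead, optionally invalidating the client's
 * rkey with SEND_WITH_INV.
 */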
952static void krping_test_server(struct krping_cb *cb)
953{
954	struct ib_send_wr *bad_wr, inv;
955	int ret;
956
957	while (1) {
958		/* Wait for client's Start STAG/TO/Len */
959		wait_event_interruptible(cb->sem, cb->state >= RDMA_READ_ADV);
960		if (cb->state != RDMA_READ_ADV) {
961			PRINTF(cb, "wait for RDMA_READ_ADV state %d\n",
962				cb->state);
963			break;
964		}
965
966		DEBUG_LOG(cb, "server received sink adv\n");
967
968		cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
969		cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
970		cb->rdma_sq_wr.sg_list->length = cb->remote_len;
971		cb->rdma_sgl.lkey = krping_rdma_rkey(cb, cb->rdma_dma_addr, 1);
972		cb->rdma_sq_wr.next = NULL;
973
974		/* Issue RDMA Read. */
975		if (cb->read_inv)
976			cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ_WITH_INV;
977		else {
978
979			cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ;
980			if (cb->mem == FASTREG) {
981				/*
982				 * Immediately follow the read with a
983				 * fenced LOCAL_INV.
984				 */
985				cb->rdma_sq_wr.next = &inv;
986				memset(&inv, 0, sizeof inv);
987				inv.opcode = IB_WR_LOCAL_INV;
988				inv.ex.invalidate_rkey = cb->fastreg_mr->rkey;
989				inv.send_flags = IB_SEND_FENCE;
990			}
991		}
992
993		ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
994		if (ret) {
995			PRINTF(cb, "post send error %d\n", ret);
996			break;
997		}
998		cb->rdma_sq_wr.next = NULL;
999		DEBUG_LOG(cb, "server posted rdma read req\n");
1000		DEBUG_LOG(cb, "server posted rdma read req \n");
1001
1002		/* Wait for read completion */
1003		wait_event_interruptible(cb->sem,
1004					 cb->state >= RDMA_READ_COMPLETE);
1005		if (cb->state != RDMA_READ_COMPLETE) {
1006			PRINTF(cb,
1007			       "wait for RDMA_READ_COMPLETE state %d\n",
1008			       cb->state);
1009			break;
1010		}
1011		DEBUG_LOG(cb, "server received read complete\n");
1012
1013		/* Display data in recv buf */
1014		if (cb->verbose)
1015			PRINTF(cb, "server ping data: %s\n",
1016				cb->rdma_buf);
1017
1018		/* Tell client to continue */
1019		if (cb->server && cb->server_invalidate) {
1020			cb->sq_wr.ex.invalidate_rkey = cb->remote_rkey;
1021			cb->sq_wr.opcode = IB_WR_SEND_WITH_INV;
1022			DEBUG_LOG(cb, "send-w-inv rkey 0x%x\n", cb->remote_rkey);
1023		}
1024		ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1025		if (ret) {
1026			PRINTF(cb, "post send error %d\n", ret);
1027			break;
1028		}
1029		DEBUG_LOG(cb, "server posted go ahead\n");
1030
1031		/* Wait for client's RDMA STAG/TO/Len */
1032		wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_ADV);
1033		if (cb->state != RDMA_WRITE_ADV) {
1034			PRINTF(cb,
1035			       "wait for RDMA_WRITE_ADV state %d\n",
1036			       cb->state);
1037			break;
1038		}
1039		DEBUG_LOG(cb, "server received sink adv\n");
1040
1041		/* RDMA Write echo data */
1042		cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
1043		cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1044		cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1045		cb->rdma_sq_wr.sg_list->length = strlen(cb->rdma_buf) + 1;
1046		if (cb->local_dma_lkey)
1047			cb->rdma_sgl.lkey = cb->qp->device->local_dma_lkey;
1048		else
1049			cb->rdma_sgl.lkey = krping_rdma_rkey(cb, cb->rdma_dma_addr, 0);
1050
1051		DEBUG_LOG(cb, "rdma write from lkey %x laddr %llx len %d\n",
1052			  cb->rdma_sq_wr.sg_list->lkey,
1053			  (unsigned long long)cb->rdma_sq_wr.sg_list->addr,
1054			  cb->rdma_sq_wr.sg_list->length);
1055
1056		ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
1057		if (ret) {
1058			PRINTF(cb, "post send error %d\n", ret);
1059			break;
1060		}
1061
1062		/* Wait for completion */
1063		ret = wait_event_interruptible(cb->sem, cb->state >=
1064							 RDMA_WRITE_COMPLETE);
1065		if (cb->state != RDMA_WRITE_COMPLETE) {
1066			PRINTF(cb,
1067			       "wait for RDMA_WRITE_COMPLETE state %d\n",
1068			       cb->state);
1069			break;
1070		}
1071		DEBUG_LOG(cb, "server rdma write complete\n");
1072
1073		cb->state = CONNECTED;
1074
1075		/* Tell client to begin again */
1076		if (cb->server && cb->server_invalidate) {
1077			cb->sq_wr.ex.invalidate_rkey = cb->remote_rkey;
1078			cb->sq_wr.opcode = IB_WR_SEND_WITH_INV;
1079			DEBUG_LOG(cb, "send-w-inv rkey 0x%x\n", cb->remote_rkey);
1080		}
1081		ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1082		if (ret) {
1083			PRINTF(cb, "post send error %d\n", ret);
1084			break;
1085		}
1086		DEBUG_LOG(cb, "server posted go ahead\n");
1087	}
1088}
1089
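/*
 * Read-latency loop: post one RDMA READ of cb->size bytes at a time and
 * wait for its completion (busy-polling the CQ or sleeping on the event
 * handler, depending on the poll option), for cb->count iterations, then
 * report the elapsed wall-clock time.
 */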
1090static void rlat_test(struct krping_cb *cb)
1091{
1092	int scnt;
1093	int iters = cb->count;
1094	struct timeval start_tv, stop_tv;
1095	int ret;
1096	struct ib_wc wc;
1097	struct ib_send_wr *bad_wr;
1098	int ne;
1099
1100	scnt = 0;
1101	cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ;
1102	cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1103	cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1104	cb->rdma_sq_wr.sg_list->length = cb->size;
1105
1106	microtime(&start_tv);
1107	if (!cb->poll) {
1108		cb->state = RDMA_READ_ADV;
1109		ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
1110	}
1111	while (scnt < iters) {
1112
1113		cb->state = RDMA_READ_ADV;
1114		ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
1115		if (ret) {
1116			PRINTF(cb,
1117				"Couldn't post send: ret=%d scnt %d\n",
1118				ret, scnt);
1119			return;
1120		}
1121
1122		do {
1123			if (!cb->poll) {
1124				wait_event_interruptible(cb->sem,
1125					cb->state != RDMA_READ_ADV);
1126				if (cb->state == RDMA_READ_COMPLETE) {
1127					ne = 1;
1128					ib_req_notify_cq(cb->cq,
1129						IB_CQ_NEXT_COMP);
1130				} else {
1131					ne = -1;
1132				}
1133			} else
1134				ne = ib_poll_cq(cb->cq, 1, &wc);
1135			if (cb->state == ERROR) {
1136				PRINTF(cb,
1137					"state == ERROR...bailing scnt %d\n",
1138					scnt);
1139				return;
1140			}
1141		} while (ne == 0);
1142
1143		if (ne < 0) {
1144			PRINTF(cb, "poll CQ failed %d\n", ne);
1145			return;
1146		}
1147		if (cb->poll && wc.status != IB_WC_SUCCESS) {
1148			PRINTF(cb, "Completion with error at %s:\n",
1149				cb->server ? "server" : "client");
1150			PRINTF(cb, "Failed status %d: wr_id %d\n",
1151				wc.status, (int) wc.wr_id);
1152			return;
1153		}
1154		++scnt;
1155	}
1156	microtime(&stop_tv);
1157
1158        if (stop_tv.tv_usec < start_tv.tv_usec) {
1159                stop_tv.tv_usec += 1000000;
1160                stop_tv.tv_sec  -= 1;
1161        }
1162
1163	PRINTF(cb, "delta sec %lu delta usec %lu iter %d size %d\n",
1164		stop_tv.tv_sec - start_tv.tv_sec,
1165		stop_tv.tv_usec - start_tv.tv_usec,
1166		scnt, cb->size);
1167}
1168
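/*
 * Write-latency ping/pong: each side spins on its local buffer until the
 * peer's RDMA WRITE lands, then posts its own write back.  Cycle counts
 * for posting and polling are collected for the first cycle_iters (at
 * most 1000) iterations and summed in the final report.
 */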
1169static void wlat_test(struct krping_cb *cb)
1170{
1171	int ccnt, scnt, rcnt;
1172	int iters=cb->count;
1173	volatile char *poll_buf = (char *) cb->start_buf;
1174	char *buf = (char *)cb->rdma_buf;
1175	struct timeval start_tv, stop_tv;
1176	cycles_t *post_cycles_start, *post_cycles_stop;
1177	cycles_t *poll_cycles_start, *poll_cycles_stop;
1178	cycles_t *last_poll_cycles_start;
1179	cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0;
1180	int i;
1181	int cycle_iters = 1000;
1182
1183	ccnt = 0;
1184	scnt = 0;
1185	rcnt = 0;
1186
1187	post_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1188	if (!post_cycles_start) {
1189		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1190		return;
1191	}
1192	post_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1193	if (!post_cycles_stop) {
1194		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1195		return;
1196	}
1197	poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1198	if (!poll_cycles_start) {
1199		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1200		return;
1201	}
1202	poll_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1203	if (!poll_cycles_stop) {
1204		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1205		return;
1206	}
1207	last_poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t),
1208		GFP_KERNEL);
1209	if (!last_poll_cycles_start) {
1210		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1211		return;
1212	}
1213	cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
1214	cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1215	cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1216	cb->rdma_sq_wr.sg_list->length = cb->size;
1217
1218	if (cycle_iters > iters)
1219		cycle_iters = iters;
1220	microtime(&start_tv);
1221	while (scnt < iters || ccnt < iters || rcnt < iters) {
1222
1223		/* Wait till buffer changes. */
1224		if (rcnt < iters && !(scnt < 1 && !cb->server)) {
1225			++rcnt;
1226			while (*poll_buf != (char)rcnt) {
1227				if (cb->state == ERROR) {
1228					PRINTF(cb,
1229						"state = ERROR, bailing\n");
1230					return;
1231				}
1232			}
1233		}
1234
1235		if (scnt < iters) {
1236			struct ib_send_wr *bad_wr;
1237
1238			*buf = (char)scnt+1;
1239			if (scnt < cycle_iters)
1240				post_cycles_start[scnt] = get_cycles();
1241			if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
1242				PRINTF(cb,
1243					"Couldn't post send: scnt=%d\n",
1244					scnt);
1245				return;
1246			}
1247			if (scnt < cycle_iters)
1248				post_cycles_stop[scnt] = get_cycles();
1249			scnt++;
1250		}
1251
1252		if (ccnt < iters) {
1253			struct ib_wc wc;
1254			int ne;
1255
1256			if (ccnt < cycle_iters)
1257				poll_cycles_start[ccnt] = get_cycles();
1258			do {
1259				if (ccnt < cycle_iters)
1260					last_poll_cycles_start[ccnt] =
1261						get_cycles();
1262				ne = ib_poll_cq(cb->cq, 1, &wc);
1263			} while (ne == 0);
1264			if (ccnt < cycle_iters)
1265				poll_cycles_stop[ccnt] = get_cycles();
1266			++ccnt;
1267
1268			if (ne < 0) {
1269				PRINTF(cb, "poll CQ failed %d\n", ne);
1270				return;
1271			}
1272			if (wc.status != IB_WC_SUCCESS) {
1273				PRINTF(cb,
1274					"Completion with error at %s:\n",
1275					cb->server ? "server" : "client");
1276				PRINTF(cb,
1277					"Failed status %d: wr_id %d\n",
1278					wc.status, (int) wc.wr_id);
1279				PRINTF(cb,
1280					"scnt=%d, rcnt=%d, ccnt=%d\n",
1281					scnt, rcnt, ccnt);
1282				return;
1283			}
1284		}
1285	}
1286	microtime(&stop_tv);
1287
1288        if (stop_tv.tv_usec < start_tv.tv_usec) {
1289                stop_tv.tv_usec += 1000000;
1290                stop_tv.tv_sec  -= 1;
1291        }
1292
1293	for (i=0; i < cycle_iters; i++) {
1294		sum_post += post_cycles_stop[i] - post_cycles_start[i];
1295		sum_poll += poll_cycles_stop[i] - poll_cycles_start[i];
1296		sum_last_poll += poll_cycles_stop[i]-last_poll_cycles_start[i];
1297	}
1298	PRINTF(cb,
1299		"delta sec %lu delta usec %lu iter %d size %d cycle_iters %d"
1300		" sum_post %llu sum_poll %llu sum_last_poll %llu\n",
1301		stop_tv.tv_sec - start_tv.tv_sec,
1302		stop_tv.tv_usec - start_tv.tv_usec,
1303		scnt, cb->size, cycle_iters,
1304		(unsigned long long)sum_post, (unsigned long long)sum_poll,
1305		(unsigned long long)sum_last_poll);
1306	kfree(post_cycles_start);
1307	kfree(post_cycles_stop);
1308	kfree(poll_cycles_start);
1309	kfree(poll_cycles_stop);
1310	kfree(last_poll_cycles_start);
1311}
1312
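/*
 * Bandwidth loop: keep up to txdepth RDMA WRITEs of cb->size bytes
 * outstanding, reaping one completion at a time, until cb->count writes
 * have been posted and completed; the same cycle bookkeeping as in
 * wlat_test() is reported at the end.
 */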
1313static void bw_test(struct krping_cb *cb)
1314{
1315	int ccnt, scnt, rcnt;
1316	int iters=cb->count;
1317	struct timeval start_tv, stop_tv;
1318	cycles_t *post_cycles_start, *post_cycles_stop;
1319	cycles_t *poll_cycles_start, *poll_cycles_stop;
1320	cycles_t *last_poll_cycles_start;
1321	cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0;
1322	int i;
1323	int cycle_iters = 1000;
1324
1325	ccnt = 0;
1326	scnt = 0;
1327	rcnt = 0;
1328
1329	post_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1330	if (!post_cycles_start) {
1331		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1332		return;
1333	}
1334	post_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1335	if (!post_cycles_stop) {
1336		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1337		return;
1338	}
1339	poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1340	if (!poll_cycles_start) {
1341		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1342		return;
1343	}
1344	poll_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1345	if (!poll_cycles_stop) {
1346		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1347		return;
1348	}
1349	last_poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t),
1350		GFP_KERNEL);
1351	if (!last_poll_cycles_start) {
1352		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1353		return;
1354	}
1355	cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
1356	cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1357	cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1358	cb->rdma_sq_wr.sg_list->length = cb->size;
1359
1360	if (cycle_iters > iters)
1361		cycle_iters = iters;
1362	microtime(&start_tv);
1363	while (scnt < iters || ccnt < iters) {
1364
1365		while (scnt < iters && scnt - ccnt < cb->txdepth) {
1366			struct ib_send_wr *bad_wr;
1367
1368			if (scnt < cycle_iters)
1369				post_cycles_start[scnt] = get_cycles();
1370			if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
1371				PRINTF(cb,
1372					"Couldn't post send: scnt=%d\n",
1373					scnt);
1374				return;
1375			}
1376			if (scnt < cycle_iters)
1377				post_cycles_stop[scnt] = get_cycles();
1378			++scnt;
1379		}
1380
1381		if (ccnt < iters) {
1382			int ne;
1383			struct ib_wc wc;
1384
1385			if (ccnt < cycle_iters)
1386				poll_cycles_start[ccnt] = get_cycles();
1387			do {
1388				if (ccnt < cycle_iters)
1389					last_poll_cycles_start[ccnt] =
1390						get_cycles();
1391				ne = ib_poll_cq(cb->cq, 1, &wc);
1392			} while (ne == 0);
1393			if (ccnt < cycle_iters)
1394				poll_cycles_stop[ccnt] = get_cycles();
1395			ccnt += 1;
1396
1397			if (ne < 0) {
1398				PRINTF(cb, "poll CQ failed %d\n", ne);
1399				return;
1400			}
1401			if (wc.status != IB_WC_SUCCESS) {
1402				PRINTF(cb,
1403					"Completion with error at %s:\n",
1404					cb->server ? "server" : "client");
1405				PRINTF(cb,
1406					"Failed status %d: wr_id %d\n",
1407					wc.status, (int) wc.wr_id);
1408				return;
1409			}
1410		}
1411	}
1412	microtime(&stop_tv);
1413
1414        if (stop_tv.tv_usec < start_tv.tv_usec) {
1415                stop_tv.tv_usec += 1000000;
1416                stop_tv.tv_sec  -= 1;
1417        }
1418
1419	for (i=0; i < cycle_iters; i++) {
1420		sum_post += post_cycles_stop[i] - post_cycles_start[i];
1421		sum_poll += poll_cycles_stop[i] - poll_cycles_start[i];
1422		sum_last_poll += poll_cycles_stop[i]-last_poll_cycles_start[i];
1423	}
1424	PRINTF(cb,
1425		"delta sec %lu delta usec %lu iter %d size %d cycle_iters %d"
1426		" sum_post %llu sum_poll %llu sum_last_poll %llu\n",
1427		stop_tv.tv_sec - start_tv.tv_sec,
1428		stop_tv.tv_usec - start_tv.tv_usec,
1429		scnt, cb->size, cycle_iters,
1430		(unsigned long long)sum_post, (unsigned long long)sum_poll,
1431		(unsigned long long)sum_last_poll);
1432	kfree(post_cycles_start);
1433	kfree(post_cycles_stop);
1434	kfree(poll_cycles_start);
1435	kfree(poll_cycles_stop);
1436	kfree(last_poll_cycles_start);
1437}
1438
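/*
 * Server halves of the rlat/wlat/bw tests: spin until the client's
 * advertisement arrives, advertise the local start buffer in return,
 * then either sit passively (rlat, half-duplex bw) or run the matching
 * test loop (wlat, duplex bw) until the connection errors out.
 */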
1439static void krping_rlat_test_server(struct krping_cb *cb)
1440{
1441	struct ib_send_wr *bad_wr;
1442	struct ib_wc wc;
1443	int ret;
1444
1445	/* Spin waiting for client's Start STAG/TO/Len */
1446	while (cb->state < RDMA_READ_ADV) {
1447		krping_cq_event_handler(cb->cq, cb);
1448	}
1449
1450	/* Send STAG/TO/Len to client */
1451	krping_format_send(cb, cb->start_dma_addr);
1452	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1453	if (ret) {
1454		PRINTF(cb, "post send error %d\n", ret);
1455		return;
1456	}
1457
1458	/* Spin waiting for send completion */
1459	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0);
1460	if (ret < 0) {
1461		PRINTF(cb, "poll error %d\n", ret);
1462		return;
1463	}
1464	if (wc.status) {
1465		PRINTF(cb, "send completion error %d\n", wc.status);
1466		return;
1467	}
1468
1469	wait_event_interruptible(cb->sem, cb->state == ERROR);
1470}
1471
1472static void krping_wlat_test_server(struct krping_cb *cb)
1473{
1474	struct ib_send_wr *bad_wr;
1475	struct ib_wc wc;
1476	int ret;
1477
1478	/* Spin waiting for client's Start STAG/TO/Len */
1479	while (cb->state < RDMA_READ_ADV) {
1480		krping_cq_event_handler(cb->cq, cb);
1481	}
1482
1483	/* Send STAG/TO/Len to client */
1484	krping_format_send(cb, cb->start_dma_addr);
1485	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1486	if (ret) {
1487		PRINTF(cb, "post send error %d\n", ret);
1488		return;
1489	}
1490
1491	/* Spin waiting for send completion */
1492	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0);
1493	if (ret < 0) {
1494		PRINTF(cb, "poll error %d\n", ret);
1495		return;
1496	}
1497	if (wc.status) {
1498		PRINTF(cb, "send completion error %d\n", wc.status);
1499		return;
1500	}
1501
1502	wlat_test(cb);
1503	wait_event_interruptible(cb->sem, cb->state == ERROR);
1504}
1505
1506static void krping_bw_test_server(struct krping_cb *cb)
1507{
1508	struct ib_send_wr *bad_wr;
1509	struct ib_wc wc;
1510	int ret;
1511
1512	/* Spin waiting for client's Start STAG/TO/Len */
1513	while (cb->state < RDMA_READ_ADV) {
1514		krping_cq_event_handler(cb->cq, cb);
1515	}
1516
1517	/* Send STAG/TO/Len to client */
1518	krping_format_send(cb, cb->start_dma_addr);
1519	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1520	if (ret) {
1521		PRINTF(cb, "post send error %d\n", ret);
1522		return;
1523	}
1524
1525	/* Spin waiting for send completion */
1526	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0);
1527	if (ret < 0) {
1528		PRINTF(cb, "poll error %d\n", ret);
1529		return;
1530	}
1531	if (wc.status) {
1532		PRINTF(cb, "send completion error %d\n", wc.status);
1533		return;
1534	}
1535
1536	if (cb->duplex)
1537		bw_test(cb);
1538	wait_event_interruptible(cb->sem, cb->state == ERROR);
1539}
1540
1541static int fastreg_supported(struct krping_cb *cb)
1542{
1543	struct ib_device *dev = cb->child_cm_id->device;
1544	struct ib_device_attr attr;
1545	int ret;
1546
1547	ret = ib_query_device(dev, &attr);
1548	if (ret) {
1549		PRINTF(cb, "ib_query_device failed ret %d\n", ret);
1550		return 0;
1551	}
1552	if (!(attr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) {
1553		PRINTF(cb, "Fastreg not supported - device_cap_flags 0x%x\n",
1554		    attr.device_cap_flags);
1555		return 0;
1556	}
1557	DEBUG_LOG(cb, "Fastreg supported - device_cap_flags 0x%x\n",
1558		attr.device_cap_flags);
1559	return 1;
1560}
1561
1562static int krping_bind_server(struct krping_cb *cb)
1563{
1564	struct sockaddr_in sin;
1565	int ret;
1566
1567	memset(&sin, 0, sizeof(sin));
1568	sin.sin_len = sizeof sin;
1569	sin.sin_family = AF_INET;
1570	sin.sin_addr.s_addr = cb->addr.s_addr;
1571	sin.sin_port = cb->port;
1572
1573	ret = rdma_bind_addr(cb->cm_id, (struct sockaddr *) &sin);
1574	if (ret) {
1575		PRINTF(cb, "rdma_bind_addr error %d\n", ret);
1576		return ret;
1577	}
1578	DEBUG_LOG(cb, "rdma_bind_addr successful\n");
1579
1580	DEBUG_LOG(cb, "rdma_listen\n");
1581	ret = rdma_listen(cb->cm_id, 3);
1582	if (ret) {
1583		PRINTF(cb, "rdma_listen failed: %d\n", ret);
1584		return ret;
1585	}
1586
1587	wait_event_interruptible(cb->sem, cb->state >= CONNECT_REQUEST);
1588	if (cb->state != CONNECT_REQUEST) {
1589		PRINTF(cb, "wait for CONNECT_REQUEST state %d\n",
1590			cb->state);
1591		return -1;
1592	}
1593
1594	if (cb->mem == FASTREG && !fastreg_supported(cb))
1595		return -EINVAL;
1596
1597	return 0;
1598}
1599
1600static void krping_run_server(struct krping_cb *cb)
1601{
1602	struct ib_recv_wr *bad_wr;
1603	int ret;
1604
1605	ret = krping_bind_server(cb);
1606	if (ret)
1607		return;
1608
1609	ret = krping_setup_qp(cb, cb->child_cm_id);
1610	if (ret) {
1611		PRINTF(cb, "setup_qp failed: %d\n", ret);
1612		goto err0;
1613	}
1614
1615	ret = krping_setup_buffers(cb);
1616	if (ret) {
1617		PRINTF(cb, "krping_setup_buffers failed: %d\n", ret);
1618		goto err1;
1619	}
1620
1621	ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
1622	if (ret) {
1623		PRINTF(cb, "ib_post_recv failed: %d\n", ret);
1624		goto err2;
1625	}
1626
1627	ret = krping_accept(cb);
1628	if (ret) {
1629		PRINTF(cb, "connect error %d\n", ret);
1630		goto err2;
1631	}
1632
1633	if (cb->wlat)
1634		krping_wlat_test_server(cb);
1635	else if (cb->rlat)
1636		krping_rlat_test_server(cb);
1637	else if (cb->bw)
1638		krping_bw_test_server(cb);
1639	else
1640		krping_test_server(cb);
1641	rdma_disconnect(cb->child_cm_id);
1642err2:
1643	krping_free_buffers(cb);
1644err1:
1645	krping_free_qp(cb);
1646err0:
1647	rdma_destroy_id(cb->child_cm_id);
1648}
1649
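/*
 * Client side of the ping/pong loop: fill start_buf with an ASCII
 * pattern, advertise it as the read source, wait for the server's
 * go-ahead, advertise rdma_buf as the write sink, wait for the write to
 * complete, then optionally validate and print the echoed data.
 */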
1650static void krping_test_client(struct krping_cb *cb)
1651{
1652	int ping, start, cc, i, ret;
1653	struct ib_send_wr *bad_wr;
1654	unsigned char c;
1655
1656	start = 65;
1657	for (ping = 0; !cb->count || ping < cb->count; ping++) {
1658		cb->state = RDMA_READ_ADV;
1659
1660		/* Put some ascii text in the buffer. */
1661		cc = sprintf(cb->start_buf, "rdma-ping-%d: ", ping);
1662		for (i = cc, c = start; i < cb->size; i++) {
1663			cb->start_buf[i] = c;
1664			c++;
1665			if (c > 122)
1666				c = 65;
1667		}
1668		start++;
1669		if (start > 122)
1670			start = 65;
1671		cb->start_buf[cb->size - 1] = 0;
1672
1673		krping_format_send(cb, cb->start_dma_addr);
1674		if (cb->state == ERROR) {
1675			PRINTF(cb, "krping_format_send failed\n");
1676			break;
1677		}
1678		ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1679		if (ret) {
1680			PRINTF(cb, "post send error %d\n", ret);
1681			break;
1682		}
1683
1684		/* Wait for server to ACK */
1685		wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_ADV);
1686		if (cb->state != RDMA_WRITE_ADV) {
1687			PRINTF(cb,
1688			       "wait for RDMA_WRITE_ADV state %d\n",
1689			       cb->state);
1690			break;
1691		}
1692
1693		krping_format_send(cb, cb->rdma_dma_addr);
1694		ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1695		if (ret) {
1696			PRINTF(cb, "post send error %d\n", ret);
1697			break;
1698		}
1699
1700		/* Wait for the server to say the RDMA Write is complete. */
1701		wait_event_interruptible(cb->sem,
1702					 cb->state >= RDMA_WRITE_COMPLETE);
1703		if (cb->state != RDMA_WRITE_COMPLETE) {
1704			PRINTF(cb,
1705			       "wait for RDMA_WRITE_COMPLETE state %d\n",
1706			       cb->state);
1707			break;
1708		}
1709
1710		if (cb->validate)
1711			if (memcmp(cb->start_buf, cb->rdma_buf, cb->size)) {
1712				PRINTF(cb, "data mismatch!\n");
1713				break;
1714			}
1715
1716		if (cb->verbose)
1717			PRINTF(cb, "ping data: %s\n", cb->rdma_buf);
1718#ifdef SLOW_KRPING
1719		wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
1720#endif
1721	}
1722}
1723
1724static void krping_rlat_test_client(struct krping_cb *cb)
1725{
1726	struct ib_send_wr *bad_wr;
1727	struct ib_wc wc;
1728	int ret;
1729
1730	cb->state = RDMA_READ_ADV;
1731
1732	/* Send STAG/TO/Len to server */
1733	krping_format_send(cb, cb->start_dma_addr);
1734	if (cb->state == ERROR) {
1735		PRINTF(cb, "krping_format_send failed\n");
1736		return;
1737	}
1738	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1739	if (ret) {
1740		PRINTF(cb, "post send error %d\n", ret);
1741		return;
1742	}
1743
1744	/* Spin waiting for send completion */
1745	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0);
1746	if (ret < 0) {
1747		PRINTF(cb, "poll error %d\n", ret);
1748		return;
1749	}
1750	if (wc.status) {
1751		PRINTF(cb, "send completion error %d\n", wc.status);
1752		return;
1753	}
1754
1755	/* Spin waiting for server's Start STAG/TO/Len */
1756	while (cb->state < RDMA_WRITE_ADV) {
1757		krping_cq_event_handler(cb->cq, cb);
1758	}
1759
1760#if 0
1761{
1762	int i;
1763	struct timeval start, stop;
1764	time_t sec;
1765	suseconds_t usec;
1766	unsigned long long elapsed;
1767	struct ib_wc wc;
1768	struct ib_send_wr *bad_wr;
1769	int ne;
1770
1771	cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
1772	cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1773	cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1774	cb->rdma_sq_wr.sg_list->length = 0;
1775	cb->rdma_sq_wr.num_sge = 0;
1776
1777	microtime(&start);
1778	for (i=0; i < 100000; i++) {
1779		if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
1780			PRINTF(cb, "Couldn't post send\n");
1781			return;
1782		}
1783		do {
1784			ne = ib_poll_cq(cb->cq, 1, &wc);
1785		} while (ne == 0);
1786		if (ne < 0) {
1787			PRINTF(cb, "poll CQ failed %d\n", ne);
1788			return;
1789		}
1790		if (wc.status != IB_WC_SUCCESS) {
1791			PRINTF(cb, "Completion with error at %s:\n",
1792				cb->server ? "server" : "client");
1793			PRINTF(cb, "Failed status %d: wr_id %d\n",
1794				wc.status, (int) wc.wr_id);
1795			return;
1796		}
1797	}
1798	microtime(&stop);
1799
1800	if (stop.tv_usec < start.tv_usec) {
1801		stop.tv_usec += 1000000;
1802		stop.tv_sec  -= 1;
1803	}
1804	sec     = stop.tv_sec - start.tv_sec;
1805	usec    = stop.tv_usec - start.tv_usec;
1806	elapsed = sec * 1000000 + usec;
1807	PRINTF(cb, "0B-write-lat iters 100000 usec %llu\n", elapsed);
1808}
1809#endif
1810
1811	rlat_test(cb);
1812}
1813
1814static void krping_wlat_test_client(struct krping_cb *cb)
1815{
1816	struct ib_send_wr *bad_wr;
1817	struct ib_wc wc;
1818	int ret;
1819
1820	cb->state = RDMA_READ_ADV;
1821
1822	/* Send STAG/TO/Len to server */
1823	krping_format_send(cb, cb->start_dma_addr);
1824	if (cb->state == ERROR) {
1825		PRINTF(cb, "krping_format_send failed\n");
1826		return;
1827	}
1828	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1829	if (ret) {
1830		PRINTF(cb, "post send error %d\n", ret);
1831		return;
1832	}
1833
1834	/* Spin waiting for send completion */
1835	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0);
1836	if (ret < 0) {
1837		PRINTF(cb, "poll error %d\n", ret);
1838		return;
1839	}
1840	if (wc.status) {
1841		PRINTF(cb, "send completion error %d\n", wc.status);
1842		return;
1843	}
1844
1845	/* Spin waiting for server's Start STAG/TO/Len */
1846	while (cb->state < RDMA_WRITE_ADV) {
1847		krping_cq_event_handler(cb->cq, cb);
1848	}
1849
1850	wlat_test(cb);
1851}
1852
1853static void krping_bw_test_client(struct krping_cb *cb)
1854{
1855	struct ib_send_wr *bad_wr;
1856	struct ib_wc wc;
1857	int ret;
1858
1859	cb->state = RDMA_READ_ADV;
1860
1861	/* Send STAG/TO/Len to server */
1862	krping_format_send(cb, cb->start_dma_addr);
1863	if (cb->state == ERROR) {
1864		PRINTF(cb, "krping_format_send failed\n");
1865		return;
1866	}
1867	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1868	if (ret) {
1869		PRINTF(cb, "post send error %d\n", ret);
1870		return;
1871	}
1872
1873	/* Spin waiting for send completion */
1874	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0);
1875	if (ret < 0) {
1876		PRINTF(cb, "poll error %d\n", ret);
1877		return;
1878	}
1879	if (wc.status) {
1880		PRINTF(cb, "send completion error %d\n", wc.status);
1881		return;
1882	}
1883
1884	/* Spin waiting for server's Start STAG/TO/Len */
1885	while (cb->state < RDMA_WRITE_ADV) {
1886		krping_cq_event_handler(cb->cq, cb);
1887	}
1888
1889	bw_test(cb);
1890}
1891
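/*
 * Fastreg stress test (the fr option): repeatedly post FAST_REG_MR +
 * LOCAL_INV chains with pseudo-randomly sized page lists, keeping up to
 * txdepth/2 of them outstanding, and count completions, pausing roughly
 * every 9 seconds of uptime.  The page-list entries are dummy addresses;
 * no data is transferred.
 */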
1892static void krping_fr_test(struct krping_cb *cb)
1893{
1894	struct ib_fast_reg_page_list *pl;
1895	struct ib_send_wr fr, inv, *bad;
1896	struct ib_wc wc;
1897	u8 key = 0;
1898	struct ib_mr *mr;
1899	int i;
1900	int ret;
1901	int size = cb->size;
1902	int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT;
1903	time_t start;
1904	int count = 0;
1905	int scnt = 0;
1906
1907	pl = ib_alloc_fast_reg_page_list(cb->qp->device, plen);
1908	if (IS_ERR(pl)) {
1909		PRINTF(cb, "ib_alloc_fast_reg_page_list failed %ld\n", PTR_ERR(pl));
1910		return;
1911	}
1912
1913	mr = ib_alloc_fast_reg_mr(cb->pd, plen);
1914	if (IS_ERR(mr)) {
1915		PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(mr));
1916		goto err1;
1917	}
1918
1919	for (i=0; i<plen; i++)
1920		pl->page_list[i] = 0xcafebabe | i;
1921
1922	memset(&fr, 0, sizeof fr);
1923	fr.opcode = IB_WR_FAST_REG_MR;
1924	fr.wr.fast_reg.page_shift = PAGE_SHIFT;
1925	fr.wr.fast_reg.length = size;
1926	fr.wr.fast_reg.page_list = pl;
1927	fr.wr.fast_reg.page_list_len = plen;
1928	fr.wr.fast_reg.iova_start = 0;
1929	fr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
1930	fr.next = &inv;
1931	memset(&inv, 0, sizeof inv);
1932	inv.opcode = IB_WR_LOCAL_INV;
1933	inv.send_flags = IB_SEND_SIGNALED;
1934
1935	DEBUG_LOG(cb, "fr_test: stag index 0x%x plen %u size %u depth %u\n", mr->rkey >> 8, plen, cb->size, cb->txdepth);
1936	start = time_uptime;
1937	while (1) {
1938		if ((time_uptime - start) >= 9) {
1939			DEBUG_LOG(cb, "fr_test: pausing 1 second! count %u latest size %u plen %u\n", count, size, plen);
1940			wait_event_interruptible(cb->sem, cb->state == ERROR);
1941			if (cb->state == ERROR)
1942				break;
1943			start = time_uptime;
1944		}
1945		while (scnt < (cb->txdepth>>1)) {
1946			ib_update_fast_reg_key(mr, ++key);
1947			fr.wr.fast_reg.rkey = mr->rkey;
1948			inv.ex.invalidate_rkey = mr->rkey;
1949			size = arc4random() % cb->size;
1950			if (size == 0)
1951				size = cb->size;
1952			plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT;
1953			fr.wr.fast_reg.length = size;
1954			fr.wr.fast_reg.page_list_len = plen;
1955			ret = ib_post_send(cb->qp, &fr, &bad);
1956			if (ret) {
1957				PRINTF(cb, "ib_post_send failed %d\n", ret);
1958				goto err2;
1959			}
1960			scnt++;
1961		}
1962
1963		do {
1964			ret = ib_poll_cq(cb->cq, 1, &wc);
1965			if (ret < 0) {
1966				PRINTF(cb, "ib_poll_cq failed %d\n", ret);
1967				goto err2;
1968			}
1969			if (ret == 1) {
1970				if (wc.status) {
1971					PRINTF(cb, "completion error %u\n", wc.status);
1972					goto err2;
1973				}
1974				count++;
1975				scnt--;
1976			}
1977			else if (krping_sigpending()) {
1978				PRINTF(cb, "signal!\n");
1979				goto err2;
1980			}
1981		} while (ret == 1);
1982	}
1983err2:
1984#if 0
1985	DEBUG_LOG(cb, "sleeping 1 second\n");
1986	wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
1987#endif
1988	DEBUG_LOG(cb, "draining the cq...\n");
1989	do {
1990		ret = ib_poll_cq(cb->cq, 1, &wc);
1991		if (ret < 0) {
1992			PRINTF(cb, "ib_poll_cq failed %d\n", ret);
1993			break;
1994		}
1995		if (ret == 1) {
1996			if (wc.status) {
1997				PRINTF(cb, "completion error %u opcode %u\n", wc.status, wc.opcode);
1998			}
1999		}
2000	} while (ret == 1);
2001	DEBUG_LOG(cb, "fr_test: done!\n");
2002	ib_dereg_mr(mr);
2003err1:
2004	ib_free_fast_reg_page_list(pl);
2005}
2006
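/*
 * Initiate the connection with rdma_connect() and wait for the CM event
 * handler to move the state to CONNECTED (or ERROR).
 */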
2007static int krping_connect_client(struct krping_cb *cb)
2008{
2009	struct rdma_conn_param conn_param;
2010	int ret;
2011
2012	memset(&conn_param, 0, sizeof conn_param);
2013	conn_param.responder_resources = 1;
2014	conn_param.initiator_depth = 1;
2015	conn_param.retry_count = 10;
2016
2017	ret = rdma_connect(cb->cm_id, &conn_param);
2018	if (ret) {
2019		PRINTF(cb, "rdma_connect error %d\n", ret);
2020		return ret;
2021	}
2022
2023	wait_event_interruptible(cb->sem, cb->state >= CONNECTED);
2024	if (cb->state == ERROR) {
2025		PRINTF(cb, "wait for CONNECTED state %d\n", cb->state);
2026		return -1;
2027	}
2028
2029	DEBUG_LOG(cb, "rdma_connect successful\n");
2030	return 0;
2031}
2032
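/*
 * Resolve the server address and route, wait for ROUTE_RESOLVED, and
 * verify fastreg support when mem_mode is fastreg.
 */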
2033static int krping_bind_client(struct krping_cb *cb)
2034{
2035	struct sockaddr_in sin;
2036	int ret;
2037
2038	memset(&sin, 0, sizeof(sin));
2039	sin.sin_len = sizeof sin;
2040	sin.sin_family = AF_INET;
2041	sin.sin_addr.s_addr = cb->addr.s_addr;
2042	sin.sin_port = cb->port;
2043
2044	ret = rdma_resolve_addr(cb->cm_id, NULL, (struct sockaddr *) &sin,
2045				2000);
2046	if (ret) {
2047		PRINTF(cb, "rdma_resolve_addr error %d\n", ret);
2048		return ret;
2049	}
2050
2051	wait_event_interruptible(cb->sem, cb->state >= ROUTE_RESOLVED);
2052	if (cb->state != ROUTE_RESOLVED) {
2053		PRINTF(cb,
2054		       "addr/route resolution did not resolve: state %d\n",
2055		       cb->state);
2056		return -EINTR;
2057	}
2058
2059	if (cb->mem == FASTREG && !fastreg_supported(cb))
2060		return -EINVAL;
2061
2062	DEBUG_LOG(cb, "rdma_resolve_addr - rdma_resolve_route successful\n");
2063	return 0;
2064}
2065
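/*
 * Client main path: resolve the address, create the QP and buffers, post
 * the initial receive, connect, run the selected test, then tear down.
 */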
2066static void krping_run_client(struct krping_cb *cb)
2067{
2068	struct ib_recv_wr *bad_wr;
2069	int ret;
2070
2071	ret = krping_bind_client(cb);
2072	if (ret)
2073		return;
2074
2075	ret = krping_setup_qp(cb, cb->cm_id);
2076	if (ret) {
2077		PRINTF(cb, "setup_qp failed: %d\n", ret);
2078		return;
2079	}
2080
2081	ret = krping_setup_buffers(cb);
2082	if (ret) {
2083		PRINTF(cb, "krping_setup_buffers failed: %d\n", ret);
2084		goto err1;
2085	}
2086
2087	ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
2088	if (ret) {
2089		PRINTF(cb, "ib_post_recv failed: %d\n", ret);
2090		goto err2;
2091	}
2092
2093	ret = krping_connect_client(cb);
2094	if (ret) {
2095		PRINTF(cb, "connect error %d\n", ret);
2096		goto err2;
2097	}
2098
2099	if (cb->wlat)
2100		krping_wlat_test_client(cb);
2101	else if (cb->rlat)
2102		krping_rlat_test_client(cb);
2103	else if (cb->bw)
2104		krping_bw_test_client(cb);
2105	else if (cb->frtest)
2106		krping_fr_test(cb);
2107	else
2108		krping_test_client(cb);
2109	rdma_disconnect(cb->cm_id);
2110err2:
2111	krping_free_buffers(cb);
2112err1:
2113	krping_free_qp(cb);
2114}
2115
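/*
 * Parse the option string written by the user, validate the requested
 * combination, create the RDMA CM id, and run the client or server side.
 * The control block stays on krping_cbs for the duration of the run so
 * its statistics can be reported.
 */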
2116int krping_doit(char *cmd, void *cookie)
2117{
2118	struct krping_cb *cb;
2119	int op;
2120	int ret = 0;
2121	char *optarg;
2122	unsigned long optint;
2123
2124	cb = kzalloc(sizeof(*cb), GFP_KERNEL);
2125	if (!cb)
2126		return -ENOMEM;
2127
2128	mutex_lock(&krping_mutex);
2129	list_add_tail(&cb->list, &krping_cbs);
2130	mutex_unlock(&krping_mutex);
2131
2132	cb->cookie = cookie;
2133	cb->server = -1;
2134	cb->state = IDLE;
2135	cb->size = 64;
2136	cb->txdepth = RPING_SQ_DEPTH;
2137	cb->mem = DMA;
2138	init_waitqueue_head(&cb->sem);
2139
2140	while ((op = krping_getopt("krping", &cmd, krping_opts, NULL, &optarg,
2141			      &optint)) != 0) {
2142		switch (op) {
2143		case 'a':
2144			cb->addr_str = optarg;
2145			DEBUG_LOG(cb, "ipaddr (%s)\n", optarg);
2146			if (!inet_aton(optarg, &cb->addr)) {
2147				PRINTF(cb, "bad addr string %s\n",
2148				    optarg);
2149				ret = EINVAL;
2150			}
2151			break;
2152		case 'p':
2153			cb->port = htons(optint);
2154			DEBUG_LOG(cb, "port %d\n", (int)optint);
2155			break;
2156		case 'P':
2157			cb->poll = 1;
2158			DEBUG_LOG(cb, "poll\n");
2159			break;
2160		case 's':
2161			cb->server = 1;
2162			DEBUG_LOG(cb, "server\n");
2163			break;
2164		case 'c':
2165			cb->server = 0;
2166			DEBUG_LOG(cb, "client\n");
2167			break;
2168		case 'S':
2169			cb->size = optint;
2170			if ((cb->size < 1) ||
2171			    (cb->size > RPING_BUFSIZE)) {
2172				PRINTF(cb, "Invalid size %d "
2173				       "(valid range is 1 to %d)\n",
2174				       cb->size, RPING_BUFSIZE);
2175				ret = EINVAL;
2176			} else
2177				DEBUG_LOG(cb, "size %d\n", (int)optint);
2178			break;
2179		case 'C':
2180			cb->count = optint;
2181			if (cb->count < 0) {
2182				PRINTF(cb, "Invalid count %d\n",
2183					cb->count);
2184				ret = EINVAL;
2185			} else
2186				DEBUG_LOG(cb, "count %d\n", (int) cb->count);
2187			break;
2188		case 'v':
2189			cb->verbose++;
2190			DEBUG_LOG(cb, "verbose\n");
2191			break;
2192		case 'V':
2193			cb->validate++;
2194			DEBUG_LOG(cb, "validate data\n");
2195			break;
2196		case 'l':
2197			cb->wlat++;
2198			break;
2199		case 'L':
2200			cb->rlat++;
2201			break;
2202		case 'B':
2203			cb->bw++;
2204			break;
2205		case 'd':
2206			cb->duplex++;
2207			break;
2208		case 'm':
2209			if (!strncmp(optarg, "dma", 3))
2210				cb->mem = DMA;
2211			else if (!strncmp(optarg, "fastreg", 7))
2212				cb->mem = FASTREG;
2213			else if (!strncmp(optarg, "mw", 2))
2214				cb->mem = MW;
2215			else if (!strncmp(optarg, "mr", 2))
2216				cb->mem = MR;
2217			else {
2218				PRINTF(cb, "unknown mem mode %s.  "
2219					"Must be dma, fastreg, mw, or mr\n",
2220					optarg);
2221				ret = -EINVAL;
2222				break;
2223			}
2224			break;
2225		case 'I':
2226			cb->server_invalidate = 1;
2227			break;
2228		case 'T':
2229			cb->txdepth = optint;
2230			DEBUG_LOG(cb, "txdepth %d\n", (int) cb->txdepth);
2231			break;
2232		case 'Z':
2233			cb->local_dma_lkey = 1;
2234			DEBUG_LOG(cb, "using local dma lkey\n");
2235			break;
2236		case 'R':
2237			cb->read_inv = 1;
2238			DEBUG_LOG(cb, "using read-with-inv\n");
2239			break;
2240		case 'f':
2241			cb->frtest = 1;
2242			DEBUG_LOG(cb, "fast-reg test!\n");
2243			break;
2244		default:
2245			PRINTF(cb, "unknown opt %s\n", optarg);
2246			ret = -EINVAL;
2247			break;
2248		}
2249	}
2250	if (ret)
2251		goto out;
2252
2253	if (cb->server == -1) {
2254		PRINTF(cb, "must be either client or server\n");
2255		ret = -EINVAL;
2256		goto out;
2257	}
2258
2259	if (cb->server && cb->frtest) {
2260		PRINTF(cb, "must be client to run frtest\n");
2261		ret = -EINVAL;
2262		goto out;
2263	}
2264
2265	if ((cb->frtest + cb->bw + cb->rlat + cb->wlat) > 1) {
2266		PRINTF(cb, "Pick only one test: fr, bw, rlat, wlat\n");
2267		ret = -EINVAL;
2268		goto out;
2269	}
2270
2271	if (cb->server_invalidate && cb->mem != FASTREG) {
2272		PRINTF(cb, "server_invalidate only valid with fastreg mem_mode\n");
2273		ret = -EINVAL;
2274		goto out;
2275	}
2276
2277	if (cb->read_inv && cb->mem != FASTREG) {
2278		PRINTF(cb, "read_inv only valid with fastreg mem_mode\n");
2279		ret = -EINVAL;
2280		goto out;
2281	}
2282
2283	if (cb->mem != MR && (cb->wlat || cb->rlat || cb->bw)) {
2284		PRINTF(cb, "wlat, rlat, and bw tests only support mem_mode MR\n");
2285		ret = -EINVAL;
2286		goto out;
2287	}
2288
2289	cb->cm_id = rdma_create_id(krping_cma_event_handler, cb, RDMA_PS_TCP);
2290	if (IS_ERR(cb->cm_id)) {
2291		ret = PTR_ERR(cb->cm_id);
2292		PRINTF(cb, "rdma_create_id error %d\n", ret);
2293		goto out;
2294	}
2295	DEBUG_LOG(cb, "created cm_id %p\n", cb->cm_id);
2296
2297	if (cb->server)
2298		krping_run_server(cb);
2299	else
2300		krping_run_client(cb);
2301
2302	DEBUG_LOG(cb, "destroy cm_id %p\n", cb->cm_id);
2303	rdma_destroy_id(cb->cm_id);
2304out:
2305	mutex_lock(&krping_mutex);
2306	list_del(&cb->list);
2307	mutex_unlock(&krping_mutex);
2308	kfree(cb);
2309	return ret;
2310}
2311
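/*
 * Call f for every active krping run, passing its statistics (or NULL if
 * the run has not allocated a PD yet), with krping_mutex held.
 */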
2312void
2313krping_walk_cb_list(void (*f)(struct krping_stats *, void *), void *arg)
2314{
2315	struct krping_cb *cb;
2316
2317	mutex_lock(&krping_mutex);
2318	list_for_each_entry(cb, &krping_cbs, list)
2319	    (*f)(cb->pd ? &cb->stats : NULL, arg);
2320	mutex_unlock(&krping_mutex);
2321}
2322
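/*
 * Module load-time initialization: set up the mutex protecting the list
 * of running krping threads.
 */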
2323void krping_init(void)
2324{
2325
2326	mutex_init(&krping_mutex);
2327}
2328