/*
 * Copyright (c) 2006 Intel Corporation.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/dma-mapping.h>
#include <linux/err.h>
#include <linux/interrupt.h>
#include <linux/rbtree.h>
#include <linux/mutex.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
#include <linux/miscdevice.h>
#include <linux/random.h>

#include <rdma/ib_cache.h>
#include <rdma/ib_sa.h>
#include "sa.h"

MODULE_AUTHOR("Sean Hefty");
MODULE_DESCRIPTION("InfiniBand subnet administration caching");
MODULE_LICENSE("Dual BSD/GPL");

enum {
	SA_DB_MAX_PATHS_PER_DEST = 0x7F,
	SA_DB_MIN_RETRY_TIMER	 = 4000,  /*   4 sec */
	SA_DB_MAX_RETRY_TIMER	 = 256000 /* 256 sec */
};

static int set_paths_per_dest(const char *val, struct kernel_param *kp);
static unsigned long paths_per_dest;
module_param_call(paths_per_dest, set_paths_per_dest, param_get_ulong,
		  &paths_per_dest, 0644);
MODULE_PARM_DESC(paths_per_dest, "Maximum number of paths to retrieve "
				 "to each destination (DGID).  Set to 0 "
				 "to disable cache.");

static int set_subscribe_inform_info(const char *val, struct kernel_param *kp);
static char subscribe_inform_info = 1;
module_param_call(subscribe_inform_info, set_subscribe_inform_info,
		  param_get_bool, &subscribe_inform_info, 0644);
MODULE_PARM_DESC(subscribe_inform_info,
		 "Subscribe for SA InformInfo/Notice events.");

static int do_refresh(const char *val, struct kernel_param *kp);
module_param_call(refresh, do_refresh, NULL, NULL, 0200);

static unsigned long retry_timer = SA_DB_MIN_RETRY_TIMER;

enum sa_db_lookup_method {
	SA_DB_LOOKUP_LEAST_USED,
	SA_DB_LOOKUP_RANDOM
};

static int set_lookup_method(const char *val, struct kernel_param *kp);
static int get_lookup_method(char *buf, struct kernel_param *kp);
static unsigned long lookup_method;
module_param_call(lookup_method, set_lookup_method, get_lookup_method,
		  &lookup_method, 0644);
MODULE_PARM_DESC(lookup_method, "Method used to return path records when "
				"multiple paths exist to a given destination.");

static void sa_db_add_dev(struct ib_device *device);
static void sa_db_remove_dev(struct ib_device *device);

static struct ib_client sa_db_client = {
	.name   = "local_sa",
	.add    = sa_db_add_dev,
	.remove = sa_db_remove_dev
};

static LIST_HEAD(dev_list);
static DEFINE_MUTEX(lock);
static rwlock_t rwlock;
static struct workqueue_struct *sa_wq;
static struct ib_sa_client sa_client;

enum sa_db_state {
	SA_DB_IDLE,
	SA_DB_REFRESH,
	SA_DB_DESTROY
};

struct sa_db_port {
	struct sa_db_device	*dev;
	struct ib_mad_agent	*agent;
	/* Limit number of outstanding MADs to SA to reduce SA flooding */
	struct ib_mad_send_buf	*msg;
	u16			sm_lid;
	u8			sm_sl;
	struct ib_inform_info	*in_info;
	struct ib_inform_info	*out_info;
	struct rb_root		paths;
	struct list_head	update_list;
	unsigned long		update_id;
	enum sa_db_state	state;
	struct work_struct	work;
	union ib_gid		gid;
	int			port_num;
};

struct sa_db_device {
	struct list_head	list;
	struct ib_device	*device;
	struct ib_event_handler event_handler;
	int			start_port;
	int			port_count;
	struct sa_db_port	port[0];
};

struct ib_sa_iterator {
	struct ib_sa_iterator	*next;
};

struct ib_sa_attr_iter {
	struct ib_sa_iterator	*iter;
	unsigned long		flags;
};

struct ib_sa_attr_list {
	struct ib_sa_iterator	iter;
	struct ib_sa_iterator	*tail;
	int			update_id;
	union ib_gid		gid;
	struct rb_node		node;
};

struct ib_path_rec_info {
	struct ib_sa_iterator	iter; /* keep first */
	struct ib_sa_path_rec	rec;
	unsigned long		lookups;
};

struct ib_sa_mad_iter {
	struct ib_mad_recv_wc	*recv_wc;
	struct ib_mad_recv_buf	*recv_buf;
	int			attr_size;
	int			attr_offset;
	int			data_offset;
	int			data_left;
	void			*attr;
	u8			attr_data[0];
};

enum sa_update_type {
	SA_UPDATE_FULL,
	SA_UPDATE_ADD,
	SA_UPDATE_REMOVE
};

struct update_info {
	struct list_head	list;
	union ib_gid		gid;
	enum sa_update_type	type;
};

struct sa_path_request {
	struct work_struct	work;
	struct ib_sa_client	*client;
	void			(*callback)(int, struct ib_sa_path_rec *, void *);
	void			*context;
	struct ib_sa_path_rec	path_rec;
};

static void process_updates(struct sa_db_port *port);

static void free_attr_list(struct ib_sa_attr_list *attr_list)
{
	struct ib_sa_iterator *cur;

	for (cur = attr_list->iter.next; cur; cur = attr_list->iter.next) {
		attr_list->iter.next = cur->next;
		kfree(cur);
	}
	attr_list->tail = &attr_list->iter;
}

static void remove_attr(struct rb_root *root, struct ib_sa_attr_list *attr_list)
{
	rb_erase(&attr_list->node, root);
	free_attr_list(attr_list);
	kfree(attr_list);
}

static void remove_all_attrs(struct rb_root *root)
{
	struct rb_node *node, *next_node;
	struct ib_sa_attr_list *attr_list;

	write_lock_irq(&rwlock);
	for (node = rb_first(root); node; node = next_node) {
		next_node = rb_next(node);
		attr_list = rb_entry(node, struct ib_sa_attr_list, node);
		remove_attr(root, attr_list);
	}
	write_unlock_irq(&rwlock);
}

static void remove_old_attrs(struct rb_root *root, unsigned long update_id)
{
	struct rb_node *node, *next_node;
	struct ib_sa_attr_list *attr_list;

	write_lock_irq(&rwlock);
	for (node = rb_first(root); node; node = next_node) {
		next_node = rb_next(node);
		attr_list = rb_entry(node, struct ib_sa_attr_list, node);
		if (attr_list->update_id != update_id)
			remove_attr(root, attr_list);
	}
	write_unlock_irq(&rwlock);
}

static struct ib_sa_attr_list *insert_attr_list(struct rb_root *root,
						struct ib_sa_attr_list *attr_list)
{
	struct rb_node **link = &root->rb_node;
	struct rb_node *parent = NULL;
	struct ib_sa_attr_list *cur_attr_list;
	int cmp;

	while (*link) {
		parent = *link;
		cur_attr_list = rb_entry(parent, struct ib_sa_attr_list, node);
		cmp = memcmp(&cur_attr_list->gid, &attr_list->gid,
			     sizeof attr_list->gid);
		if (cmp < 0)
			link = &(*link)->rb_left;
		else if (cmp > 0)
			link = &(*link)->rb_right;
		else
			return cur_attr_list;
	}
	rb_link_node(&attr_list->node, parent, link);
	rb_insert_color(&attr_list->node, root);
	return NULL;
}

static struct ib_sa_attr_list *find_attr_list(struct rb_root *root, u8 *gid)
{
	struct rb_node *node = root->rb_node;
	struct ib_sa_attr_list *attr_list;
	int cmp;

	while (node) {
		attr_list = rb_entry(node, struct ib_sa_attr_list, node);
		cmp = memcmp(&attr_list->gid, gid, sizeof attr_list->gid);
		if (cmp < 0)
			node = node->rb_left;
		else if (cmp > 0)
			node = node->rb_right;
		else
			return attr_list;
	}
	return NULL;
}

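/*
 * Add a path record iterator to the attribute list for its DGID,
 * creating the list if this is the first path to that destination.
 * The rwlock is dropped around the allocation, so the tree is
 * re-checked under the lock by insert_attr_list() in case another
 * thread inserted a list for the same GID in the meantime.  A list
 * carried over from an older update generation is emptied before use.
 */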
static int insert_attr(struct rb_root *root, unsigned long update_id, void *key,
		       struct ib_sa_iterator *iter)
{
	struct ib_sa_attr_list *attr_list, *dup;

	write_lock_irq(&rwlock);
	attr_list = find_attr_list(root, key);
	if (!attr_list) {
		write_unlock_irq(&rwlock);
		attr_list = kmalloc(sizeof *attr_list, GFP_KERNEL);
		if (!attr_list)
			return -ENOMEM;

		attr_list->iter.next = NULL;
		attr_list->tail = &attr_list->iter;
		attr_list->update_id = update_id;
		memcpy(attr_list->gid.raw, key, sizeof attr_list->gid);

		write_lock_irq(&rwlock);
		dup = insert_attr_list(root, attr_list);
		if (dup) {
			/* Lost a race while unlocked; use the existing list. */
			kfree(attr_list);
			attr_list = dup;
		}
	}
	if (attr_list->update_id != update_id) {
		free_attr_list(attr_list);
		attr_list->update_id = update_id;
	}

	attr_list->tail->next = iter;
	iter->next = NULL;
	attr_list->tail = iter;
	write_unlock_irq(&rwlock);
	return 0;
}

static struct ib_sa_mad_iter *ib_sa_iter_create(struct ib_mad_recv_wc *mad_recv_wc)
{
	struct ib_sa_mad_iter *iter;
	struct ib_sa_mad *mad = (struct ib_sa_mad *) mad_recv_wc->recv_buf.mad;
	int attr_size, attr_offset;

	attr_offset = be16_to_cpu(mad->sa_hdr.attr_offset) * 8;
	attr_size = 64;		/* path record length */
	if (attr_offset < attr_size)
		return ERR_PTR(-EINVAL);

	iter = kzalloc(sizeof *iter + attr_size, GFP_KERNEL);
	if (!iter)
		return ERR_PTR(-ENOMEM);

	iter->data_left = mad_recv_wc->mad_len - IB_MGMT_SA_HDR;
	iter->recv_wc = mad_recv_wc;
	iter->recv_buf = &mad_recv_wc->recv_buf;
	iter->attr_offset = attr_offset;
	iter->attr_size = attr_size;
	return iter;
}

static void ib_sa_iter_free(struct ib_sa_mad_iter *iter)
{
	kfree(iter);
}

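/*
 * Return the next attribute in the response.  Attributes are packed
 * at attr_offset intervals and may straddle two receive buffers, in
 * which case the two pieces are reassembled into iter->attr_data.
 */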
static void *ib_sa_iter_next(struct ib_sa_mad_iter *iter)
{
	struct ib_sa_mad *mad;
	int left, offset = 0;

	while (iter->data_left >= iter->attr_offset) {
		while (iter->data_offset < IB_MGMT_SA_DATA) {
			mad = (struct ib_sa_mad *) iter->recv_buf->mad;

			left = IB_MGMT_SA_DATA - iter->data_offset;
			if (left < iter->attr_size) {
				/* copy first piece of the attribute */
				iter->attr = &iter->attr_data;
				memcpy(iter->attr,
				       &mad->data[iter->data_offset], left);
				offset = left;
				break;
			} else if (offset) {
				/* copy the second piece of the attribute */
				memcpy(iter->attr + offset, &mad->data[0],
				       iter->attr_size - offset);
				iter->data_offset = iter->attr_size - offset;
				offset = 0;
			} else {
				iter->attr = &mad->data[iter->data_offset];
				iter->data_offset += iter->attr_size;
			}

			iter->data_left -= iter->attr_offset;
			goto out;
		}
		iter->data_offset = 0;
		iter->recv_buf = list_entry(iter->recv_buf->list.next,
					    struct ib_mad_recv_buf, list);
	}
	iter->attr = NULL;
out:
	return iter->attr;
}

/*
 * Copy path records from a received response and insert them into our
 * cache.  Path records in the MADs are in network order, packed, and
 * may span multiple MAD buffers, just to make our life hard.
 */
static void update_path_db(struct sa_db_port *port,
			   struct ib_mad_recv_wc *mad_recv_wc,
			   enum sa_update_type type)
{
	struct ib_sa_mad_iter *iter;
	struct ib_path_rec_info *path_info;
	void *attr;
	int ret;

	iter = ib_sa_iter_create(mad_recv_wc);
	if (IS_ERR(iter))
		return;

	port->update_id += (type == SA_UPDATE_FULL);

	while ((attr = ib_sa_iter_next(iter)) &&
	       (path_info = kmalloc(sizeof *path_info, GFP_KERNEL))) {
		ib_sa_unpack_attr(&path_info->rec, attr, IB_SA_ATTR_PATH_REC);

		ret = insert_attr(&port->paths, port->update_id,
				  path_info->rec.dgid.raw, &path_info->iter);
		if (ret) {
			kfree(path_info);
			break;
		}
	}
	ib_sa_iter_free(iter);

	if (type == SA_UPDATE_FULL)
		remove_old_attrs(&port->paths, port->update_id);
}

static struct ib_mad_send_buf *get_sa_msg(struct sa_db_port *port,
					  struct update_info *update)
{
	struct ib_ah_attr ah_attr;
	struct ib_mad_send_buf *msg;

	msg = ib_create_send_mad(port->agent, 1, 0, 0, IB_MGMT_SA_HDR,
				 IB_MGMT_SA_DATA, GFP_KERNEL);
	if (IS_ERR(msg))
		return NULL;

	memset(&ah_attr, 0, sizeof ah_attr);
	ah_attr.dlid = port->sm_lid;
	ah_attr.sl = port->sm_sl;
	ah_attr.port_num = port->port_num;

	msg->ah = ib_create_ah(port->agent->qp->pd, &ah_attr);
	if (IS_ERR(msg->ah)) {
		ib_free_send_mad(msg);
		return NULL;
	}

	msg->timeout_ms = retry_timer;
	msg->retries = 0;
	msg->context[0] = port;
	msg->context[1] = update;
	return msg;
}

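/*
 * Build a 64-bit transaction ID from the agent's unique hi_tid in the
 * upper 32 bits and a monotonically increasing counter in the lower.
 */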
static __be64 form_tid(u32 hi_tid)
{
	static atomic_t tid;
	return cpu_to_be64((((u64) hi_tid) << 32) |
			   ((u32) atomic_inc_return(&tid)));
}

static void format_path_req(struct sa_db_port *port,
			    struct update_info *update,
			    struct ib_mad_send_buf *msg)
{
	struct ib_sa_mad *mad = msg->mad;
	struct ib_sa_path_rec path_rec;

	mad->mad_hdr.base_version  = IB_MGMT_BASE_VERSION;
	mad->mad_hdr.mgmt_class	   = IB_MGMT_CLASS_SUBN_ADM;
	mad->mad_hdr.class_version = IB_SA_CLASS_VERSION;
	mad->mad_hdr.method	   = IB_SA_METHOD_GET_TABLE;
	mad->mad_hdr.attr_id	   = cpu_to_be16(IB_SA_ATTR_PATH_REC);
	mad->mad_hdr.tid	   = form_tid(msg->mad_agent->hi_tid);

	mad->sa_hdr.comp_mask = IB_SA_PATH_REC_SGID | IB_SA_PATH_REC_NUMB_PATH;

	path_rec.sgid = port->gid;
	path_rec.numb_path = (u8) paths_per_dest;

	if (update->type == SA_UPDATE_ADD) {
		mad->sa_hdr.comp_mask |= IB_SA_PATH_REC_DGID;
		memcpy(&path_rec.dgid, &update->gid, sizeof path_rec.dgid);
	}

	ib_sa_pack_attr(mad->data, &path_rec, IB_SA_ATTR_PATH_REC);
}

static int send_query(struct sa_db_port *port,
		      struct update_info *update)
{
	int ret;

	port->msg = get_sa_msg(port, update);
	if (!port->msg)
		return -ENOMEM;

	format_path_req(port, update, port->msg);

	ret = ib_post_send_mad(port->msg, NULL);
	if (ret)
		goto err;

	return 0;

err:
	ib_destroy_ah(port->msg->ah);
	ib_free_send_mad(port->msg);
	return ret;
}

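/* Caller must hold 'lock'.  Kicks off update processing if the port is idle. */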
static void add_update(struct sa_db_port *port, u8 *gid,
		       enum sa_update_type type)
{
	struct update_info *update;

	update = kmalloc(sizeof *update, GFP_KERNEL);
	if (update) {
		if (gid)
			memcpy(&update->gid, gid, sizeof update->gid);
		update->type = type;
		list_add(&update->list, &port->update_list);
	}

	if (port->state == SA_DB_IDLE) {
		port->state = SA_DB_REFRESH;
		process_updates(port);
	}
}

static void clean_update_list(struct sa_db_port *port)
{
	struct update_info *update;

	while (!list_empty(&port->update_list)) {
		update = list_entry(port->update_list.next,
				    struct update_info, list);
		list_del(&update->list);
		kfree(update);
	}
}

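/*
 * Handle a GID in-service/out-of-service notice, or the completion
 * status of the InformInfo registration itself (notice == NULL).
 * In-service traps queue a path query for the reported GID;
 * out-of-service traps remove its cached paths.
 */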
static int notice_handler(int status, struct ib_inform_info *info,
			  struct ib_sa_notice *notice)
{
	struct sa_db_port *port = info->context;
	struct ib_sa_notice_data_gid *gid_data;
	struct ib_inform_info **pinfo;
	enum sa_update_type type;

	if (info->trap_number == IB_SA_SM_TRAP_GID_IN_SERVICE) {
		pinfo = &port->in_info;
		type = SA_UPDATE_ADD;
	} else {
		pinfo = &port->out_info;
		type = SA_UPDATE_REMOVE;
	}

	mutex_lock(&lock);
	if (port->state == SA_DB_DESTROY || !*pinfo) {
		mutex_unlock(&lock);
		return 0;
	}

	if (notice) {
		gid_data = (struct ib_sa_notice_data_gid *)
			   &notice->data_details;
		add_update(port, gid_data->gid, type);
		mutex_unlock(&lock);
	} else if (status == -ENETRESET) {
		*pinfo = NULL;
		mutex_unlock(&lock);
	} else {
		if (status)
			*pinfo = ERR_PTR(-EINVAL);
		port->state = SA_DB_IDLE;
		clean_update_list(port);
		mutex_unlock(&lock);
		queue_work(sa_wq, &port->work);
	}

	return status;
}

static int reg_in_info(struct sa_db_port *port)
{
	int ret = 0;

	port->in_info = ib_sa_register_inform_info(&sa_client,
						   port->dev->device,
						   port->port_num,
						   IB_SA_SM_TRAP_GID_IN_SERVICE,
						   GFP_KERNEL, notice_handler,
						   port);
	if (IS_ERR(port->in_info))
		ret = PTR_ERR(port->in_info);

	return ret;
}

static int reg_out_info(struct sa_db_port *port)
{
	int ret = 0;

	port->out_info = ib_sa_register_inform_info(&sa_client,
						    port->dev->device,
						    port->port_num,
						    IB_SA_SM_TRAP_GID_OUT_OF_SERVICE,
						    GFP_KERNEL, notice_handler,
						    port);
	if (IS_ERR(port->out_info))
		ret = PTR_ERR(port->out_info);

	return ret;
}

static void unsubscribe_port(struct sa_db_port *port)
{
	if (port->in_info && !IS_ERR(port->in_info))
		ib_sa_unregister_inform_info(port->in_info);

	if (port->out_info && !IS_ERR(port->out_info))
		ib_sa_unregister_inform_info(port->out_info);

	port->out_info = NULL;
	port->in_info = NULL;
}

static void cleanup_port(struct sa_db_port *port)
{
	unsubscribe_port(port);

	clean_update_list(port);
	remove_all_attrs(&port->paths);
}

static int update_port_info(struct sa_db_port *port)
{
	struct ib_port_attr port_attr;
	int ret;

	ret = ib_query_port(port->dev->device, port->port_num, &port_attr);
	if (ret)
		return ret;

	if (port_attr.state != IB_PORT_ACTIVE)
		return -ENODATA;

	port->sm_lid = port_attr.sm_lid;
	port->sm_sl = port_attr.sm_sl;
	return 0;
}

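/*
 * Drive the port's update state machine: make sure event subscriptions
 * match the module parameters, then issue one SA query at a time from
 * the update list.  Called with 'lock' held; the port stays in
 * SA_DB_REFRESH until the list drains or a step must wait on the SA.
 */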
static void process_updates(struct sa_db_port *port)
{
	struct update_info *update;
	struct ib_sa_attr_list *attr_list;
	int ret;

	if (!paths_per_dest || update_port_info(port)) {
		cleanup_port(port);
		goto out;
	}

	/* Event registration is an optimization, so ignore failures. */
	if (subscribe_inform_info) {
		if (!port->out_info) {
			ret = reg_out_info(port);
			if (!ret)
				return;
		}

		if (!port->in_info) {
			ret = reg_in_info(port);
			if (!ret)
				return;
		}
	} else
		unsubscribe_port(port);

	while (!list_empty(&port->update_list)) {
		update = list_entry(port->update_list.next,
				    struct update_info, list);

		if (update->type == SA_UPDATE_REMOVE) {
			write_lock_irq(&rwlock);
			attr_list = find_attr_list(&port->paths,
						   update->gid.raw);
			if (attr_list)
				remove_attr(&port->paths, attr_list);
			write_unlock_irq(&rwlock);
		} else {
			ret = send_query(port, update);
			if (!ret)
				return;
		}
		list_del(&update->list);
		kfree(update);
	}
out:
	port->state = SA_DB_IDLE;
}

static void refresh_port_db(struct sa_db_port *port)
{
	if (port->state == SA_DB_DESTROY)
		return;

	if (port->state == SA_DB_REFRESH) {
		clean_update_list(port);
		ib_cancel_mad(port->agent, port->msg);
	}

	add_update(port, NULL, SA_UPDATE_FULL);
}

static void refresh_dev_db(struct sa_db_device *dev)
{
	int i;

	for (i = 0; i < dev->port_count; i++)
		refresh_port_db(&dev->port[i]);
}

static void refresh_db(void)
{
	struct sa_db_device *dev;

	list_for_each_entry(dev, &dev_list, list)
		refresh_dev_db(dev);
}

static int do_refresh(const char *val, struct kernel_param *kp)
{
	mutex_lock(&lock);
	refresh_db();
	mutex_unlock(&lock);
	return 0;
}

static int get_lookup_method(char *buf, struct kernel_param *kp)
{
	return sprintf(buf,
		       "%c %d round robin\n"
		       "%c %d random",
		       (lookup_method == SA_DB_LOOKUP_LEAST_USED) ? '*' : ' ',
		       SA_DB_LOOKUP_LEAST_USED,
		       (lookup_method == SA_DB_LOOKUP_RANDOM) ? '*' : ' ',
		       SA_DB_LOOKUP_RANDOM);
}

static int set_lookup_method(const char *val, struct kernel_param *kp)
{
	unsigned long method;
	int ret = 0;

	method = simple_strtoul(val, NULL, 0);

	switch (method) {
	case SA_DB_LOOKUP_LEAST_USED:
	case SA_DB_LOOKUP_RANDOM:
		lookup_method = method;
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

static int set_paths_per_dest(const char *val, struct kernel_param *kp)
{
	int ret;

	mutex_lock(&lock);
	ret = param_set_ulong(val, kp);
	if (ret)
		goto out;

	if (paths_per_dest > SA_DB_MAX_PATHS_PER_DEST)
		paths_per_dest = SA_DB_MAX_PATHS_PER_DEST;
	refresh_db();
out:
	mutex_unlock(&lock);
	return ret;
}

static int set_subscribe_inform_info(const char *val, struct kernel_param *kp)
{
	int ret;

	ret = param_set_bool(val, kp);
	if (ret)
		return ret;

	return do_refresh(val, kp);
}

static void port_work_handler(struct work_struct *work)
{
	struct sa_db_port *port;

	port = container_of(work, typeof(*port), work);
	mutex_lock(&lock);
	refresh_port_db(port);
	mutex_unlock(&lock);
}

static void handle_event(struct ib_event_handler *event_handler,
			 struct ib_event *event)
{
	struct sa_db_device *dev;
	struct sa_db_port *port;

	dev = container_of(event_handler, typeof(*dev), event_handler);
	port = &dev->port[event->element.port_num - dev->start_port];

	switch (event->event) {
	case IB_EVENT_PORT_ERR:
	case IB_EVENT_LID_CHANGE:
	case IB_EVENT_SM_CHANGE:
	case IB_EVENT_CLIENT_REREGISTER:
	case IB_EVENT_PKEY_CHANGE:
	case IB_EVENT_PORT_ACTIVE:
		queue_work(sa_wq, &port->work);
		break;
	default:
		break;
	}
}

static void ib_free_path_iter(struct ib_sa_attr_iter *iter)
{
	read_unlock_irqrestore(&rwlock, iter->flags);
}

static int ib_create_path_iter(struct ib_device *device, u8 port_num,
			       union ib_gid *dgid, struct ib_sa_attr_iter *iter)
{
	struct sa_db_device *dev;
	struct sa_db_port *port;
	struct ib_sa_attr_list *list;

	dev = ib_get_client_data(device, &sa_db_client);
	if (!dev)
		return -ENODEV;

	port = &dev->port[port_num - dev->start_port];

	read_lock_irqsave(&rwlock, iter->flags);
	list = find_attr_list(&port->paths, dgid->raw);
	if (!list) {
		ib_free_path_iter(iter);
		return -ENODATA;
	}

	iter->iter = &list->iter;
	return 0;
}

static struct ib_sa_path_rec *ib_get_next_path(struct ib_sa_attr_iter *iter)
{
	struct ib_path_rec_info *next_path;

	iter->iter = iter->iter->next;
	if (iter->iter) {
		next_path = container_of(iter->iter, struct ib_path_rec_info, iter);
		return &next_path->rec;
	} else
		return NULL;
}

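/*
 * Compare a cached path (src) against a requested path (dst), checking
 * only the fields selected by comp_mask.  Returns 0 on a match.
 */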
static int cmp_rec(struct ib_sa_path_rec *src,
		   struct ib_sa_path_rec *dst, ib_sa_comp_mask comp_mask)
{
	/* DGID check already done */
	if (comp_mask & IB_SA_PATH_REC_SGID &&
	    memcmp(&src->sgid, &dst->sgid, sizeof src->sgid))
		return -EINVAL;
	if (comp_mask & IB_SA_PATH_REC_DLID && src->dlid != dst->dlid)
		return -EINVAL;
	if (comp_mask & IB_SA_PATH_REC_SLID && src->slid != dst->slid)
		return -EINVAL;
	if (comp_mask & IB_SA_PATH_REC_RAW_TRAFFIC &&
	    src->raw_traffic != dst->raw_traffic)
		return -EINVAL;

	if (comp_mask & IB_SA_PATH_REC_FLOW_LABEL &&
	    src->flow_label != dst->flow_label)
		return -EINVAL;
	if (comp_mask & IB_SA_PATH_REC_HOP_LIMIT &&
	    src->hop_limit != dst->hop_limit)
		return -EINVAL;
	if (comp_mask & IB_SA_PATH_REC_TRAFFIC_CLASS &&
	    src->traffic_class != dst->traffic_class)
		return -EINVAL;
	if (comp_mask & IB_SA_PATH_REC_REVERSIBLE &&
	    dst->reversible && !src->reversible)
		return -EINVAL;
	/* Numb path check already done */
	if (comp_mask & IB_SA_PATH_REC_PKEY && src->pkey != dst->pkey)
		return -EINVAL;

	if (comp_mask & IB_SA_PATH_REC_SL && src->sl != dst->sl)
		return -EINVAL;

	if (ib_sa_check_selector(comp_mask, IB_SA_PATH_REC_MTU_SELECTOR,
				 IB_SA_PATH_REC_MTU, dst->mtu_selector,
				 src->mtu, dst->mtu))
		return -EINVAL;
	if (ib_sa_check_selector(comp_mask, IB_SA_PATH_REC_RATE_SELECTOR,
				 IB_SA_PATH_REC_RATE, dst->rate_selector,
				 src->rate, dst->rate))
		return -EINVAL;
	if (ib_sa_check_selector(comp_mask,
				 IB_SA_PATH_REC_PACKET_LIFE_TIME_SELECTOR,
				 IB_SA_PATH_REC_PACKET_LIFE_TIME,
				 dst->packet_life_time_selector,
				 src->packet_life_time, dst->packet_life_time))
		return -EINVAL;

	return 0;
}

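/*
 * Reservoir sampling with k = 1: each matching path replaces the
 * current pick with probability 1/count, so every match is returned
 * with equal likelihood without knowing the total in advance.
 */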
static struct ib_sa_path_rec *get_random_path(struct ib_sa_attr_iter *iter,
					      struct ib_sa_path_rec *req_path,
					      ib_sa_comp_mask comp_mask)
{
	struct ib_sa_path_rec *path, *rand_path = NULL;
	int num, count = 0;

	for (path = ib_get_next_path(iter); path;
	     path = ib_get_next_path(iter)) {
		if (!cmp_rec(path, req_path, comp_mask)) {
			get_random_bytes(&num, sizeof num);
			if ((num % ++count) == 0)
				rand_path = path;
		}
	}

	return rand_path;
}

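/*
 * Least-used selection: return the matching path with the fewest
 * lookups, incrementing its counter so repeated queries rotate
 * through the available paths.
 */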
static struct ib_sa_path_rec *get_next_path(struct ib_sa_attr_iter *iter,
					    struct ib_sa_path_rec *req_path,
					    ib_sa_comp_mask comp_mask)
{
	struct ib_path_rec_info *cur_path, *next_path = NULL;
	struct ib_sa_path_rec *path;
	unsigned long lookups = ~0;

	for (path = ib_get_next_path(iter); path;
	     path = ib_get_next_path(iter)) {
		if (!cmp_rec(path, req_path, comp_mask)) {
			cur_path = container_of(iter->iter, struct ib_path_rec_info,
						iter);
			if (cur_path->lookups < lookups) {
				lookups = cur_path->lookups;
				next_path = cur_path;
			}
		}
	}

	if (next_path) {
		next_path->lookups++;
		return &next_path->rec;
	} else
		return NULL;
}

static void report_path(struct work_struct *work)
{
	struct sa_path_request *req;

	req = container_of(work, struct sa_path_request, work);
	req->callback(0, &req->path_rec, req->context);
	ib_sa_client_put(req->client);
	kfree(req);
}

/**
 * ib_sa_path_rec_get - Start a Path get query
 * @client:SA client
 * @device:device to send query on
 * @port_num: port number to send query on
 * @rec:Path Record to send in query
 * @comp_mask:component mask to send in query
 * @timeout_ms:time to wait for response
 * @gfp_mask:GFP mask to use for internal allocations
 * @callback:function called when query completes, times out or is
 * canceled
 * @context:opaque user context passed to callback
 * @sa_query:query context, used to cancel query
 *
 * Send a Path Record Get query to the SA to look up a path.  The
 * callback function will be called when the query completes (or
 * fails); status is 0 for a successful response, -EINTR if the query
 * is canceled, -ETIMEDOUT if the query timed out, or -EIO if an error
 * occurred sending the query.  The resp parameter of the callback is
 * only valid if status is 0.
 *
 * If the return value of ib_sa_path_rec_get() is negative, it is an
 * error code.  Otherwise it is a query ID that can be used to cancel
 * the query.
 */
int ib_sa_path_rec_get(struct ib_sa_client *client,
		       struct ib_device *device, u8 port_num,
		       struct ib_sa_path_rec *rec,
		       ib_sa_comp_mask comp_mask,
		       int timeout_ms, gfp_t gfp_mask,
		       void (*callback)(int status,
					struct ib_sa_path_rec *resp,
					void *context),
		       void *context,
		       struct ib_sa_query **sa_query)
{
	struct sa_path_request *req;
	struct ib_sa_attr_iter iter;
	struct ib_sa_path_rec *path_rec;
	int ret;

	if (!paths_per_dest)
		goto query_sa;

	if (!(comp_mask & IB_SA_PATH_REC_DGID) ||
	    !(comp_mask & IB_SA_PATH_REC_NUMB_PATH) || rec->numb_path != 1)
		goto query_sa;

	req = kmalloc(sizeof *req, gfp_mask);
	if (!req)
		goto query_sa;

	ret = ib_create_path_iter(device, port_num, &rec->dgid, &iter);
	if (ret)
		goto free_req;

	if (lookup_method == SA_DB_LOOKUP_RANDOM)
		path_rec = get_random_path(&iter, rec, comp_mask);
	else
		path_rec = get_next_path(&iter, rec, comp_mask);

	if (!path_rec)
		goto free_iter;

	memcpy(&req->path_rec, path_rec, sizeof *path_rec);
	ib_free_path_iter(&iter);

	INIT_WORK(&req->work, report_path);
	req->client = client;
	req->callback = callback;
	req->context = context;

	ib_sa_client_get(client);
	queue_work(sa_wq, &req->work);
	*sa_query = ERR_PTR(-EEXIST);
	return 0;

free_iter:
	ib_free_path_iter(&iter);
free_req:
	kfree(req);
query_sa:
	return ib_sa_path_rec_query(client, device, port_num, rec, comp_mask,
				    timeout_ms, gfp_mask, callback, context,
				    sa_query);
}
EXPORT_SYMBOL(ib_sa_path_rec_get);

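/*
 * A query completed.  Only consume the response if it still belongs to
 * the update at the head of the list; stale responses are dropped.
 */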
static void recv_handler(struct ib_mad_agent *mad_agent,
			 struct ib_mad_recv_wc *mad_recv_wc)
{
	struct sa_db_port *port;
	struct update_info *update;
	struct ib_mad_send_buf *msg;
	enum sa_update_type type;

	msg = (struct ib_mad_send_buf *) (unsigned long) mad_recv_wc->wc->wr_id;
	port = msg->context[0];
	update = msg->context[1];

	mutex_lock(&lock);
	if (port->state == SA_DB_DESTROY ||
	    update != list_entry(port->update_list.next,
				 struct update_info, list)) {
		mutex_unlock(&lock);
	} else {
		type = update->type;
		mutex_unlock(&lock);
		update_path_db(mad_agent->context, mad_recv_wc, type);
	}

	ib_free_recv_mad(mad_recv_wc);
}

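/*
 * Send completed.  On a response timeout the query is reposted with an
 * exponentially increasing timeout, up to SA_DB_MAX_RETRY_TIMER;
 * otherwise the head update is retired and the next one processed.
 */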
static void send_handler(struct ib_mad_agent *agent,
			 struct ib_mad_send_wc *mad_send_wc)
{
	struct ib_mad_send_buf *msg;
	struct sa_db_port *port;
	struct update_info *update;
	int ret;

	msg = mad_send_wc->send_buf;
	port = msg->context[0];
	update = msg->context[1];

	mutex_lock(&lock);
	if (port->state == SA_DB_DESTROY)
		goto unlock;

	if (update == list_entry(port->update_list.next,
				 struct update_info, list)) {
		if (mad_send_wc->status == IB_WC_RESP_TIMEOUT_ERR &&
		    msg->timeout_ms < SA_DB_MAX_RETRY_TIMER) {
			msg->timeout_ms <<= 1;
			ret = ib_post_send_mad(msg, NULL);
			if (!ret) {
				mutex_unlock(&lock);
				return;
			}
		}
		list_del(&update->list);
		kfree(update);
	}
	process_updates(port);
unlock:
	mutex_unlock(&lock);

	ib_destroy_ah(msg->ah);
	ib_free_send_mad(msg);
}

static int init_port(struct sa_db_device *dev, int port_num)
{
	struct sa_db_port *port;
	int ret;

	port = &dev->port[port_num - dev->start_port];
	port->dev = dev;
	port->port_num = port_num;
	INIT_WORK(&port->work, port_work_handler);
	port->paths = RB_ROOT;
	INIT_LIST_HEAD(&port->update_list);

	ret = ib_get_cached_gid(dev->device, port_num, 0, &port->gid);
	if (ret)
		return ret;

	port->agent = ib_register_mad_agent(dev->device, port_num, IB_QPT_GSI,
					    NULL, IB_MGMT_RMPP_VERSION,
					    send_handler, recv_handler, port);
	if (IS_ERR(port->agent))
		ret = PTR_ERR(port->agent);

	return ret;
}

static void destroy_port(struct sa_db_port *port)
{
	mutex_lock(&lock);
	port->state = SA_DB_DESTROY;
	mutex_unlock(&lock);

	ib_unregister_mad_agent(port->agent);
	cleanup_port(port);
	flush_workqueue(sa_wq);
}

static void sa_db_add_dev(struct ib_device *device)
{
	struct sa_db_device *dev;
	struct sa_db_port *port;
	int s, e, i, ret;

	if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
		return;

	if (device->node_type == RDMA_NODE_IB_SWITCH) {
		s = e = 0;
	} else {
		s = 1;
		e = device->phys_port_cnt;
	}

	dev = kzalloc(sizeof *dev + (e - s + 1) * sizeof *port, GFP_KERNEL);
	if (!dev)
		return;

	dev->start_port = s;
	dev->port_count = e - s + 1;
	dev->device = device;
	for (i = 0; i < dev->port_count; i++) {
		ret = init_port(dev, s + i);
		if (ret)
			goto err;
	}

	ib_set_client_data(device, &sa_db_client, dev);

	INIT_IB_EVENT_HANDLER(&dev->event_handler, device, handle_event);

	mutex_lock(&lock);
	list_add_tail(&dev->list, &dev_list);
	refresh_dev_db(dev);
	mutex_unlock(&lock);

	ib_register_event_handler(&dev->event_handler);
	return;
err:
	while (i--)
		destroy_port(&dev->port[i]);
	kfree(dev);
}

static void sa_db_remove_dev(struct ib_device *device)
{
	struct sa_db_device *dev;
	int i;

	dev = ib_get_client_data(device, &sa_db_client);
	if (!dev)
		return;

	ib_unregister_event_handler(&dev->event_handler);
	flush_workqueue(sa_wq);

	for (i = 0; i < dev->port_count; i++)
		destroy_port(&dev->port[i]);

	mutex_lock(&lock);
	list_del(&dev->list);
	mutex_unlock(&lock);

	kfree(dev);
}

int sa_db_init(void)
{
	int ret;

	rwlock_init(&rwlock);
	sa_wq = create_singlethread_workqueue("local_sa");
	if (!sa_wq)
		return -ENOMEM;

	ib_sa_register_client(&sa_client);
	ret = ib_register_client(&sa_db_client);
	if (ret)
		goto err;

	return 0;

err:
	ib_sa_unregister_client(&sa_client);
	destroy_workqueue(sa_wq);
	return ret;
}

void sa_db_cleanup(void)
{
	ib_unregister_client(&sa_db_client);
	ib_sa_unregister_client(&sa_client);
	destroy_workqueue(sa_wq);
}