// SPDX-License-Identifier: GPL-2.0
/*
 *  Block device elevator/IO-scheduler.
 *
 *  Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
 *
 * 30042000 Jens Axboe <axboe@kernel.dk> :
 *
 * Split the elevator a bit so that it is possible to choose a different
 * one or even write a new "plug in". There are three pieces:
 * - elevator_fn, inserts a new request in the queue list
 * - elevator_merge_fn, decides whether a new buffer can be merged with
 *   an existing request
 * - elevator_dequeue_fn, called when a request is taken off the active list
 *
 * 20082000 Dave Jones <davej@suse.de> :
 * Removed tests for max-bomb-segments, which was breaking elvtune
 *  when run without -bN
 *
 * Jens:
 * - Rework again to work with bio instead of buffer_heads
 * - lose bi_dev comparisons, partition handling is correct now
 * - completely modularize elevator setup and teardown
 *
 */
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/compiler.h>
#include <linux/blktrace_api.h>
#include <linux/hash.h>
#include <linux/uaccess.h>
#include <linux/pm_runtime.h>

#include <trace/events/block.h>

#include "elevator.h"
#include "blk.h"
#include "blk-mq-sched.h"
#include "blk-pm.h"
#include "blk-wbt.h"
#include "blk-cgroup.h"

static DEFINE_SPINLOCK(elv_list_lock);
static LIST_HEAD(elv_list);

/*
 * Merge hash support: requests are hashed by the sector at which they
 * end, so a bio can be tested for a back merge with a single lookup.
 */
#define rq_hash_key(rq)		(blk_rq_pos(rq) + blk_rq_sectors(rq))
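
/*
 * Worked example (illustrative): a request starting at sector 2048 with
 * 8 sectors hashes to key 2056, i.e. its end sector.  A later bio whose
 * bi_sector is 2056 hashes to the same key, so elv_rqhash_find() locates
 * the request the bio can be appended to (a back merge).
 */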

/*
 * Query the io scheduler to see if the bio being issued by the current
 * process may be merged with rq.
 */
static bool elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio)
{
	struct request_queue *q = rq->q;
	struct elevator_queue *e = q->elevator;

	if (e->type->ops.allow_merge)
		return e->type->ops.allow_merge(q, rq, bio);

	return true;
}

/*
 * can we safely merge with this request?
 */
bool elv_bio_merge_ok(struct request *rq, struct bio *bio)
{
	if (!blk_rq_merge_ok(rq, bio))
		return false;

	if (!elv_iosched_allow_bio_merge(rq, bio))
		return false;

	return true;
}
EXPORT_SYMBOL(elv_bio_merge_ok);

/**
 * elevator_match - Check whether @e's name or alias matches @name
 * @e: Scheduler to test
 * @name: Elevator name to test
 *
 * Return true if the elevator @e's name or alias matches @name.
 */
static bool elevator_match(const struct elevator_type *e, const char *name)
{
	return !strcmp(e->elevator_name, name) ||
		(e->elevator_alias && !strcmp(e->elevator_alias, name));
}

static struct elevator_type *__elevator_find(const char *name)
{
	struct elevator_type *e;

	list_for_each_entry(e, &elv_list, list)
		if (elevator_match(e, name))
			return e;
	return NULL;
}

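/*
 * Look up an elevator type by name (or alias) and take a reference on it.
 * Returns NULL if no such type is registered, or if the type is going
 * away (elevator_tryget() failed).  The caller must drop the reference
 * with elevator_put() when done.
 */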
static struct elevator_type *elevator_find_get(struct request_queue *q,
		const char *name)
{
	struct elevator_type *e;

	spin_lock(&elv_list_lock);
	e = __elevator_find(name);
	if (e && !elevator_tryget(e))
		e = NULL;
	spin_unlock(&elv_list_lock);
	return e;
}

static const struct kobj_type elv_ktype;

struct elevator_queue *elevator_alloc(struct request_queue *q,
				  struct elevator_type *e)
{
	struct elevator_queue *eq;

	eq = kzalloc_node(sizeof(*eq), GFP_KERNEL, q->node);
	if (unlikely(!eq))
		return NULL;

	__elevator_get(e);
	eq->type = e;
	kobject_init(&eq->kobj, &elv_ktype);
	mutex_init(&eq->sysfs_lock);
	hash_init(eq->hash);

	return eq;
}
EXPORT_SYMBOL(elevator_alloc);
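
/*
 * Illustrative sketch (not part of this file; names are hypothetical): a
 * scheduler's ->init_sched() callback typically allocates its
 * elevator_queue with elevator_alloc() and hangs its private state off
 * ->elevator_data, along the lines of:
 *
 *	static int foo_init_sched(struct request_queue *q,
 *				  struct elevator_type *e)
 *	{
 *		struct elevator_queue *eq = elevator_alloc(q, e);
 *
 *		if (!eq)
 *			return -ENOMEM;
 *
 *		eq->elevator_data = foo_alloc_data(q);	// hypothetical helper
 *		if (!eq->elevator_data) {
 *			kobject_put(&eq->kobj);	// drops the type ref too
 *			return -ENOMEM;
 *		}
 *		q->elevator = eq;
 *		return 0;
 *	}
 */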

static void elevator_release(struct kobject *kobj)
{
	struct elevator_queue *e;

	e = container_of(kobj, struct elevator_queue, kobj);
	elevator_put(e->type);
	kfree(e);
}

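/*
 * Tear down the current scheduler for @q: drop any io_cq's, free the
 * scheduler's requests, run the scheduler's exit hook, and drop the
 * kobject reference (which frees the elevator_queue via
 * elevator_release() once the last reference is gone).
 */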
void elevator_exit(struct request_queue *q)
{
	struct elevator_queue *e = q->elevator;

	ioc_clear_queue(q);
	blk_mq_sched_free_rqs(q);

	mutex_lock(&e->sysfs_lock);
	blk_mq_exit_sched(q, e);
	mutex_unlock(&e->sysfs_lock);

	kobject_put(&e->kobj);
}

static inline void __elv_rqhash_del(struct request *rq)
{
	hash_del(&rq->hash);
	rq->rq_flags &= ~RQF_HASHED;
}

void elv_rqhash_del(struct request_queue *q, struct request *rq)
{
	if (ELV_ON_HASH(rq))
		__elv_rqhash_del(rq);
}
EXPORT_SYMBOL_GPL(elv_rqhash_del);

void elv_rqhash_add(struct request_queue *q, struct request *rq)
{
	struct elevator_queue *e = q->elevator;

	BUG_ON(ELV_ON_HASH(rq));
	hash_add(e->hash, &rq->hash, rq_hash_key(rq));
	rq->rq_flags |= RQF_HASHED;
}
EXPORT_SYMBOL_GPL(elv_rqhash_add);

void elv_rqhash_reposition(struct request_queue *q, struct request *rq)
{
	__elv_rqhash_del(rq);
	elv_rqhash_add(q, rq);
}

struct request *elv_rqhash_find(struct request_queue *q, sector_t offset)
{
	struct elevator_queue *e = q->elevator;
	struct hlist_node *next;
	struct request *rq;

	hash_for_each_possible_safe(e->hash, rq, next, hash, offset) {
		BUG_ON(!ELV_ON_HASH(rq));

		if (unlikely(!rq_mergeable(rq))) {
			__elv_rqhash_del(rq);
			continue;
		}

		if (rq_hash_key(rq) == offset)
			return rq;
	}

	return NULL;
}

/*
 * RB-tree support functions for inserting/lookup/removal of requests
 * in a sorted RB tree.
 */
void elv_rb_add(struct rb_root *root, struct request *rq)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct request *__rq;

	while (*p) {
		parent = *p;
		__rq = rb_entry(parent, struct request, rb_node);

		if (blk_rq_pos(rq) < blk_rq_pos(__rq))
			p = &(*p)->rb_left;
		else if (blk_rq_pos(rq) >= blk_rq_pos(__rq))
			p = &(*p)->rb_right;
	}

	rb_link_node(&rq->rb_node, parent, p);
	rb_insert_color(&rq->rb_node, root);
}
EXPORT_SYMBOL(elv_rb_add);

void elv_rb_del(struct rb_root *root, struct request *rq)
{
	BUG_ON(RB_EMPTY_NODE(&rq->rb_node));
	rb_erase(&rq->rb_node, root);
	RB_CLEAR_NODE(&rq->rb_node);
}
EXPORT_SYMBOL(elv_rb_del);

struct request *elv_rb_find(struct rb_root *root, sector_t sector)
{
	struct rb_node *n = root->rb_node;
	struct request *rq;

	while (n) {
		rq = rb_entry(n, struct request, rb_node);

		if (sector < blk_rq_pos(rq))
			n = n->rb_left;
		else if (sector > blk_rq_pos(rq))
			n = n->rb_right;
		else
			return rq;
	}

	return NULL;
}
EXPORT_SYMBOL(elv_rb_find);
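
/*
 * Illustrative sketch (hypothetical scheduler "foo"): schedulers keep
 * requests position-sorted in such a tree and use elv_rb_find() to locate
 * a front merge candidate, i.e. a request starting exactly where the
 * incoming bio ends; mq-deadline's ->request_merge() follows this shape:
 *
 *	static int foo_request_merge(struct request_queue *q,
 *				     struct request **rq, struct bio *bio)
 *	{
 *		sector_t sector = bio_end_sector(bio);
 *		struct request *__rq;
 *
 *		// foo_sort_root() is a hypothetical per-queue tree accessor
 *		__rq = elv_rb_find(foo_sort_root(q), sector);
 *		if (__rq && elv_bio_merge_ok(__rq, bio)) {
 *			*rq = __rq;
 *			return ELEVATOR_FRONT_MERGE;
 *		}
 *		return ELEVATOR_NO_MERGE;
 *	}
 */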

enum elv_merge elv_merge(struct request_queue *q, struct request **req,
		struct bio *bio)
{
	struct elevator_queue *e = q->elevator;
	struct request *__rq;

	/*
	 * Levels of merges:
	 * 	nomerges:  No merges at all attempted
	 * 	noxmerges: Only simple one-hit cache try
	 * 	merges:	   All merge tries attempted
	 */
	if (blk_queue_nomerges(q) || !bio_mergeable(bio))
		return ELEVATOR_NO_MERGE;

	/*
	 * First try one-hit cache.
	 */
	if (q->last_merge && elv_bio_merge_ok(q->last_merge, bio)) {
		enum elv_merge ret = blk_try_merge(q->last_merge, bio);

		if (ret != ELEVATOR_NO_MERGE) {
			*req = q->last_merge;
			return ret;
		}
	}

	if (blk_queue_noxmerges(q))
		return ELEVATOR_NO_MERGE;

	/*
	 * See if our hash lookup can find a potential backmerge.
	 */
	__rq = elv_rqhash_find(q, bio->bi_iter.bi_sector);
	if (__rq && elv_bio_merge_ok(__rq, bio)) {
		*req = __rq;

		if (blk_discard_mergable(__rq))
			return ELEVATOR_DISCARD_MERGE;
		return ELEVATOR_BACK_MERGE;
	}

	if (e->type->ops.request_merge)
		return e->type->ops.request_merge(q, req, bio);

	return ELEVATOR_NO_MERGE;
}

/*
 * Attempt to do an insertion back merge. Only check for the case where
 * we can append 'rq' to an existing request, so we can throw 'rq' away
 * afterwards.
 *
 * Returns true if we merged, false otherwise. 'free' will contain all
 * requests that need to be freed.
 */
bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq,
			      struct list_head *free)
{
	struct request *__rq;
	bool ret;

	if (blk_queue_nomerges(q))
		return false;

	/*
	 * First try one-hit cache.
	 */
	if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq)) {
		list_add(&rq->queuelist, free);
		return true;
	}

	if (blk_queue_noxmerges(q))
		return false;

	ret = false;
	/*
	 * See if our hash lookup can find a potential backmerge.
	 */
	while (1) {
		__rq = elv_rqhash_find(q, blk_rq_pos(rq));
		if (!__rq || !blk_attempt_req_merge(q, __rq, rq))
			break;

		list_add(&rq->queuelist, free);
		/* The merged request could be merged with others, try again */
		ret = true;
		rq = __rq;
	}

	return ret;
}

void elv_merged_request(struct request_queue *q, struct request *rq,
		enum elv_merge type)
{
	struct elevator_queue *e = q->elevator;

	if (e->type->ops.request_merged)
		e->type->ops.request_merged(q, rq, type);

	if (type == ELEVATOR_BACK_MERGE)
		elv_rqhash_reposition(q, rq);

	q->last_merge = rq;
}

void elv_merge_requests(struct request_queue *q, struct request *rq,
			     struct request *next)
{
	struct elevator_queue *e = q->elevator;

	if (e->type->ops.requests_merged)
		e->type->ops.requests_merged(q, rq, next);

	elv_rqhash_reposition(q, rq);
	q->last_merge = rq;
}

struct request *elv_latter_request(struct request_queue *q, struct request *rq)
{
	struct elevator_queue *e = q->elevator;

	if (e->type->ops.next_request)
		return e->type->ops.next_request(q, rq);

	return NULL;
}

struct request *elv_former_request(struct request_queue *q, struct request *rq)
{
	struct elevator_queue *e = q->elevator;

	if (e->type->ops.former_request)
		return e->type->ops.former_request(q, rq);

	return NULL;
}

#define to_elv(atr) container_of((atr), struct elv_fs_entry, attr)

static ssize_t
elv_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
{
	struct elv_fs_entry *entry = to_elv(attr);
	struct elevator_queue *e;
	ssize_t error;

	if (!entry->show)
		return -EIO;

	e = container_of(kobj, struct elevator_queue, kobj);
	mutex_lock(&e->sysfs_lock);
	error = e->type ? entry->show(e, page) : -ENOENT;
	mutex_unlock(&e->sysfs_lock);
	return error;
}

static ssize_t
elv_attr_store(struct kobject *kobj, struct attribute *attr,
	       const char *page, size_t length)
{
	struct elv_fs_entry *entry = to_elv(attr);
	struct elevator_queue *e;
	ssize_t error;

	if (!entry->store)
		return -EIO;

	e = container_of(kobj, struct elevator_queue, kobj);
	mutex_lock(&e->sysfs_lock);
	error = e->type ? entry->store(e, page, length) : -ENOENT;
	mutex_unlock(&e->sysfs_lock);
	return error;
}

static const struct sysfs_ops elv_sysfs_ops = {
	.show	= elv_attr_show,
	.store	= elv_attr_store,
};

static const struct kobj_type elv_ktype = {
	.sysfs_ops	= &elv_sysfs_ops,
	.release	= elevator_release,
};
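
/*
 * Illustrative sketch (hypothetical scheduler "foo"): elv_attr_show() and
 * elv_attr_store() above dispatch to the NULL-terminated elv_fs_entry
 * array a scheduler exposes via elevator_type.elevator_attrs, which
 * elv_register_queue() below turns into files under
 * /sys/block/<dev>/queue/iosched/:
 *
 *	static ssize_t foo_fifo_expire_show(struct elevator_queue *e,
 *					    char *page)
 *	{
 *		struct foo_data *fd = e->elevator_data;	// hypothetical type
 *
 *		return sprintf(page, "%d\n", fd->fifo_expire);
 *	}
 *
 *	static struct elv_fs_entry foo_attrs[] = {
 *		__ATTR(fifo_expire, 0644, foo_fifo_expire_show,
 *		       foo_fifo_expire_store),
 *		__ATTR_NULL
 *	};
 */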

int elv_register_queue(struct request_queue *q, bool uevent)
{
	struct elevator_queue *e = q->elevator;
	int error;

	lockdep_assert_held(&q->sysfs_lock);

	error = kobject_add(&e->kobj, &q->disk->queue_kobj, "iosched");
	if (!error) {
		struct elv_fs_entry *attr = e->type->elevator_attrs;
		if (attr) {
			while (attr->attr.name) {
				if (sysfs_create_file(&e->kobj, &attr->attr))
					break;
				attr++;
			}
		}
		if (uevent)
			kobject_uevent(&e->kobj, KOBJ_ADD);

		set_bit(ELEVATOR_FLAG_REGISTERED, &e->flags);
	}
	return error;
}

void elv_unregister_queue(struct request_queue *q)
{
	struct elevator_queue *e = q->elevator;

	lockdep_assert_held(&q->sysfs_lock);

	if (e && test_and_clear_bit(ELEVATOR_FLAG_REGISTERED, &e->flags)) {
		kobject_uevent(&e->kobj, KOBJ_REMOVE);
		kobject_del(&e->kobj);
	}
}

int elv_register(struct elevator_type *e)
{
	/* finish_request is mandatory */
	if (WARN_ON_ONCE(!e->ops.finish_request))
		return -EINVAL;
	/* insert_requests and dispatch_request are mandatory */
	if (WARN_ON_ONCE(!e->ops.insert_requests || !e->ops.dispatch_request))
		return -EINVAL;

	/* create icq_cache if requested */
	if (e->icq_size) {
		if (WARN_ON(e->icq_size < sizeof(struct io_cq)) ||
		    WARN_ON(e->icq_align < __alignof__(struct io_cq)))
			return -EINVAL;

		snprintf(e->icq_cache_name, sizeof(e->icq_cache_name),
			 "%s_io_cq", e->elevator_name);
		e->icq_cache = kmem_cache_create(e->icq_cache_name, e->icq_size,
						 e->icq_align, 0, NULL);
		if (!e->icq_cache)
			return -ENOMEM;
	}

	/* register, don't allow duplicate names */
	spin_lock(&elv_list_lock);
	if (__elevator_find(e->elevator_name)) {
		spin_unlock(&elv_list_lock);
		kmem_cache_destroy(e->icq_cache);
		return -EBUSY;
	}
	list_add_tail(&e->list, &elv_list);
	spin_unlock(&elv_list_lock);

	printk(KERN_INFO "io scheduler %s registered\n", e->elevator_name);

	return 0;
}
EXPORT_SYMBOL_GPL(elv_register);
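
/*
 * Illustrative sketch of a minimal registration (names hypothetical,
 * modeled on mq-deadline): a scheduler fills in a struct elevator_type
 * with at least the mandatory ops checked above and registers it at
 * module init:
 *
 *	static struct elevator_type foo_sched = {
 *		.ops = {
 *			.insert_requests	= foo_insert_requests,
 *			.dispatch_request	= foo_dispatch_request,
 *			.finish_request		= foo_finish_request,
 *			.init_sched		= foo_init_sched,
 *			.exit_sched		= foo_exit_sched,
 *		},
 *		.elevator_name	= "foo",
 *		.elevator_owner	= THIS_MODULE,
 *	};
 *
 *	static int __init foo_sched_init(void)
 *	{
 *		return elv_register(&foo_sched);
 *	}
 *	module_init(foo_sched_init);
 *
 *	static void __exit foo_sched_exit(void)
 *	{
 *		elv_unregister(&foo_sched);
 *	}
 *	module_exit(foo_sched_exit);
 */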

void elv_unregister(struct elevator_type *e)
{
	/* unregister */
	spin_lock(&elv_list_lock);
	list_del_init(&e->list);
	spin_unlock(&elv_list_lock);

	/*
	 * Destroy icq_cache if it exists.  icq's are RCU managed.  Make
	 * sure all RCU operations are complete before proceeding.
	 */
	if (e->icq_cache) {
		rcu_barrier();
		kmem_cache_destroy(e->icq_cache);
		e->icq_cache = NULL;
	}
}
EXPORT_SYMBOL_GPL(elv_unregister);

static inline bool elv_support_iosched(struct request_queue *q)
{
	if (!queue_is_mq(q) ||
	    (q->tag_set && (q->tag_set->flags & BLK_MQ_F_NO_SCHED)))
		return false;
	return true;
}

/*
 * For single queue devices, and for devices whose hardware queues share a
 * tag set, default to using mq-deadline. For true multiqueue devices, and
 * for devices that opt out of a default scheduler, default to "none".
 */
static struct elevator_type *elevator_get_default(struct request_queue *q)
{
	if (q->tag_set && q->tag_set->flags & BLK_MQ_F_NO_SCHED_BY_DEFAULT)
		return NULL;

	if (q->nr_hw_queues != 1 &&
	    !blk_mq_is_shared_tags(q->tag_set->flags))
		return NULL;

	return elevator_find_get(q, "mq-deadline");
}

/*
 * Use the default elevator settings. If the chosen elevator initialization
 * fails, fall back to the "none" elevator (no elevator).
 */
void elevator_init_mq(struct request_queue *q)
{
	struct elevator_type *e;
	int err;

	if (!elv_support_iosched(q))
		return;

	WARN_ON_ONCE(blk_queue_registered(q));

	if (unlikely(q->elevator))
		return;

	e = elevator_get_default(q);
	if (!e)
		return;

	/*
	 * We are called before the disk is added, when there is no FS I/O,
	 * so freezing the queue plus canceling dispatch work is enough to
	 * drain any dispatch activity originating from passthrough
	 * requests. There is no need to quiesce the queue, which could add
	 * long boot latency, especially when many disks are involved.
	 */
	blk_mq_freeze_queue(q);
	blk_mq_cancel_work_sync(q);

	err = blk_mq_init_sched(q, e);

	blk_mq_unfreeze_queue(q);

	if (err) {
		pr_warn("\"%s\" elevator initialization failed, "
			"falling back to \"none\"\n", e->elevator_name);
	}

	elevator_put(e);
}

/*
 * Switch to the new_e io scheduler.
 *
 * If switching fails, we are most likely out of memory and unable to
 * restore the old io scheduler, so the io scheduler is left as "none".
 */
int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
{
	int ret;

	lockdep_assert_held(&q->sysfs_lock);

	blk_mq_freeze_queue(q);
	blk_mq_quiesce_queue(q);

	if (q->elevator) {
		elv_unregister_queue(q);
		elevator_exit(q);
	}

	ret = blk_mq_init_sched(q, new_e);
	if (ret)
		goto out_unfreeze;

	ret = elv_register_queue(q, true);
	if (ret) {
		elevator_exit(q);
		goto out_unfreeze;
	}
	blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);

out_unfreeze:
	blk_mq_unquiesce_queue(q);
	blk_mq_unfreeze_queue(q);

	if (ret) {
		pr_warn("elv: switch to \"%s\" failed, falling back to \"none\"\n",
			new_e->elevator_name);
	}

	return ret;
}

void elevator_disable(struct request_queue *q)
{
	lockdep_assert_held(&q->sysfs_lock);

	blk_mq_freeze_queue(q);
	blk_mq_quiesce_queue(q);

	elv_unregister_queue(q);
	elevator_exit(q);
	blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q);
	q->elevator = NULL;
	q->nr_requests = q->tag_set->queue_depth;
	blk_add_trace_msg(q, "elv switch: none");

	blk_mq_unquiesce_queue(q);
	blk_mq_unfreeze_queue(q);
}

/*
 * Switch this queue to the given IO scheduler.
 */
static int elevator_change(struct request_queue *q, const char *elevator_name)
{
	struct elevator_type *e;
	int ret;

	/* Make sure queue is not in the middle of being removed */
	if (!blk_queue_registered(q))
		return -ENOENT;

	if (!strncmp(elevator_name, "none", 4)) {
		if (q->elevator)
			elevator_disable(q);
		return 0;
	}

	if (q->elevator && elevator_match(q->elevator->type, elevator_name))
		return 0;

	e = elevator_find_get(q, elevator_name);
	if (!e) {
		request_module("%s-iosched", elevator_name);
		e = elevator_find_get(q, elevator_name);
		if (!e)
			return -EINVAL;
	}
	ret = elevator_switch(q, e);
	elevator_put(e);
	return ret;
}

ssize_t elv_iosched_store(struct request_queue *q, const char *buf,
			  size_t count)
{
	char elevator_name[ELV_NAME_MAX];
	int ret;

	if (!elv_support_iosched(q))
		return count;

	strscpy(elevator_name, buf, sizeof(elevator_name));
	ret = elevator_change(q, strstrip(elevator_name));
	if (!ret)
		return count;
	return ret;
}
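
/*
 * From userspace the store above is reached through the "scheduler" queue
 * attribute, e.g.:
 *
 *	# echo mq-deadline > /sys/block/sda/queue/scheduler
 *	# echo none > /sys/block/sda/queue/scheduler
 *
 * The name is stripped of surrounding whitespace; "none" tears down the
 * current scheduler via elevator_disable().
 */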

ssize_t elv_iosched_show(struct request_queue *q, char *name)
{
	struct elevator_queue *eq = q->elevator;
	struct elevator_type *cur = NULL, *e;
	int len = 0;

	if (!elv_support_iosched(q))
		return sprintf(name, "none\n");

	if (!q->elevator) {
		len += sprintf(name+len, "[none] ");
	} else {
		len += sprintf(name+len, "none ");
		cur = eq->type;
	}

	spin_lock(&elv_list_lock);
	list_for_each_entry(e, &elv_list, list) {
		if (e == cur)
			len += sprintf(name+len, "[%s] ", e->elevator_name);
		else
			len += sprintf(name+len, "%s ", e->elevator_name);
	}
	spin_unlock(&elv_list_lock);

	len += sprintf(name+len, "\n");
	return len;
}
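
/*
 * Reading the attribute lists every registered scheduler with the active
 * one in brackets, e.g. (assuming the commonly built-in schedulers):
 *
 *	$ cat /sys/block/sda/queue/scheduler
 *	none [mq-deadline] kyber bfq
 */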

struct request *elv_rb_former_request(struct request_queue *q,
				      struct request *rq)
{
	struct rb_node *rbprev = rb_prev(&rq->rb_node);

	if (rbprev)
		return rb_entry_rq(rbprev);

	return NULL;
}
EXPORT_SYMBOL(elv_rb_former_request);

struct request *elv_rb_latter_request(struct request_queue *q,
				      struct request *rq)
{
	struct rb_node *rbnext = rb_next(&rq->rb_node);

	if (rbnext)
		return rb_entry_rq(rbnext);

	return NULL;
}
EXPORT_SYMBOL(elv_rb_latter_request);

static int __init elevator_setup(char *str)
{
	pr_warn("Kernel parameter elevator= does not have any effect anymore.\n"
		"Please use sysfs to set IO scheduler for individual devices.\n");
	return 1;
}

__setup("elevator=", elevator_setup);
