1/*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2001 Jake Burkholder <jake@FreeBSD.org> 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29#include <sys/cdefs.h> 30#include "opt_sched.h" 31 32#include <sys/param.h> 33#include <sys/systm.h> 34#include <sys/kdb.h> 35#include <sys/kernel.h> 36#include <sys/ktr.h> 37#include <sys/lock.h> 38#include <sys/mutex.h> 39#include <sys/proc.h> 40#include <sys/queue.h> 41#include <sys/sched.h> 42#include <sys/smp.h> 43#include <sys/sysctl.h> 44 45#include <machine/cpu.h> 46 47/* Uncomment this to enable logging of critical_enter/exit. 
 */
#if 0
#define	KTR_CRITICAL	KTR_SCHED
#else
#define	KTR_CRITICAL	0
#endif

/* FULL_PREEMPTION is an extension of PREEMPTION; it cannot stand alone. */
#ifdef FULL_PREEMPTION
#ifndef PREEMPTION
#error "The FULL_PREEMPTION option requires the PREEMPTION option"
#endif
#endif

/* The status bitmap must provide exactly one bit per run queue. */
CTASSERT((RQB_BPW * RQB_LEN) == RQ_NQS);

/*
 * kern.sched.preemption allows user space to determine if preemption support
 * is compiled in or not.  It is not currently a boot or runtime flag that
 * can be changed.
 */
#ifdef PREEMPTION
static int kern_sched_preemption = 1;
#else
static int kern_sched_preemption = 0;
#endif
SYSCTL_INT(_kern_sched, OID_AUTO, preemption, CTLFLAG_RD,
    &kern_sched_preemption, 0, "Kernel preemption enabled");

/*
 * Support for scheduler stats exported via kern.sched.stats.  All stats may
 * be reset with kern.sched.stats.reset = 1.  Stats may be defined elsewhere
 * with SCHED_STAT_DEFINE().
 */
#ifdef SCHED_STATS
SYSCTL_NODE(_kern_sched, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "switch stats");

/* Switch reasons from mi_switch(9).
 */
/* One per-CPU counter per switch reason (SWT_*). */
DPCPU_DEFINE(long, sched_switch_stats[SWT_COUNT]);
SCHED_STAT_DEFINE_VAR(owepreempt,
    &DPCPU_NAME(sched_switch_stats[SWT_OWEPREEMPT]), "");
SCHED_STAT_DEFINE_VAR(turnstile,
    &DPCPU_NAME(sched_switch_stats[SWT_TURNSTILE]), "");
SCHED_STAT_DEFINE_VAR(sleepq,
    &DPCPU_NAME(sched_switch_stats[SWT_SLEEPQ]), "");
SCHED_STAT_DEFINE_VAR(relinquish,
    &DPCPU_NAME(sched_switch_stats[SWT_RELINQUISH]), "");
SCHED_STAT_DEFINE_VAR(needresched,
    &DPCPU_NAME(sched_switch_stats[SWT_NEEDRESCHED]), "");
SCHED_STAT_DEFINE_VAR(idle,
    &DPCPU_NAME(sched_switch_stats[SWT_IDLE]), "");
SCHED_STAT_DEFINE_VAR(iwait,
    &DPCPU_NAME(sched_switch_stats[SWT_IWAIT]), "");
SCHED_STAT_DEFINE_VAR(suspend,
    &DPCPU_NAME(sched_switch_stats[SWT_SUSPEND]), "");
SCHED_STAT_DEFINE_VAR(remotepreempt,
    &DPCPU_NAME(sched_switch_stats[SWT_REMOTEPREEMPT]), "");
SCHED_STAT_DEFINE_VAR(remotewakeidle,
    &DPCPU_NAME(sched_switch_stats[SWT_REMOTEWAKEIDLE]), "");
SCHED_STAT_DEFINE_VAR(bind,
    &DPCPU_NAME(sched_switch_stats[SWT_BIND]), "");

/*
 * Sysctl handler for kern.sched.stats.reset: writing a non-zero value
 * zeroes every per-CPU counter registered under _kern_sched_stats.
 */
static int
sysctl_stats_reset(SYSCTL_HANDLER_ARGS)
{
	struct sysctl_oid *p;
	uintptr_t counter;
	int error;
	int val;
	int i;

	val = 0;
	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (val == 0)
		return (0);
	/*
	 * Traverse the list of children of _kern_sched_stats and reset each
	 * to 0.  Skip the reset entry.
	 */
	RB_FOREACH(p, sysctl_oid_list, oidp->oid_parent) {
		if (p == oidp || p->oid_arg1 == NULL)
			continue;
		/* oid_arg1 holds the DPCPU offset of the counter. */
		counter = (uintptr_t)p->oid_arg1;
		CPU_FOREACH(i) {
			*(long *)(dpcpu_off[i] + counter) = 0;
		}
	}
	return (0);
}

SYSCTL_PROC(_kern_sched_stats, OID_AUTO, reset,
    CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_stats_reset, "I",
    "Reset scheduler statistics");
#endif

/************************************************************************
 * Functions that manipulate runnability from a thread perspective.	*
 ************************************************************************/
/*
 * Select the thread that will be run next.
 */

/*
 * Panic-time slow path of choosethread(): keep re-selecting until the
 * scheduler hands back a thread that is allowed to run during a panic
 * (a P_SYSTEM process's thread, or one already marked TDF_INPANIC).
 */
static __noinline struct thread *
choosethread_panic(struct thread *td)
{

	/*
	 * If we are in panic, only allow system threads,
	 * plus the one we are running in, to be run.
	 */
retry:
	if (((td->td_proc->p_flag & P_SYSTEM) == 0 &&
	    (td->td_flags & TDF_INPANIC) == 0)) {
		/* note that it is no longer on the run queue */
		TD_SET_CAN_RUN(td);
		td = sched_choose();
		goto retry;
	}

	TD_SET_RUNNING(td);
	return (td);
}

/*
 * Ask the scheduler for the next thread and mark it running; defer to
 * the panic filter above once the kernel has panicked.
 */
struct thread *
choosethread(void)
{
	struct thread *td;

	td = sched_choose();

	if (KERNEL_PANICKED())
		return (choosethread_panic(td));

	TD_SET_RUNNING(td);
	return (td);
}

/*
 * Kernel thread preemption implementation.  Critical sections mark
 * regions of code in which preemptions are not allowed.
 *
 * It might seem a good idea to inline critical_enter() but, in order
 * to prevent instructions reordering by the compiler, a __compiler_membar()
 * would have to be used here (the same as sched_pin()).  The performance
 * penalty imposed by the membar could, then, produce slower code than
 * the function call itself, for most cases.
 */
/*
 * KBI-stable wrapper around the inline critical_enter(); additionally
 * logs the nesting transition when KTR is compiled in.
 */
void
critical_enter_KBI(void)
{
#ifdef KTR
	struct thread *td = curthread;
#endif
	critical_enter();
	CTR4(KTR_CRITICAL, "critical_enter by thread %p (%ld, %s) to %d", td,
	    (long)td->td_proc->p_pid, td->td_name, td->td_critnest);
}

/*
 * Slow path taken when a critical section exit leaves a pending
 * preemption (td_owepreempt): perform the deferred context switch.
 */
void __noinline
critical_exit_preempt(void)
{
	struct thread *td;
	int flags;

	/*
	 * If td_critnest is 0, it is possible that we are going to get
	 * preempted again before reaching the code below.  This happens
	 * rarely and is harmless.  However, this means td_owepreempt may
	 * now be unset.
	 */
	td = curthread;
	if (td->td_critnest != 0)
		return;
	if (kdb_active)
		return;

	/*
	 * Microoptimization: we committed to switch,
	 * disable preemption in interrupt handlers
	 * while spinning for the thread lock.
	 */
	td->td_critnest = 1;
	thread_lock(td);
	td->td_critnest--;
	flags = SW_INVOL | SW_PREEMPT;
	if (TD_IS_IDLETHREAD(td))
		flags |= SWT_IDLE;
	else
		flags |= SWT_OWEPREEMPT;
	mi_switch(flags);
}

/*
 * KBI-stable wrapper around the inline critical_exit(); additionally
 * logs the nesting transition when KTR is compiled in.
 */
void
critical_exit_KBI(void)
{
#ifdef KTR
	struct thread *td = curthread;
#endif
	critical_exit();
	CTR4(KTR_CRITICAL, "critical_exit by thread %p (%ld, %s) to %d", td,
	    (long)td->td_proc->p_pid, td->td_name, td->td_critnest);
}

/************************************************************************
 * SYSTEM RUN QUEUE manipulations and tests				*
 ************************************************************************/
/*
 * Initialize a run structure.
 */
void
runq_init(struct runq *rq)
{
	int i;

	/* Zero the status bitmap, then set up each queue head. */
	bzero(rq, sizeof *rq);
	for (i = 0; i < RQ_NQS; i++)
		TAILQ_INIT(&rq->rq_queues[i]);
}

/*
 * Clear the status bit of the queue corresponding to priority level pri,
 * indicating that it is empty.
 */
static __inline void
runq_clrbit(struct runq *rq, int pri)
{
	struct rqbits *rqb;

	rqb = &rq->rq_status;
	CTR4(KTR_RUNQ, "runq_clrbit: bits=%#x %#x bit=%#x word=%d",
	    rqb->rqb_bits[RQB_WORD(pri)],
	    rqb->rqb_bits[RQB_WORD(pri)] & ~RQB_BIT(pri),
	    RQB_BIT(pri), RQB_WORD(pri));
	rqb->rqb_bits[RQB_WORD(pri)] &= ~RQB_BIT(pri);
}

/*
 * Find the index of the first non-empty run queue.  This is done by
 * scanning the status bits, a set bit indicates a non-empty queue.
 * Returns -1 if every queue is empty.
 */
static __inline int
runq_findbit(struct runq *rq)
{
	struct rqbits *rqb;
	int pri;
	int i;

	rqb = &rq->rq_status;
	for (i = 0; i < RQB_LEN; i++)
		if (rqb->rqb_bits[i]) {
			/* Lowest set bit in the word gives the priority. */
			pri = RQB_FFS(rqb->rqb_bits[i]) + (i << RQB_L2BPW);
			CTR3(KTR_RUNQ, "runq_findbit: bits=%#x i=%d pri=%d",
			    rqb->rqb_bits[i], i, pri);
			return (pri);
		}

	return (-1);
}

/*
 * As runq_findbit(), but start the search at priority 'pri' and wrap
 * around to the beginning of the bitmap at most once, so the whole
 * bitmap is scanned exactly one time.  Returns -1 if no queue is
 * non-empty.
 */
static __inline int
runq_findbit_from(struct runq *rq, u_char pri)
{
	struct rqbits *rqb;
	rqb_word_t mask;
	int i;

	/*
	 * Set the mask for the first word so we ignore priorities before 'pri'.
	 */
	mask = (rqb_word_t)-1 << (pri & (RQB_BPW - 1));
	rqb = &rq->rq_status;
again:
	/* Note: the update clause resets mask to all-ones after word one. */
	for (i = RQB_WORD(pri); i < RQB_LEN; mask = -1, i++) {
		mask = rqb->rqb_bits[i] & mask;
		if (mask == 0)
			continue;
		pri = RQB_FFS(mask) + (i << RQB_L2BPW);
		CTR3(KTR_RUNQ, "runq_findbit_from: bits=%#x i=%d pri=%d",
		    mask, i, pri);
		return (pri);
	}
	if (pri == 0)
		return (-1);
	/*
	 * Wrap back around to the beginning of the list just once so we
	 * scan the whole thing.
	 */
	pri = 0;
	goto again;
}

/*
 * Set the status bit of the queue corresponding to priority level pri,
 * indicating that it is non-empty.
344 */ 345static __inline void 346runq_setbit(struct runq *rq, int pri) 347{ 348 struct rqbits *rqb; 349 350 rqb = &rq->rq_status; 351 CTR4(KTR_RUNQ, "runq_setbit: bits=%#x %#x bit=%#x word=%d", 352 rqb->rqb_bits[RQB_WORD(pri)], 353 rqb->rqb_bits[RQB_WORD(pri)] | RQB_BIT(pri), 354 RQB_BIT(pri), RQB_WORD(pri)); 355 rqb->rqb_bits[RQB_WORD(pri)] |= RQB_BIT(pri); 356} 357 358/* 359 * Add the thread to the queue specified by its priority, and set the 360 * corresponding status bit. 361 */ 362void 363runq_add(struct runq *rq, struct thread *td, int flags) 364{ 365 struct rqhead *rqh; 366 int pri; 367 368 pri = td->td_priority / RQ_PPQ; 369 td->td_rqindex = pri; 370 runq_setbit(rq, pri); 371 rqh = &rq->rq_queues[pri]; 372 CTR4(KTR_RUNQ, "runq_add: td=%p pri=%d %d rqh=%p", 373 td, td->td_priority, pri, rqh); 374 if (flags & SRQ_PREEMPTED) { 375 TAILQ_INSERT_HEAD(rqh, td, td_runq); 376 } else { 377 TAILQ_INSERT_TAIL(rqh, td, td_runq); 378 } 379} 380 381void 382runq_add_pri(struct runq *rq, struct thread *td, u_char pri, int flags) 383{ 384 struct rqhead *rqh; 385 386 KASSERT(pri < RQ_NQS, ("runq_add_pri: %d out of range", pri)); 387 td->td_rqindex = pri; 388 runq_setbit(rq, pri); 389 rqh = &rq->rq_queues[pri]; 390 CTR4(KTR_RUNQ, "runq_add_pri: td=%p pri=%d idx=%d rqh=%p", 391 td, td->td_priority, pri, rqh); 392 if (flags & SRQ_PREEMPTED) { 393 TAILQ_INSERT_HEAD(rqh, td, td_runq); 394 } else { 395 TAILQ_INSERT_TAIL(rqh, td, td_runq); 396 } 397} 398/* 399 * Return true if there are runnable processes of any priority on the run 400 * queue, false otherwise. Has no side effects, does not modify the run 401 * queue structure. 
402 */ 403int 404runq_check(struct runq *rq) 405{ 406 struct rqbits *rqb; 407 int i; 408 409 rqb = &rq->rq_status; 410 for (i = 0; i < RQB_LEN; i++) 411 if (rqb->rqb_bits[i]) { 412 CTR2(KTR_RUNQ, "runq_check: bits=%#x i=%d", 413 rqb->rqb_bits[i], i); 414 return (1); 415 } 416 CTR0(KTR_RUNQ, "runq_check: empty"); 417 418 return (0); 419} 420 421/* 422 * Find the highest priority process on the run queue. 423 */ 424struct thread * 425runq_choose_fuzz(struct runq *rq, int fuzz) 426{ 427 struct rqhead *rqh; 428 struct thread *td; 429 int pri; 430 431 while ((pri = runq_findbit(rq)) != -1) { 432 rqh = &rq->rq_queues[pri]; 433 /* fuzz == 1 is normal.. 0 or less are ignored */ 434 if (fuzz > 1) { 435 /* 436 * In the first couple of entries, check if 437 * there is one for our CPU as a preference. 438 */ 439 int count = fuzz; 440 int cpu = PCPU_GET(cpuid); 441 struct thread *td2; 442 td2 = td = TAILQ_FIRST(rqh); 443 444 while (count-- && td2) { 445 if (td2->td_lastcpu == cpu) { 446 td = td2; 447 break; 448 } 449 td2 = TAILQ_NEXT(td2, td_runq); 450 } 451 } else 452 td = TAILQ_FIRST(rqh); 453 KASSERT(td != NULL, ("runq_choose_fuzz: no proc on busy queue")); 454 CTR3(KTR_RUNQ, 455 "runq_choose_fuzz: pri=%d thread=%p rqh=%p", pri, td, rqh); 456 return (td); 457 } 458 CTR1(KTR_RUNQ, "runq_choose_fuzz: idleproc pri=%d", pri); 459 460 return (NULL); 461} 462 463/* 464 * Find the highest priority process on the run queue. 
465 */ 466struct thread * 467runq_choose(struct runq *rq) 468{ 469 struct rqhead *rqh; 470 struct thread *td; 471 int pri; 472 473 while ((pri = runq_findbit(rq)) != -1) { 474 rqh = &rq->rq_queues[pri]; 475 td = TAILQ_FIRST(rqh); 476 KASSERT(td != NULL, ("runq_choose: no thread on busy queue")); 477 CTR3(KTR_RUNQ, 478 "runq_choose: pri=%d thread=%p rqh=%p", pri, td, rqh); 479 return (td); 480 } 481 CTR1(KTR_RUNQ, "runq_choose: idlethread pri=%d", pri); 482 483 return (NULL); 484} 485 486struct thread * 487runq_choose_from(struct runq *rq, u_char idx) 488{ 489 struct rqhead *rqh; 490 struct thread *td; 491 int pri; 492 493 if ((pri = runq_findbit_from(rq, idx)) != -1) { 494 rqh = &rq->rq_queues[pri]; 495 td = TAILQ_FIRST(rqh); 496 KASSERT(td != NULL, ("runq_choose: no thread on busy queue")); 497 CTR4(KTR_RUNQ, 498 "runq_choose_from: pri=%d thread=%p idx=%d rqh=%p", 499 pri, td, td->td_rqindex, rqh); 500 return (td); 501 } 502 CTR1(KTR_RUNQ, "runq_choose_from: idlethread pri=%d", pri); 503 504 return (NULL); 505} 506/* 507 * Remove the thread from the queue specified by its priority, and clear the 508 * corresponding status bit if the queue becomes empty. 509 * Caller must set state afterwards. 510 */ 511void 512runq_remove(struct runq *rq, struct thread *td) 513{ 514 515 runq_remove_idx(rq, td, NULL); 516} 517 518void 519runq_remove_idx(struct runq *rq, struct thread *td, u_char *idx) 520{ 521 struct rqhead *rqh; 522 u_char pri; 523 524 KASSERT(td->td_flags & TDF_INMEM, 525 ("runq_remove_idx: thread swapped out")); 526 pri = td->td_rqindex; 527 KASSERT(pri < RQ_NQS, ("runq_remove_idx: Invalid index %d\n", pri)); 528 rqh = &rq->rq_queues[pri]; 529 CTR4(KTR_RUNQ, "runq_remove_idx: td=%p, pri=%d %d rqh=%p", 530 td, td->td_priority, pri, rqh); 531 TAILQ_REMOVE(rqh, td, td_runq); 532 if (TAILQ_EMPTY(rqh)) { 533 CTR0(KTR_RUNQ, "runq_remove_idx: empty"); 534 runq_clrbit(rq, pri); 535 if (idx != NULL && *idx == pri) 536 *idx = (pri + 1) % RQ_NQS; 537 } 538} 539