/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2003 John Baldwin <jhb@FreeBSD.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Machine dependent interrupt code for x86.  For x86, we have to
 * deal with different PICs.  Thus, we use the passed-in vector to look up
 * an interrupt source associated with that vector.  The interrupt source
 * describes which PIC the source belongs to and includes methods to handle
 * that source.
 */

#include "opt_atpic.h"
#include "opt_ddb.h"
#include "opt_smp.h"

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/interrupt.h>
#include <sys/ktr.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/sbuf.h>
#include <sys/smp.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>
#include <sys/vmmeter.h>
#include <machine/clock.h>
#include <machine/intr_machdep.h>
#include <machine/smp.h>
#ifdef DDB
#include <ddb/ddb.h>
#endif

#ifndef DEV_ATPIC
#include <machine/segments.h>
#include <machine/frame.h>
#include <dev/ic/i8259.h>
#include <x86/isa/icu.h>
#include <isa/isareg.h>
#endif

#include <vm/vm.h>

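/*
 * PIC source methods take a struct intsrc *, but intr_event_create()
 * expects void (*)(void *) handlers; mask_fn is the cast type used to
 * bridge the two.
 */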
typedef void (*mask_fn)(void *);

static int intrcnt_index;
static struct intsrc **interrupt_sources;
#ifdef SMP
static struct intsrc **interrupt_sorted;
static int intrbalance;
SYSCTL_INT(_hw, OID_AUTO, intrbalance, CTLFLAG_RWTUN, &intrbalance, 0,
    "Interrupt auto-balance interval (seconds).  Zero disables.");
static struct timeout_task intrbalance_task;
#endif
static struct sx intrsrc_lock;
static struct mtx intrpic_lock;
static struct mtx intrcnt_lock;
static TAILQ_HEAD(pics_head, pic) pics;
u_int num_io_irqs;

#if defined(SMP) && !defined(EARLY_AP_STARTUP)
#error EARLY_AP_STARTUP required on x86
#endif

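/*
 * Interrupt counters and their names live in parallel arrays that are
 * allocated in intr_init_sources() once the number of interrupts is
 * known; until then sintrcnt and sintrnames hold placeholder sizes
 * (those of the bare pointers).
 */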
#define	INTRNAME_LEN	(MAXCOMLEN + 1)
u_long *intrcnt;
char *intrnames;
size_t sintrcnt = sizeof(intrcnt);
size_t sintrnames = sizeof(intrnames);
int nintrcnt;

static MALLOC_DEFINE(M_INTR, "intr", "Interrupt Sources");

static int	intr_assign_cpu(void *arg, int cpu);
static void	intr_disable_src(void *arg);
static void	intr_init(void *__dummy);
static int	intr_pic_registered(struct pic *pic);
static void	intrcnt_setname(const char *name, int index);
static void	intrcnt_updatename(struct intsrc *is);
static void	intrcnt_register(struct intsrc *is);

/*
 * SYSINIT levels for SI_SUB_INTR:
 *
 * SI_ORDER_FIRST: Initialize locks and pics TAILQ, xen_hvm_cpu_init
 * SI_ORDER_SECOND: Xen PICs
 * SI_ORDER_THIRD: Add I/O APIC PICs, alloc MSI and Xen IRQ ranges
 * SI_ORDER_FOURTH: Add 8259A PICs
 * SI_ORDER_FOURTH + 1: Finalize interrupt count and add interrupt sources
 * SI_ORDER_MIDDLE: SMP interrupt counters
 * SI_ORDER_ANY: Enable interrupts on BSP
 */

static int
intr_pic_registered(struct pic *pic)
{
	struct pic *p;

	TAILQ_FOREACH(p, &pics, pics) {
		if (p == pic)
			return (1);
	}
	return (0);
}

/*
 * Register a new interrupt controller (PIC).  This is to support suspend
 * and resume where we suspend/resume controllers rather than individual
 * sources.  This also allows controllers with no active sources (such as
 * 8259As in a system using the APICs) to participate in suspend and resume.
 */
int
intr_register_pic(struct pic *pic)
{
	int error;

	mtx_lock(&intrpic_lock);
	if (intr_pic_registered(pic))
		error = EBUSY;
	else {
		TAILQ_INSERT_TAIL(&pics, pic, pics);
		error = 0;
	}
	mtx_unlock(&intrpic_lock);
	return (error);
}

/*
 * Allocate interrupt source arrays and register interrupt sources
 * once the number of interrupts is known.
 */
static void
intr_init_sources(void *arg)
{
	struct pic *pic;

	MPASS(num_io_irqs > 0);

	interrupt_sources = mallocarray(num_io_irqs, sizeof(*interrupt_sources),
	    M_INTR, M_WAITOK | M_ZERO);
#ifdef SMP
	interrupt_sorted = mallocarray(num_io_irqs, sizeof(*interrupt_sorted),
	    M_INTR, M_WAITOK | M_ZERO);
#endif

	/*
	 * - 1 ??? dummy counter.
	 * - 2 counters for each I/O interrupt.
	 * - 1 counter for each CPU for lapic timer.
	 * - 1 counter for each CPU for the Hyper-V vmbus driver.
	 * - 8 counters for each CPU for IPI counters for SMP.
	 */
	nintrcnt = 1 + num_io_irqs * 2 + mp_ncpus * 2;
#ifdef COUNT_IPIS
	if (mp_ncpus > 1)
		nintrcnt += 8 * mp_ncpus;
#endif
	intrcnt = mallocarray(nintrcnt, sizeof(u_long), M_INTR, M_WAITOK |
	    M_ZERO);
	intrnames = mallocarray(nintrcnt, INTRNAME_LEN, M_INTR, M_WAITOK |
	    M_ZERO);
	sintrcnt = nintrcnt * sizeof(u_long);
	sintrnames = nintrcnt * INTRNAME_LEN;

	intrcnt_setname("???", 0);
	intrcnt_index = 1;

	/*
	 * NB: intrpic_lock is not held here to avoid LORs due to
	 * malloc() in intr_register_source().  However, we are still
	 * single-threaded at this point in startup so the list of
	 * PICs shouldn't change.
	 */
	TAILQ_FOREACH(pic, &pics, pics) {
		if (pic->pic_register_sources != NULL)
			pic->pic_register_sources(pic);
	}
}
SYSINIT(intr_init_sources, SI_SUB_INTR, SI_ORDER_FOURTH + 1, intr_init_sources,
    NULL);

/*
 * Register a new interrupt source with the global interrupt system.
 * Interrupts must be disabled globally when this function is called.
 */
int
intr_register_source(struct intsrc *isrc)
{
	int error, vector;

	KASSERT(intr_pic_registered(isrc->is_pic), ("unregistered PIC"));
	vector = isrc->is_pic->pic_vector(isrc);
	KASSERT(vector < num_io_irqs, ("IRQ %d too large (%u irqs)", vector,
	    num_io_irqs));
	if (interrupt_sources[vector] != NULL)
		return (EEXIST);
	error = intr_event_create(&isrc->is_event, isrc, 0, vector,
	    intr_disable_src, (mask_fn)isrc->is_pic->pic_enable_source,
	    (mask_fn)isrc->is_pic->pic_eoi_source, intr_assign_cpu, "irq%d:",
	    vector);
	if (error)
		return (error);
	sx_xlock(&intrsrc_lock);
	if (interrupt_sources[vector] != NULL) {
		sx_xunlock(&intrsrc_lock);
		intr_event_destroy(isrc->is_event);
		return (EEXIST);
	}
	intrcnt_register(isrc);
	interrupt_sources[vector] = isrc;
	isrc->is_handlers = 0;
	sx_xunlock(&intrsrc_lock);
	return (0);
}

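/*
 * Return the interrupt source for a vector, or NULL if the vector is
 * out of range or has no registered source.
 */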
struct intsrc *
intr_lookup_source(int vector)
{

	if (vector < 0 || vector >= num_io_irqs)
		return (NULL);
	return (interrupt_sources[vector]);
}

int
intr_add_handler(struct intsrc *isrc, const char *name, driver_filter_t filter,
    driver_intr_t handler, void *arg, enum intr_type flags, void **cookiep,
    int domain)
{
	int error;

	error = intr_event_add_handler(isrc->is_event, name, filter, handler,
	    arg, intr_priority(flags), flags, cookiep);
	if (error == 0) {
		sx_xlock(&intrsrc_lock);
		intrcnt_updatename(isrc);
		isrc->is_handlers++;
		if (isrc->is_handlers == 1) {
			isrc->is_domain = domain;
			isrc->is_pic->pic_enable_intr(isrc);
			isrc->is_pic->pic_enable_source(isrc);
		}
		sx_xunlock(&intrsrc_lock);
	}
	return (error);
}

int
intr_remove_handler(void *cookie)
{
	struct intsrc *isrc;
	int error;

	isrc = intr_handler_source(cookie);
	error = intr_event_remove_handler(cookie);
	if (error == 0) {
		sx_xlock(&intrsrc_lock);
		isrc->is_handlers--;
		if (isrc->is_handlers == 0) {
			isrc->is_pic->pic_disable_source(isrc, PIC_NO_EOI);
			isrc->is_pic->pic_disable_intr(isrc);
		}
		intrcnt_updatename(isrc);
		sx_xunlock(&intrsrc_lock);
	}
	return (error);
}

int
intr_config_intr(struct intsrc *isrc, enum intr_trigger trig,
    enum intr_polarity pol)
{

	return (isrc->is_pic->pic_config_intr(isrc, trig, pol));
}

static void
intr_disable_src(void *arg)
{
	struct intsrc *isrc;

	isrc = arg;
	isrc->is_pic->pic_disable_source(isrc, PIC_EOI);
}

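/*
 * Dispatch a hardware interrupt: bump its counter, note a pending clock
 * interrupt for IRQ 0, and hand the event to intr_event_handle(), masking
 * the source if no handler claims it.
 */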
void
intr_execute_handlers(struct intsrc *isrc, struct trapframe *frame)
{
	struct intr_event *ie;
	int vector;

	/*
	 * We count software interrupts when we process them.  The
	 * code here follows previous practice, but there's an
	 * argument for counting hardware interrupts when they're
	 * processed too.
	 */
	(*isrc->is_count)++;
	VM_CNT_INC(v_intr);

	ie = isrc->is_event;

	/*
	 * XXX: We assume that IRQ 0 is only used for the ISA timer
	 * device (clk).
	 */
	vector = isrc->is_pic->pic_vector(isrc);
	if (vector == 0)
		clkintr_pending = 1;

	/*
	 * For stray interrupts, mask and EOI the source, bump the
	 * stray count, and log the condition.
	 */
	if (intr_event_handle(ie, frame) != 0) {
		isrc->is_pic->pic_disable_source(isrc, PIC_EOI);
		(*isrc->is_straycount)++;
		if (*isrc->is_straycount < INTR_STRAY_LOG_MAX)
			log(LOG_ERR, "stray irq%d\n", vector);
		else if (*isrc->is_straycount == INTR_STRAY_LOG_MAX)
			log(LOG_CRIT,
			    "too many stray irq %d's: not logging anymore\n",
			    vector);
	}
}

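/*
 * Resume all registered PICs in registration order.  Without the atpic
 * driver, first reset the 8259As to a quiescent state.
 */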
void
intr_resume(bool suspend_cancelled)
{
	struct pic *pic;

#ifndef DEV_ATPIC
	atpic_reset();
#endif
	mtx_lock(&intrpic_lock);
	TAILQ_FOREACH(pic, &pics, pics) {
		if (pic->pic_resume != NULL)
			pic->pic_resume(pic, suspend_cancelled);
	}
	mtx_unlock(&intrpic_lock);
}

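/* Suspend all registered PICs in the reverse of registration order. */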
void
intr_suspend(void)
{
	struct pic *pic;

	mtx_lock(&intrpic_lock);
	TAILQ_FOREACH_REVERSE(pic, &pics, pics_head, pics) {
		if (pic->pic_suspend != NULL)
			pic->pic_suspend(pic);
	}
	mtx_unlock(&intrpic_lock);
}

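/*
 * intr_event CPU-assignment method: bind an interrupt source to the
 * local APIC of the given CPU.
 */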
static int
intr_assign_cpu(void *arg, int cpu)
{
#ifdef SMP
	struct intsrc *isrc;
	int error;

	MPASS(mp_ncpus == 1 || smp_started);

	/* Nothing to do if there is only a single CPU. */
	if (mp_ncpus > 1 && cpu != NOCPU) {
		isrc = arg;
		sx_xlock(&intrsrc_lock);
		error = isrc->is_pic->pic_assign_cpu(isrc, cpu_apic_ids[cpu]);
		if (error == 0)
			isrc->is_cpu = cpu;
		sx_xunlock(&intrsrc_lock);
	} else
		error = 0;
	return (error);
#else
	return (EOPNOTSUPP);
#endif
}

static void
intrcnt_setname(const char *name, int index)
{

	snprintf(intrnames + INTRNAME_LEN * index, INTRNAME_LEN, "%-*s",
	    INTRNAME_LEN - 1, name);
}

static void
intrcnt_updatename(struct intsrc *is)
{

	intrcnt_setname(is->is_event->ie_fullname, is->is_index);
}

static void
intrcnt_register(struct intsrc *is)
{
	char straystr[INTRNAME_LEN];

	KASSERT(is->is_event != NULL, ("%s: isrc with no event", __func__));
	mtx_lock_spin(&intrcnt_lock);
	MPASS(intrcnt_index + 2 <= nintrcnt);
	is->is_index = intrcnt_index;
	intrcnt_index += 2;
	snprintf(straystr, sizeof(straystr), "stray irq%d",
	    is->is_pic->pic_vector(is));
	intrcnt_updatename(is);
	is->is_count = &intrcnt[is->is_index];
	intrcnt_setname(straystr, is->is_index + 1);
	is->is_straycount = &intrcnt[is->is_index + 1];
	mtx_unlock_spin(&intrcnt_lock);
}

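/*
 * Claim a single counter slot for a machine-dependent consumer such as
 * the lapic timer or the SMP IPI counters.
 */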
void
intrcnt_add(const char *name, u_long **countp)
{

	mtx_lock_spin(&intrcnt_lock);
	MPASS(intrcnt_index < nintrcnt);
	*countp = &intrcnt[intrcnt_index];
	intrcnt_setname(name, intrcnt_index);
	intrcnt_index++;
	mtx_unlock_spin(&intrcnt_lock);
}

static void
intr_init(void *dummy __unused)
{

	TAILQ_INIT(&pics);
	mtx_init(&intrpic_lock, "intrpic", NULL, MTX_DEF);
	sx_init(&intrsrc_lock, "intrsrc");
	mtx_init(&intrcnt_lock, "intrcnt", NULL, MTX_SPIN);
}
SYSINIT(intr_init, SI_SUB_INTR, SI_ORDER_FIRST, intr_init, NULL);

static void
intr_init_final(void *dummy __unused)
{

	/*
	 * Enable interrupts on the BSP after all of the interrupt
	 * controllers are initialized.  Device interrupts are still
	 * disabled in the interrupt controllers until interrupt
	 * handlers are registered.  Interrupts are enabled on each AP
	 * after their first context switch.
	 */
	enable_intr();
}
SYSINIT(intr_init_final, SI_SUB_INTR, SI_ORDER_ANY, intr_init_final, NULL);

#ifndef DEV_ATPIC
/* Initialize the two 8259A's to a known-good shutdown state. */
void
atpic_reset(void)
{

	outb(IO_ICU1, ICW1_RESET | ICW1_IC4);	/* ICW1: begin init, ICW4 needed */
	outb(IO_ICU1 + ICU_IMR_OFFSET, IDT_IO_INTS);	/* ICW2: vector base, IRQ0-7 */
	outb(IO_ICU1 + ICU_IMR_OFFSET, IRQ_MASK(ICU_SLAVEID));	/* ICW3: slave line */
	outb(IO_ICU1 + ICU_IMR_OFFSET, MASTER_MODE);	/* ICW4 */
	outb(IO_ICU1 + ICU_IMR_OFFSET, 0xff);	/* OCW1: mask all sources */
	outb(IO_ICU1, OCW3_SEL | OCW3_RR);	/* OCW3: read IRR by default */

	outb(IO_ICU2, ICW1_RESET | ICW1_IC4);	/* ICW1: begin init, ICW4 needed */
	outb(IO_ICU2 + ICU_IMR_OFFSET, IDT_IO_INTS + 8);	/* ICW2: vector base, IRQ8-15 */
	outb(IO_ICU2 + ICU_IMR_OFFSET, ICU_SLAVEID);	/* ICW3: cascade identity */
	outb(IO_ICU2 + ICU_IMR_OFFSET, SLAVE_MODE);	/* ICW4 */
	outb(IO_ICU2 + ICU_IMR_OFFSET, 0xff);	/* OCW1: mask all sources */
	outb(IO_ICU2, OCW3_SEL | OCW3_RR);	/* OCW3: read IRR by default */
}
#endif

/* Add a description to an active interrupt handler. */
int
intr_describe(struct intsrc *isrc, void *ih, const char *descr)
{
	int error;

	error = intr_event_describe_handler(isrc->is_event, ih, descr);
	if (error)
		return (error);
	intrcnt_updatename(isrc);
	return (0);
}

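/*
 * Walk all allocated interrupt sources and ask each source's PIC to
 * reprogram its pin, for PICs that provide a reprogram method.
 */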
void
intr_reprogram(void)
{
	struct intsrc *is;
	u_int v;

	sx_xlock(&intrsrc_lock);
	for (v = 0; v < num_io_irqs; v++) {
		is = interrupt_sources[v];
		if (is == NULL)
			continue;
		if (is->is_pic->pic_reprogram_pin != NULL)
			is->is_pic->pic_reprogram_pin(is);
	}
	sx_xunlock(&intrsrc_lock);
}

#ifdef DDB
/*
 * Dump data about interrupt handlers.
 */
DB_SHOW_COMMAND(irqs, db_show_irqs)
{
	struct intsrc **isrc;
	u_int i;
	int verbose;

	if (strcmp(modif, "v") == 0)
		verbose = 1;
	else
		verbose = 0;
	isrc = interrupt_sources;
	for (i = 0; i < num_io_irqs && !db_pager_quit; i++, isrc++)
		if (*isrc != NULL)
			db_dump_intr_event((*isrc)->is_event, verbose);
}
#endif

#ifdef SMP
/*
 * Support for balancing interrupt sources across CPUs.  For now we just
 * allocate CPUs round-robin.
 *
 * XXX If the system has a domain without any usable CPUs (e.g., where all
 * APIC IDs are 256 or greater and we do not have an IOMMU) we use
 * intr_no_domain to fall back to assigning interrupts without regard for
 * domain.  Once we can rely on the presence of an IOMMU on all x86 platforms
 * we can revert this.
 */

cpuset_t intr_cpus = CPUSET_T_INITIALIZER(0x1);
static int current_cpu[MAXMEMDOM];
static bool intr_no_domain;

static void
intr_init_cpus(void)
{
	int i;

	for (i = 0; i < vm_ndomains; i++) {
		if (CPU_OVERLAP(&cpuset_domain[i], &intr_cpus) == 0) {
			intr_no_domain = true;
			printf("%s: unable to route interrupts to CPUs in domain %d\n",
			    __func__, i);
		}

		current_cpu[i] = 0;
		if (intr_no_domain && i > 0)
			continue;
		if (!CPU_ISSET(current_cpu[i], &intr_cpus) ||
		    !CPU_ISSET(current_cpu[i], &cpuset_domain[i]))
			intr_next_cpu(i);
	}
}

/*
 * Return the CPU that the next interrupt source should use.  For now
 * this just returns the next local APIC according to round-robin.
 */
u_int
intr_next_cpu(int domain)
{
	u_int apic_id;

	MPASS(mp_ncpus == 1 || smp_started);
	if (mp_ncpus == 1)
		return (PCPU_GET(apic_id));

	if (intr_no_domain)
		domain = 0;
	mtx_lock_spin(&icu_lock);
	apic_id = cpu_apic_ids[current_cpu[domain]];
	do {
		current_cpu[domain]++;
		if (current_cpu[domain] > mp_maxid)
			current_cpu[domain] = 0;
	} while (!CPU_ISSET(current_cpu[domain], &intr_cpus) ||
	    (!CPU_ISSET(current_cpu[domain], &cpuset_domain[domain]) &&
	    !intr_no_domain));
	mtx_unlock_spin(&icu_lock);
	return (apic_id);
}

/*
 * Add a CPU to our mask of valid CPUs that can be destinations of
 * interrupts.
 */
void
intr_add_cpu(u_int cpu)
{

	if (cpu >= MAXCPU)
		panic("%s: Invalid CPU ID %u", __func__, cpu);
	if (bootverbose)
		printf("INTR: Adding local APIC %d as a target\n",
		    cpu_apic_ids[cpu]);

	CPU_SET(cpu, &intr_cpus);
}

static void
intr_smp_startup(void *arg __unused)
{

	intr_init_cpus();
}
SYSINIT(intr_smp_startup, SI_SUB_SMP, SI_ORDER_SECOND, intr_smp_startup,
    NULL);


/*
 * TODO: Export this information in a non-MD fashion, integrate with vmstat -i.
 */
static int
sysctl_hw_intrs(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	struct intsrc *isrc;
	u_int i;
	int error;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);

	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
	sx_slock(&intrsrc_lock);
	for (i = 0; i < num_io_irqs; i++) {
		isrc = interrupt_sources[i];
		if (isrc == NULL)
			continue;
		sbuf_printf(&sbuf, "%s:%d @cpu%d(domain%d): %ld\n",
		    isrc->is_event->ie_fullname,
		    isrc->is_index,
		    isrc->is_cpu,
		    isrc->is_domain,
		    *isrc->is_count);
	}
	sx_sunlock(&intrsrc_lock);
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}
SYSCTL_PROC(_hw, OID_AUTO, intrs,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
    0, 0, sysctl_hw_intrs, "A",
    "interrupt:number @cpu: count");

/*
 * Compare two, possibly NULL, entries in the interrupt source array
 * by load.
 */
static int
intrcmp(const void *one, const void *two)
{
	const struct intsrc *i1, *i2;

	i1 = *(const struct intsrc * const *)one;
	i2 = *(const struct intsrc * const *)two;
	if (i1 != NULL && i2 != NULL)
		return (*i1->is_count - *i2->is_count);
	/* NULL entries sort before any live source. */
	if (i1 != NULL)
		return (1);
	if (i2 != NULL)
		return (-1);
	return (0);
}

/*
 * Balance IRQs across available CPUs according to load.
 */
static void
intr_balance(void *dummy __unused, int pending __unused)
{
	struct intsrc *isrc;
	int interval;
	u_int cpu;
	int i;

	interval = intrbalance;
	if (interval == 0)
		goto out;

	/*
	 * Sort interrupts according to count.
	 */
	sx_xlock(&intrsrc_lock);
	memcpy(interrupt_sorted, interrupt_sources, num_io_irqs *
	    sizeof(interrupt_sorted[0]));
	qsort(interrupt_sorted, num_io_irqs, sizeof(interrupt_sorted[0]),
	    intrcmp);

	/*
	 * Restart the scan from the same location to avoid moving in the
	 * common case.
	 */
	intr_init_cpus();

	/*
	 * Assign round-robin from most loaded to least.
	 */
	for (i = num_io_irqs - 1; i >= 0; i--) {
		isrc = interrupt_sorted[i];
		if (isrc == NULL || isrc->is_event->ie_cpu != NOCPU)
			continue;
		cpu = current_cpu[isrc->is_domain];
		intr_next_cpu(isrc->is_domain);
		if (isrc->is_cpu != cpu &&
		    isrc->is_pic->pic_assign_cpu(isrc,
		    cpu_apic_ids[cpu]) == 0)
			isrc->is_cpu = cpu;
	}
	sx_xunlock(&intrsrc_lock);
out:
	taskqueue_enqueue_timeout(taskqueue_thread, &intrbalance_task,
	    interval ? hz * interval : hz * 60);
}

static void
intr_balance_init(void *dummy __unused)
{

	TIMEOUT_TASK_INIT(taskqueue_thread, &intrbalance_task, 0, intr_balance,
	    NULL);
	taskqueue_enqueue_timeout(taskqueue_thread, &intrbalance_task, hz);
}
SYSINIT(intr_balance_init, SI_SUB_SMP, SI_ORDER_ANY, intr_balance_init, NULL);

#else
/*
 * Always route interrupts to the current processor in the UP case.
 */
u_int
intr_next_cpu(int domain)
{

	return (PCPU_GET(apic_id));
}
#endif