mp_machdep.c revision 331910
1/*- 2 * Copyright (c) 1996, by Steve Passe 3 * Copyright (c) 2003, by Peter Wemm 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. The name of the developer may NOT be used to endorse or promote products 12 * derived from this software without specific prior written permission. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 
25 */ 26 27#include <sys/cdefs.h> 28__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/mp_machdep.c 331910 2018-04-03 07:52:06Z avg $"); 29 30#include "opt_cpu.h" 31#include "opt_ddb.h" 32#include "opt_kstack_pages.h" 33#include "opt_sched.h" 34#include "opt_smp.h" 35 36#include <sys/param.h> 37#include <sys/systm.h> 38#include <sys/bus.h> 39#include <sys/cpuset.h> 40#ifdef GPROF 41#include <sys/gmon.h> 42#endif 43#include <sys/kernel.h> 44#include <sys/ktr.h> 45#include <sys/lock.h> 46#include <sys/malloc.h> 47#include <sys/memrange.h> 48#include <sys/mutex.h> 49#include <sys/pcpu.h> 50#include <sys/proc.h> 51#include <sys/sched.h> 52#include <sys/smp.h> 53#include <sys/sysctl.h> 54 55#include <vm/vm.h> 56#include <vm/vm_param.h> 57#include <vm/pmap.h> 58#include <vm/vm_kern.h> 59#include <vm/vm_extern.h> 60 61#include <x86/apicreg.h> 62#include <machine/clock.h> 63#include <machine/cputypes.h> 64#include <machine/cpufunc.h> 65#include <x86/mca.h> 66#include <machine/md_var.h> 67#include <machine/pcb.h> 68#include <machine/psl.h> 69#include <machine/smp.h> 70#include <machine/specialreg.h> 71#include <machine/tss.h> 72#include <machine/cpu.h> 73 74#define WARMBOOT_TARGET 0 75#define WARMBOOT_OFF (KERNBASE + 0x0467) 76#define WARMBOOT_SEG (KERNBASE + 0x0469) 77 78#define CMOS_REG (0x70) 79#define CMOS_DATA (0x71) 80#define BIOS_RESET (0x0f) 81#define BIOS_WARM (0x0a) 82 83/* lock region used by kernel profiling */ 84int mcount_lock; 85 86int mp_naps; /* # of Applications processors */ 87int boot_cpu_id = -1; /* designated BSP */ 88 89extern struct pcpu __pcpu[]; 90 91/* AP uses this during bootstrap. Do not staticize. */ 92char *bootSTK; 93static int bootAP; 94 95/* Free these after use */ 96void *bootstacks[MAXCPU]; 97 98/* Temporary variables for init_secondary() */ 99char *doublefault_stack; 100char *nmi_stack; 101void *dpcpu; 102 103struct pcb stoppcbs[MAXCPU]; 104struct susppcb **susppcbs; 105 106/* Variables needed for SMP tlb shootdown. 
*/ 107vm_offset_t smp_tlb_addr2; 108struct invpcid_descr smp_tlb_invpcid; 109volatile int smp_tlb_wait; 110uint64_t pcid_cr3; 111pmap_t smp_tlb_pmap; 112extern int invpcid_works; 113 114#ifdef COUNT_IPIS 115/* Interrupt counts. */ 116static u_long *ipi_preempt_counts[MAXCPU]; 117static u_long *ipi_ast_counts[MAXCPU]; 118u_long *ipi_invltlb_counts[MAXCPU]; 119u_long *ipi_invlrng_counts[MAXCPU]; 120u_long *ipi_invlpg_counts[MAXCPU]; 121u_long *ipi_invlcache_counts[MAXCPU]; 122u_long *ipi_rendezvous_counts[MAXCPU]; 123static u_long *ipi_hardclock_counts[MAXCPU]; 124#endif 125 126/* Default cpu_ops implementation. */ 127struct cpu_ops cpu_ops = { 128 .ipi_vectored = lapic_ipi_vectored 129}; 130 131extern inthand_t IDTVEC(fast_syscall), IDTVEC(fast_syscall32); 132 133extern int pmap_pcid_enabled; 134 135/* 136 * Local data and functions. 137 */ 138 139static volatile cpuset_t ipi_nmi_pending; 140 141volatile cpuset_t resuming_cpus; 142volatile cpuset_t toresume_cpus; 143 144/* used to hold the AP's until we are ready to release them */ 145static struct mtx ap_boot_mtx; 146 147/* Set to 1 once we're ready to let the APs out of the pen. */ 148static volatile int aps_ready = 0; 149 150/* 151 * Store data from cpu_add() until later in the boot when we actually setup 152 * the APs. 
153 */ 154struct cpu_info { 155 int cpu_present:1; 156 int cpu_bsp:1; 157 int cpu_disabled:1; 158 int cpu_hyperthread:1; 159} static cpu_info[MAX_APIC_ID + 1]; 160int cpu_apic_ids[MAXCPU]; 161int apic_cpuids[MAX_APIC_ID + 1]; 162 163/* Holds pending bitmap based IPIs per CPU */ 164volatile u_int cpu_ipi_pending[MAXCPU]; 165 166static u_int boot_address; 167static int cpu_logical; /* logical cpus per core */ 168static int cpu_cores; /* cores per package */ 169 170static void assign_cpu_ids(void); 171static void set_interrupt_apic_ids(void); 172static int start_all_aps(void); 173static int start_ap(int apic_id); 174static void release_aps(void *dummy); 175 176static u_int hyperthreading_cpus; /* logical cpus sharing L1 cache */ 177static int hyperthreading_allowed = 1; 178static u_int bootMP_size; 179 180static void 181mem_range_AP_init(void) 182{ 183 if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP) 184 mem_range_softc.mr_op->initAP(&mem_range_softc); 185} 186 187static void 188topo_probe_amd(void) 189{ 190 int core_id_bits; 191 int id; 192 193 /* AMD processors do not support HTT. */ 194 cpu_logical = 1; 195 196 if ((amd_feature2 & AMDID2_CMP) == 0) { 197 cpu_cores = 1; 198 return; 199 } 200 201 core_id_bits = (cpu_procinfo2 & AMDID_COREID_SIZE) >> 202 AMDID_COREID_SIZE_SHIFT; 203 if (core_id_bits == 0) { 204 cpu_cores = (cpu_procinfo2 & AMDID_CMP_CORES) + 1; 205 return; 206 } 207 208 /* Fam 10h and newer should get here. */ 209 for (id = 0; id <= MAX_APIC_ID; id++) { 210 /* Check logical CPU availability. */ 211 if (!cpu_info[id].cpu_present || cpu_info[id].cpu_disabled) 212 continue; 213 /* Check if logical CPU has the same package ID. */ 214 if ((id >> core_id_bits) != (boot_cpu_id >> core_id_bits)) 215 continue; 216 cpu_cores++; 217 } 218} 219 220/* 221 * Round up to the next power of two, if necessary, and then 222 * take log2. 223 * Returns -1 if argument is zero. 
224 */ 225static __inline int 226mask_width(u_int x) 227{ 228 229 return (fls(x << (1 - powerof2(x))) - 1); 230} 231 232static void 233topo_probe_0x4(void) 234{ 235 u_int p[4]; 236 int pkg_id_bits; 237 int core_id_bits; 238 int max_cores; 239 int max_logical; 240 int id; 241 242 /* Both zero and one here mean one logical processor per package. */ 243 max_logical = (cpu_feature & CPUID_HTT) != 0 ? 244 (cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1; 245 if (max_logical <= 1) 246 return; 247 248 /* 249 * Because of uniformity assumption we examine only 250 * those logical processors that belong to the same 251 * package as BSP. Further, we count number of 252 * logical processors that belong to the same core 253 * as BSP thus deducing number of threads per core. 254 */ 255 if (cpu_high >= 0x4) { 256 cpuid_count(0x04, 0, p); 257 max_cores = ((p[0] >> 26) & 0x3f) + 1; 258 } else 259 max_cores = 1; 260 core_id_bits = mask_width(max_logical/max_cores); 261 if (core_id_bits < 0) 262 return; 263 pkg_id_bits = core_id_bits + mask_width(max_cores); 264 265 for (id = 0; id <= MAX_APIC_ID; id++) { 266 /* Check logical CPU availability. */ 267 if (!cpu_info[id].cpu_present || cpu_info[id].cpu_disabled) 268 continue; 269 /* Check if logical CPU has the same package ID. */ 270 if ((id >> pkg_id_bits) != (boot_cpu_id >> pkg_id_bits)) 271 continue; 272 cpu_cores++; 273 /* Check if logical CPU has the same package and core IDs. */ 274 if ((id >> core_id_bits) == (boot_cpu_id >> core_id_bits)) 275 cpu_logical++; 276 } 277 278 KASSERT(cpu_cores >= 1 && cpu_logical >= 1, 279 ("topo_probe_0x4 couldn't find BSP")); 280 281 cpu_cores /= cpu_logical; 282 hyperthreading_cpus = cpu_logical; 283} 284 285static void 286topo_probe_0xb(void) 287{ 288 u_int p[4]; 289 int bits; 290 int cnt; 291 int i; 292 int logical; 293 int type; 294 int x; 295 296 /* We only support three levels for now. 
*/ 297 for (i = 0; i < 3; i++) { 298 cpuid_count(0x0b, i, p); 299 300 /* Fall back if CPU leaf 11 doesn't really exist. */ 301 if (i == 0 && p[1] == 0) { 302 topo_probe_0x4(); 303 return; 304 } 305 306 bits = p[0] & 0x1f; 307 logical = p[1] &= 0xffff; 308 type = (p[2] >> 8) & 0xff; 309 if (type == 0 || logical == 0) 310 break; 311 /* 312 * Because of uniformity assumption we examine only 313 * those logical processors that belong to the same 314 * package as BSP. 315 */ 316 for (cnt = 0, x = 0; x <= MAX_APIC_ID; x++) { 317 if (!cpu_info[x].cpu_present || 318 cpu_info[x].cpu_disabled) 319 continue; 320 if (x >> bits == boot_cpu_id >> bits) 321 cnt++; 322 } 323 if (type == CPUID_TYPE_SMT) 324 cpu_logical = cnt; 325 else if (type == CPUID_TYPE_CORE) 326 cpu_cores = cnt; 327 } 328 if (cpu_logical == 0) 329 cpu_logical = 1; 330 cpu_cores /= cpu_logical; 331} 332 333/* 334 * Both topology discovery code and code that consumes topology 335 * information assume top-down uniformity of the topology. 336 * That is, all physical packages must be identical and each 337 * core in a package must have the same number of threads. 338 * Topology information is queried only on BSP, on which this 339 * code runs and for which it can query CPUID information. 340 * Then topology is extrapolated on all packages using the 341 * uniformity assumption. 342 */ 343static void 344topo_probe(void) 345{ 346 static int cpu_topo_probed = 0; 347 348 if (cpu_topo_probed) 349 return; 350 351 CPU_ZERO(&logical_cpus_mask); 352 if (mp_ncpus <= 1) 353 cpu_cores = cpu_logical = 1; 354 else if (cpu_vendor_id == CPU_VENDOR_AMD) 355 topo_probe_amd(); 356 else if (cpu_vendor_id == CPU_VENDOR_INTEL) { 357 /* 358 * See Intel(R) 64 Architecture Processor 359 * Topology Enumeration article for details. 360 * 361 * Note that 0x1 <= cpu_high < 4 case should be 362 * compatible with topo_probe_0x4() logic when 363 * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1) 364 * or it should trigger the fallback otherwise. 
365 */ 366 if (cpu_high >= 0xb) 367 topo_probe_0xb(); 368 else if (cpu_high >= 0x1) 369 topo_probe_0x4(); 370 } 371 372 /* 373 * Fallback: assume each logical CPU is in separate 374 * physical package. That is, no multi-core, no SMT. 375 */ 376 if (cpu_cores == 0 || cpu_logical == 0) 377 cpu_cores = cpu_logical = 1; 378 cpu_topo_probed = 1; 379} 380 381struct cpu_group * 382cpu_topo(void) 383{ 384 int cg_flags; 385 386 /* 387 * Determine whether any threading flags are 388 * necessry. 389 */ 390 topo_probe(); 391 if (cpu_logical > 1 && hyperthreading_cpus) 392 cg_flags = CG_FLAG_HTT; 393 else if (cpu_logical > 1) 394 cg_flags = CG_FLAG_SMT; 395 else 396 cg_flags = 0; 397 if (mp_ncpus % (cpu_cores * cpu_logical) != 0) { 398 printf("WARNING: Non-uniform processors.\n"); 399 printf("WARNING: Using suboptimal topology.\n"); 400 return (smp_topo_none()); 401 } 402 /* 403 * No multi-core or hyper-threaded. 404 */ 405 if (cpu_logical * cpu_cores == 1) 406 return (smp_topo_none()); 407 /* 408 * Only HTT no multi-core. 409 */ 410 if (cpu_logical > 1 && cpu_cores == 1) 411 return (smp_topo_1level(CG_SHARE_L1, cpu_logical, cg_flags)); 412 /* 413 * Only multi-core no HTT. 414 */ 415 if (cpu_cores > 1 && cpu_logical == 1) 416 return (smp_topo_1level(CG_SHARE_L2, cpu_cores, cg_flags)); 417 /* 418 * Both HTT and multi-core. 419 */ 420 return (smp_topo_2level(CG_SHARE_L2, cpu_cores, 421 CG_SHARE_L1, cpu_logical, cg_flags)); 422} 423 424/* 425 * Calculate usable address in base memory for AP trampoline code. 
426 */ 427u_int 428mp_bootaddress(u_int basemem) 429{ 430 431 bootMP_size = mptramp_end - mptramp_start; 432 boot_address = trunc_page(basemem * 1024); /* round down to 4k boundary */ 433 if (((basemem * 1024) - boot_address) < bootMP_size) 434 boot_address -= PAGE_SIZE; /* not enough, lower by 4k */ 435 /* 3 levels of page table pages */ 436 mptramp_pagetables = boot_address - (PAGE_SIZE * 3); 437 438 return mptramp_pagetables; 439} 440 441void 442cpu_add(u_int apic_id, char boot_cpu) 443{ 444 445 if (apic_id > MAX_APIC_ID) { 446 panic("SMP: APIC ID %d too high", apic_id); 447 return; 448 } 449 KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice", 450 apic_id)); 451 cpu_info[apic_id].cpu_present = 1; 452 if (boot_cpu) { 453 KASSERT(boot_cpu_id == -1, 454 ("CPU %d claims to be BSP, but CPU %d already is", apic_id, 455 boot_cpu_id)); 456 boot_cpu_id = apic_id; 457 cpu_info[apic_id].cpu_bsp = 1; 458 } 459 if (mp_ncpus < MAXCPU) { 460 mp_ncpus++; 461 mp_maxid = mp_ncpus - 1; 462 } 463 if (bootverbose) 464 printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" : 465 "AP"); 466} 467 468void 469cpu_mp_setmaxid(void) 470{ 471 472 /* 473 * mp_maxid should be already set by calls to cpu_add(). 474 * Just sanity check its value here. 475 */ 476 if (mp_ncpus == 0) 477 KASSERT(mp_maxid == 0, 478 ("%s: mp_ncpus is zero, but mp_maxid is not", __func__)); 479 else if (mp_ncpus == 1) 480 mp_maxid = 0; 481 else 482 KASSERT(mp_maxid >= mp_ncpus - 1, 483 ("%s: counters out of sync: max %d, count %d", __func__, 484 mp_maxid, mp_ncpus)); 485} 486 487int 488cpu_mp_probe(void) 489{ 490 491 /* 492 * Always record BSP in CPU map so that the mbuf init code works 493 * correctly. 494 */ 495 CPU_SETOF(0, &all_cpus); 496 if (mp_ncpus == 0) { 497 /* 498 * No CPUs were found, so this must be a UP system. Setup 499 * the variables to represent a system with a single CPU 500 * with an id of 0. 501 */ 502 mp_ncpus = 1; 503 return (0); 504 } 505 506 /* At least one CPU was found. 
*/ 507 if (mp_ncpus == 1) { 508 /* 509 * One CPU was found, so this must be a UP system with 510 * an I/O APIC. 511 */ 512 mp_maxid = 0; 513 return (0); 514 } 515 516 /* At least two CPUs were found. */ 517 return (1); 518} 519 520/* 521 * Initialize the IPI handlers and start up the AP's. 522 */ 523void 524cpu_mp_start(void) 525{ 526 int i; 527 528 /* Initialize the logical ID to APIC ID table. */ 529 for (i = 0; i < MAXCPU; i++) { 530 cpu_apic_ids[i] = -1; 531 cpu_ipi_pending[i] = 0; 532 } 533 534 /* Install an inter-CPU IPI for TLB invalidation */ 535 if (pmap_pcid_enabled) { 536 setidt(IPI_INVLTLB, IDTVEC(invltlb_pcid), SDT_SYSIGT, 537 SEL_KPL, 0); 538 setidt(IPI_INVLPG, IDTVEC(invlpg_pcid), SDT_SYSIGT, 539 SEL_KPL, 0); 540 } else { 541 setidt(IPI_INVLTLB, IDTVEC(invltlb), SDT_SYSIGT, SEL_KPL, 0); 542 setidt(IPI_INVLPG, IDTVEC(invlpg), SDT_SYSIGT, SEL_KPL, 0); 543 } 544 setidt(IPI_INVLRNG, IDTVEC(invlrng), SDT_SYSIGT, SEL_KPL, 0); 545 546 /* Install an inter-CPU IPI for cache invalidation. */ 547 setidt(IPI_INVLCACHE, IDTVEC(invlcache), SDT_SYSIGT, SEL_KPL, 0); 548 549 /* Install an inter-CPU IPI for all-CPU rendezvous */ 550 setidt(IPI_RENDEZVOUS, IDTVEC(rendezvous), SDT_SYSIGT, SEL_KPL, 0); 551 552 /* Install generic inter-CPU IPI handler */ 553 setidt(IPI_BITMAP_VECTOR, IDTVEC(ipi_intr_bitmap_handler), 554 SDT_SYSIGT, SEL_KPL, 0); 555 556 /* Install an inter-CPU IPI for CPU stop/restart */ 557 setidt(IPI_STOP, IDTVEC(cpustop), SDT_SYSIGT, SEL_KPL, 0); 558 559 /* Install an inter-CPU IPI for CPU suspend/resume */ 560 setidt(IPI_SUSPEND, IDTVEC(cpususpend), SDT_SYSIGT, SEL_KPL, 0); 561 562 /* Set boot_cpu_id if needed. */ 563 if (boot_cpu_id == -1) { 564 boot_cpu_id = PCPU_GET(apic_id); 565 cpu_info[boot_cpu_id].cpu_bsp = 1; 566 } else 567 KASSERT(boot_cpu_id == PCPU_GET(apic_id), 568 ("BSP's APIC ID doesn't match boot_cpu_id")); 569 570 /* Probe logical/physical core configuration. 
*/ 571 topo_probe(); 572 573 assign_cpu_ids(); 574 575 /* Start each Application Processor */ 576 start_all_aps(); 577 578 set_interrupt_apic_ids(); 579} 580 581 582/* 583 * Print various information about the SMP system hardware and setup. 584 */ 585void 586cpu_mp_announce(void) 587{ 588 const char *hyperthread; 589 int i; 590 591 printf("FreeBSD/SMP: %d package(s) x %d core(s)", 592 mp_ncpus / (cpu_cores * cpu_logical), cpu_cores); 593 if (hyperthreading_cpus > 1) 594 printf(" x %d HTT threads", cpu_logical); 595 else if (cpu_logical > 1) 596 printf(" x %d SMT threads", cpu_logical); 597 printf("\n"); 598 599 /* List active CPUs first. */ 600 printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id); 601 for (i = 1; i < mp_ncpus; i++) { 602 if (cpu_info[cpu_apic_ids[i]].cpu_hyperthread) 603 hyperthread = "/HT"; 604 else 605 hyperthread = ""; 606 printf(" cpu%d (AP%s): APIC ID: %2d\n", i, hyperthread, 607 cpu_apic_ids[i]); 608 } 609 610 /* List disabled CPUs last. */ 611 for (i = 0; i <= MAX_APIC_ID; i++) { 612 if (!cpu_info[i].cpu_present || !cpu_info[i].cpu_disabled) 613 continue; 614 if (cpu_info[i].cpu_hyperthread) 615 hyperthread = "/HT"; 616 else 617 hyperthread = ""; 618 printf(" cpu (AP%s): APIC ID: %2d (disabled)\n", hyperthread, 619 i); 620 } 621} 622 623/* 624 * AP CPU's call this to initialize themselves. 625 */ 626void 627init_secondary(void) 628{ 629 struct pcpu *pc; 630 struct nmi_pcpu *np; 631 u_int64_t msr, cr0; 632 u_int cpuid; 633 int cpu, gsel_tss, x; 634 struct region_descriptor ap_gdt; 635 636 /* Set by the startup code for us to use */ 637 cpu = bootAP; 638 639 /* Init tss */ 640 common_tss[cpu] = common_tss[0]; 641 common_tss[cpu].tss_rsp0 = 0; /* not used until after switch */ 642 common_tss[cpu].tss_iobase = sizeof(struct amd64tss) + 643 IOPAGES * PAGE_SIZE; 644 common_tss[cpu].tss_ist1 = (long)&doublefault_stack[PAGE_SIZE]; 645 646 /* The NMI stack runs on IST2. 
*/ 647 np = ((struct nmi_pcpu *) &nmi_stack[PAGE_SIZE]) - 1; 648 common_tss[cpu].tss_ist2 = (long) np; 649 650 /* Prepare private GDT */ 651 gdt_segs[GPROC0_SEL].ssd_base = (long) &common_tss[cpu]; 652 for (x = 0; x < NGDT; x++) { 653 if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) && 654 x != GUSERLDT_SEL && x != (GUSERLDT_SEL + 1)) 655 ssdtosd(&gdt_segs[x], &gdt[NGDT * cpu + x]); 656 } 657 ssdtosyssd(&gdt_segs[GPROC0_SEL], 658 (struct system_segment_descriptor *)&gdt[NGDT * cpu + GPROC0_SEL]); 659 ap_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; 660 ap_gdt.rd_base = (long) &gdt[NGDT * cpu]; 661 lgdt(&ap_gdt); /* does magic intra-segment return */ 662 663 /* Get per-cpu data */ 664 pc = &__pcpu[cpu]; 665 666 /* prime data page for it to use */ 667 pcpu_init(pc, cpu, sizeof(struct pcpu)); 668 dpcpu_init(dpcpu, cpu); 669 pc->pc_apic_id = cpu_apic_ids[cpu]; 670 pc->pc_prvspace = pc; 671 pc->pc_curthread = 0; 672 pc->pc_tssp = &common_tss[cpu]; 673 pc->pc_commontssp = &common_tss[cpu]; 674 pc->pc_rsp0 = 0; 675 pc->pc_tss = (struct system_segment_descriptor *)&gdt[NGDT * cpu + 676 GPROC0_SEL]; 677 pc->pc_fs32p = &gdt[NGDT * cpu + GUFS32_SEL]; 678 pc->pc_gs32p = &gdt[NGDT * cpu + GUGS32_SEL]; 679 pc->pc_ldt = (struct system_segment_descriptor *)&gdt[NGDT * cpu + 680 GUSERLDT_SEL]; 681 682 /* Save the per-cpu pointer for use by the NMI handler. 
*/ 683 np->np_pcpu = (register_t) pc; 684 685 wrmsr(MSR_FSBASE, 0); /* User value */ 686 wrmsr(MSR_GSBASE, (u_int64_t)pc); 687 wrmsr(MSR_KGSBASE, (u_int64_t)pc); /* XXX User value while we're in the kernel */ 688 fix_cpuid(); 689 690 lidt(&r_idt); 691 692 gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); 693 ltr(gsel_tss); 694 695 /* 696 * Set to a known state: 697 * Set by mpboot.s: CR0_PG, CR0_PE 698 * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM 699 */ 700 cr0 = rcr0(); 701 cr0 &= ~(CR0_CD | CR0_NW | CR0_EM); 702 load_cr0(cr0); 703 704 /* Set up the fast syscall stuff */ 705 msr = rdmsr(MSR_EFER) | EFER_SCE; 706 wrmsr(MSR_EFER, msr); 707 wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall)); 708 wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32)); 709 msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) | 710 ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48); 711 wrmsr(MSR_STAR, msr); 712 wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D); 713 714 /* Disable local APIC just to be sure. */ 715 lapic_disable(); 716 717 /* signal our startup to the BSP. */ 718 mp_naps++; 719 720 /* Spin until the BSP releases the AP's. */ 721 while (!aps_ready) 722 ia32_pause(); 723 724 /* Initialize the PAT MSR. */ 725 pmap_init_pat(); 726 727 /* set up CPU registers and state */ 728 cpu_setregs(); 729 730 /* set up SSE/NX */ 731 initializecpu(); 732 733 /* set up FPU state on the AP */ 734 fpuinit(); 735 736 if (cpu_ops.cpu_init) 737 cpu_ops.cpu_init(); 738 739 /* A quick check from sanity claus */ 740 cpuid = PCPU_GET(cpuid); 741 if (PCPU_GET(apic_id) != lapic_id()) { 742 printf("SMP: cpuid = %d\n", cpuid); 743 printf("SMP: actual apic_id = %d\n", lapic_id()); 744 printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id)); 745 panic("cpuid mismatch! boom!!"); 746 } 747 748 /* Initialize curthread. 
*/ 749 KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread")); 750 PCPU_SET(curthread, PCPU_GET(idlethread)); 751 752 mca_init(); 753 754 mtx_lock_spin(&ap_boot_mtx); 755 756 /* Init local apic for irq's */ 757 lapic_setup(1); 758 759 /* Set memory range attributes for this CPU to match the BSP */ 760 mem_range_AP_init(); 761 762 smp_cpus++; 763 764 CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid); 765 printf("SMP: AP CPU #%d Launched!\n", cpuid); 766 767 /* Determine if we are a logical CPU. */ 768 /* XXX Calculation depends on cpu_logical being a power of 2, e.g. 2 */ 769 if (cpu_logical > 1 && PCPU_GET(apic_id) % cpu_logical != 0) 770 CPU_SET(cpuid, &logical_cpus_mask); 771 772 if (bootverbose) 773 lapic_dump("AP"); 774 775 if (smp_cpus == mp_ncpus) { 776 /* enable IPI's, tlb shootdown, freezes etc */ 777 atomic_store_rel_int(&smp_started, 1); 778 } 779 780 /* 781 * Enable global pages TLB extension 782 * This also implicitly flushes the TLB 783 */ 784 785 load_cr4(rcr4() | CR4_PGE); 786 if (pmap_pcid_enabled) 787 load_cr4(rcr4() | CR4_PCIDE); 788 load_ds(_udatasel); 789 load_es(_udatasel); 790 load_fs(_ufssel); 791 mtx_unlock_spin(&ap_boot_mtx); 792 793 /* Wait until all the AP's are up. */ 794 while (smp_started == 0) 795 ia32_pause(); 796 797 /* Start per-CPU event timers. */ 798 cpu_initclocks_ap(); 799 800 sched_throw(NULL); 801 802 panic("scheduler returned us to %s", __func__); 803 /* NOTREACHED */ 804} 805 806/******************************************************************* 807 * local functions and data 808 */ 809 810/* 811 * We tell the I/O APIC code about all the CPUs we want to receive 812 * interrupts. If we don't want certain CPUs to receive IRQs we 813 * can simply not tell the I/O APIC code about them in this function. 814 * We also do not tell it about the BSP since it tells itself about 815 * the BSP internally to work with UP kernels and on UP machines. 
816 */ 817static void 818set_interrupt_apic_ids(void) 819{ 820 u_int i, apic_id; 821 822 for (i = 0; i < MAXCPU; i++) { 823 apic_id = cpu_apic_ids[i]; 824 if (apic_id == -1) 825 continue; 826 if (cpu_info[apic_id].cpu_bsp) 827 continue; 828 if (cpu_info[apic_id].cpu_disabled) 829 continue; 830 831 /* Don't let hyperthreads service interrupts. */ 832 if (hyperthreading_cpus > 1 && 833 apic_id % hyperthreading_cpus != 0) 834 continue; 835 836 intr_add_cpu(i); 837 } 838} 839 840/* 841 * Assign logical CPU IDs to local APICs. 842 */ 843static void 844assign_cpu_ids(void) 845{ 846 u_int i; 847 848 TUNABLE_INT_FETCH("machdep.hyperthreading_allowed", 849 &hyperthreading_allowed); 850 851 /* Check for explicitly disabled CPUs. */ 852 for (i = 0; i <= MAX_APIC_ID; i++) { 853 if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp) 854 continue; 855 856 if (hyperthreading_cpus > 1 && i % hyperthreading_cpus != 0) { 857 cpu_info[i].cpu_hyperthread = 1; 858 859 /* 860 * Don't use HT CPU if it has been disabled by a 861 * tunable. 862 */ 863 if (hyperthreading_allowed == 0) { 864 cpu_info[i].cpu_disabled = 1; 865 continue; 866 } 867 } 868 869 /* Don't use this CPU if it has been disabled by a tunable. */ 870 if (resource_disabled("lapic", i)) { 871 cpu_info[i].cpu_disabled = 1; 872 continue; 873 } 874 } 875 876 if (hyperthreading_allowed == 0 && hyperthreading_cpus > 1) { 877 hyperthreading_cpus = 0; 878 cpu_logical = 1; 879 } 880 881 /* 882 * Assign CPU IDs to local APIC IDs and disable any CPUs 883 * beyond MAXCPU. CPU 0 is always assigned to the BSP. 884 * 885 * To minimize confusion for userland, we attempt to number 886 * CPUs such that all threads and cores in a package are 887 * grouped together. For now we assume that the BSP is always 888 * the first thread in a package and just start adding APs 889 * starting with the BSP's APIC ID. 
890 */ 891 mp_ncpus = 1; 892 cpu_apic_ids[0] = boot_cpu_id; 893 apic_cpuids[boot_cpu_id] = 0; 894 for (i = boot_cpu_id + 1; i != boot_cpu_id; 895 i == MAX_APIC_ID ? i = 0 : i++) { 896 if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp || 897 cpu_info[i].cpu_disabled) 898 continue; 899 900 if (mp_ncpus < MAXCPU) { 901 cpu_apic_ids[mp_ncpus] = i; 902 apic_cpuids[i] = mp_ncpus; 903 mp_ncpus++; 904 } else 905 cpu_info[i].cpu_disabled = 1; 906 } 907 KASSERT(mp_maxid >= mp_ncpus - 1, 908 ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid, 909 mp_ncpus)); 910} 911 912/* 913 * start each AP in our list 914 */ 915static int 916start_all_aps(void) 917{ 918 vm_offset_t va = boot_address + KERNBASE; 919 u_int64_t *pt4, *pt3, *pt2; 920 u_int32_t mpbioswarmvec; 921 int apic_id, cpu, i; 922 u_char mpbiosreason; 923 924 mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN); 925 926 /* install the AP 1st level boot code */ 927 pmap_kenter(va, boot_address); 928 pmap_invalidate_page(kernel_pmap, va); 929 bcopy(mptramp_start, (void *)va, bootMP_size); 930 931 /* Locate the page tables, they'll be below the trampoline */ 932 pt4 = (u_int64_t *)(uintptr_t)(mptramp_pagetables + KERNBASE); 933 pt3 = pt4 + (PAGE_SIZE) / sizeof(u_int64_t); 934 pt2 = pt3 + (PAGE_SIZE) / sizeof(u_int64_t); 935 936 /* Create the initial 1GB replicated page tables */ 937 for (i = 0; i < 512; i++) { 938 /* Each slot of the level 4 pages points to the same level 3 page */ 939 pt4[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + PAGE_SIZE); 940 pt4[i] |= PG_V | PG_RW | PG_U; 941 942 /* Each slot of the level 3 pages points to the same level 2 page */ 943 pt3[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + (2 * PAGE_SIZE)); 944 pt3[i] |= PG_V | PG_RW | PG_U; 945 946 /* The level 2 page slots are mapped with 2MB pages for 1GB. 
*/ 947 pt2[i] = i * (2 * 1024 * 1024); 948 pt2[i] |= PG_V | PG_RW | PG_PS | PG_U; 949 } 950 951 /* save the current value of the warm-start vector */ 952 mpbioswarmvec = *((u_int32_t *) WARMBOOT_OFF); 953 outb(CMOS_REG, BIOS_RESET); 954 mpbiosreason = inb(CMOS_DATA); 955 956 /* setup a vector to our boot code */ 957 *((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET; 958 *((volatile u_short *) WARMBOOT_SEG) = (boot_address >> 4); 959 outb(CMOS_REG, BIOS_RESET); 960 outb(CMOS_DATA, BIOS_WARM); /* 'warm-start' */ 961 962 /* start each AP */ 963 for (cpu = 1; cpu < mp_ncpus; cpu++) { 964 apic_id = cpu_apic_ids[cpu]; 965 966 /* allocate and set up an idle stack data page */ 967 bootstacks[cpu] = (void *)kmem_malloc(kernel_arena, 968 KSTACK_PAGES * PAGE_SIZE, M_WAITOK | M_ZERO); 969 doublefault_stack = (char *)kmem_malloc(kernel_arena, 970 PAGE_SIZE, M_WAITOK | M_ZERO); 971 nmi_stack = (char *)kmem_malloc(kernel_arena, PAGE_SIZE, 972 M_WAITOK | M_ZERO); 973 dpcpu = (void *)kmem_malloc(kernel_arena, DPCPU_SIZE, 974 M_WAITOK | M_ZERO); 975 976 bootSTK = (char *)bootstacks[cpu] + KSTACK_PAGES * PAGE_SIZE - 8; 977 bootAP = cpu; 978 979 /* attempt to start the Application Processor */ 980 if (!start_ap(apic_id)) { 981 /* restore the warmstart vector */ 982 *(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec; 983 panic("AP #%d (PHY# %d) failed!", cpu, apic_id); 984 } 985 986 CPU_SET(cpu, &all_cpus); /* record AP in CPU map */ 987 } 988 989 /* restore the warmstart vector */ 990 *(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec; 991 992 outb(CMOS_REG, BIOS_RESET); 993 outb(CMOS_DATA, mpbiosreason); 994 995 /* number of APs actually started */ 996 return mp_naps; 997} 998 999 1000/* 1001 * This function starts the AP (application processor) identified 1002 * by the APIC ID 'physicalCpu'. It does quite a "song and dance" 1003 * to accomplish this. This is necessary because of the nuances 1004 * of the different hardware we might encounter. It isn't pretty, 1005 * but it seems to work. 
1006 */ 1007static int 1008start_ap(int apic_id) 1009{ 1010 int vector, ms; 1011 int cpus; 1012 1013 /* calculate the vector */ 1014 vector = (boot_address >> 12) & 0xff; 1015 1016 /* used as a watchpoint to signal AP startup */ 1017 cpus = mp_naps; 1018 1019 ipi_startup(apic_id, vector); 1020 1021 /* Wait up to 5 seconds for it to start. */ 1022 for (ms = 0; ms < 5000; ms++) { 1023 if (mp_naps > cpus) 1024 return 1; /* return SUCCESS */ 1025 DELAY(1000); 1026 } 1027 return 0; /* return FAILURE */ 1028} 1029 1030#ifdef COUNT_XINVLTLB_HITS 1031u_int xhits_gbl[MAXCPU]; 1032u_int xhits_pg[MAXCPU]; 1033u_int xhits_rng[MAXCPU]; 1034static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, ""); 1035SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl, 1036 sizeof(xhits_gbl), "IU", ""); 1037SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg, 1038 sizeof(xhits_pg), "IU", ""); 1039SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng, 1040 sizeof(xhits_rng), "IU", ""); 1041 1042u_int ipi_global; 1043u_int ipi_page; 1044u_int ipi_range; 1045u_int ipi_range_size; 1046SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, ""); 1047SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, ""); 1048SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, ""); 1049SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, 1050 &ipi_range_size, 0, ""); 1051 1052u_int ipi_masked_global; 1053u_int ipi_masked_page; 1054u_int ipi_masked_range; 1055u_int ipi_masked_range_size; 1056SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW, 1057 &ipi_masked_global, 0, ""); 1058SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW, 1059 &ipi_masked_page, 0, ""); 1060SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW, 1061 &ipi_masked_range, 0, ""); 1062SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW, 1063 &ipi_masked_range_size, 0, 
""); 1064#endif /* COUNT_XINVLTLB_HITS */ 1065 1066/* 1067 * Init and startup IPI. 1068 */ 1069void 1070ipi_startup(int apic_id, int vector) 1071{ 1072 1073 /* 1074 * This attempts to follow the algorithm described in the 1075 * Intel Multiprocessor Specification v1.4 in section B.4. 1076 * For each IPI, we allow the local APIC ~20us to deliver the 1077 * IPI. If that times out, we panic. 1078 */ 1079 1080 /* 1081 * first we do an INIT IPI: this INIT IPI might be run, resetting 1082 * and running the target CPU. OR this INIT IPI might be latched (P5 1083 * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be 1084 * ignored. 1085 */ 1086 lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL | 1087 APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id); 1088 lapic_ipi_wait(100); 1089 1090 /* Explicitly deassert the INIT IPI. */ 1091 lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL | 1092 APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, 1093 apic_id); 1094 1095 DELAY(10000); /* wait ~10mS */ 1096 1097 /* 1098 * next we do a STARTUP IPI: the previous INIT IPI might still be 1099 * latched, (P5 bug) this 1st STARTUP would then terminate 1100 * immediately, and the previously started INIT IPI would continue. OR 1101 * the previous INIT IPI has already run. and this STARTUP IPI will 1102 * run. OR the previous INIT IPI was ignored. and this STARTUP IPI 1103 * will run. 1104 */ 1105 lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | 1106 APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | 1107 vector, apic_id); 1108 if (!lapic_ipi_wait(100)) 1109 panic("Failed to deliver first STARTUP IPI to APIC %d", 1110 apic_id); 1111 DELAY(200); /* wait ~200uS */ 1112 1113 /* 1114 * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF 1115 * the previous STARTUP IPI was cancelled by a latched INIT IPI. 
OR 1116 * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is 1117 * recognized after hardware RESET or INIT IPI. 1118 */ 1119 lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | 1120 APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | 1121 vector, apic_id); 1122 if (!lapic_ipi_wait(100)) 1123 panic("Failed to deliver second STARTUP IPI to APIC %d", 1124 apic_id); 1125 1126 DELAY(200); /* wait ~200uS */ 1127} 1128 1129/* 1130 * Send an IPI to specified CPU handling the bitmap logic. 1131 */ 1132static void 1133ipi_send_cpu(int cpu, u_int ipi) 1134{ 1135 u_int bitmap, old_pending, new_pending; 1136 1137 KASSERT(cpu_apic_ids[cpu] != -1, ("IPI to non-existent CPU %d", cpu)); 1138 1139 if (IPI_IS_BITMAPED(ipi)) { 1140 bitmap = 1 << ipi; 1141 ipi = IPI_BITMAP_VECTOR; 1142 do { 1143 old_pending = cpu_ipi_pending[cpu]; 1144 new_pending = old_pending | bitmap; 1145 } while (!atomic_cmpset_int(&cpu_ipi_pending[cpu], 1146 old_pending, new_pending)); 1147 if (old_pending) 1148 return; 1149 } 1150 cpu_ops.ipi_vectored(ipi, cpu_apic_ids[cpu]); 1151} 1152 1153/* 1154 * Flush the TLB on all other CPU's 1155 */ 1156static void 1157smp_tlb_shootdown(u_int vector, pmap_t pmap, vm_offset_t addr1, 1158 vm_offset_t addr2) 1159{ 1160 u_int ncpu; 1161 1162 ncpu = mp_ncpus - 1; /* does not shootdown self */ 1163 if (ncpu < 1) 1164 return; /* no other cpus */ 1165 if (!(read_rflags() & PSL_I)) 1166 panic("%s: interrupts disabled", __func__); 1167 mtx_lock_spin(&smp_ipi_mtx); 1168 smp_tlb_invpcid.addr = addr1; 1169 if (pmap == NULL) { 1170 smp_tlb_invpcid.pcid = 0; 1171 } else { 1172 smp_tlb_invpcid.pcid = pmap->pm_pcid; 1173 pcid_cr3 = pmap->pm_cr3; 1174 } 1175 smp_tlb_addr2 = addr2; 1176 smp_tlb_pmap = pmap; 1177 atomic_store_rel_int(&smp_tlb_wait, 0); 1178 ipi_all_but_self(vector); 1179 while (smp_tlb_wait < ncpu) 1180 ia32_pause(); 1181 mtx_unlock_spin(&smp_ipi_mtx); 1182} 1183 1184static void 1185smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap, 
1186 vm_offset_t addr1, vm_offset_t addr2) 1187{ 1188 int cpu, ncpu, othercpus; 1189 1190 othercpus = mp_ncpus - 1; 1191 if (CPU_ISFULLSET(&mask)) { 1192 if (othercpus < 1) 1193 return; 1194 } else { 1195 CPU_CLR(PCPU_GET(cpuid), &mask); 1196 if (CPU_EMPTY(&mask)) 1197 return; 1198 } 1199 if (!(read_rflags() & PSL_I)) 1200 panic("%s: interrupts disabled", __func__); 1201 mtx_lock_spin(&smp_ipi_mtx); 1202 smp_tlb_invpcid.addr = addr1; 1203 if (pmap == NULL) { 1204 smp_tlb_invpcid.pcid = 0; 1205 } else { 1206 smp_tlb_invpcid.pcid = pmap->pm_pcid; 1207 pcid_cr3 = pmap->pm_cr3; 1208 } 1209 smp_tlb_addr2 = addr2; 1210 smp_tlb_pmap = pmap; 1211 atomic_store_rel_int(&smp_tlb_wait, 0); 1212 if (CPU_ISFULLSET(&mask)) { 1213 ncpu = othercpus; 1214 ipi_all_but_self(vector); 1215 } else { 1216 ncpu = 0; 1217 while ((cpu = CPU_FFS(&mask)) != 0) { 1218 cpu--; 1219 CPU_CLR(cpu, &mask); 1220 CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, 1221 cpu, vector); 1222 ipi_send_cpu(cpu, vector); 1223 ncpu++; 1224 } 1225 } 1226 while (smp_tlb_wait < ncpu) 1227 ia32_pause(); 1228 mtx_unlock_spin(&smp_ipi_mtx); 1229} 1230 1231void 1232smp_cache_flush(void) 1233{ 1234 1235 if (smp_started) 1236 smp_tlb_shootdown(IPI_INVLCACHE, NULL, 0, 0); 1237} 1238 1239void 1240smp_invltlb(pmap_t pmap) 1241{ 1242 1243 if (smp_started) { 1244 smp_tlb_shootdown(IPI_INVLTLB, pmap, 0, 0); 1245#ifdef COUNT_XINVLTLB_HITS 1246 ipi_global++; 1247#endif 1248 } 1249} 1250 1251void 1252smp_invlpg(pmap_t pmap, vm_offset_t addr) 1253{ 1254 1255 if (smp_started) { 1256 smp_tlb_shootdown(IPI_INVLPG, pmap, addr, 0); 1257#ifdef COUNT_XINVLTLB_HITS 1258 ipi_page++; 1259#endif 1260 } 1261} 1262 1263void 1264smp_invlpg_range(pmap_t pmap, vm_offset_t addr1, vm_offset_t addr2) 1265{ 1266 1267 if (smp_started) { 1268 smp_tlb_shootdown(IPI_INVLRNG, pmap, addr1, addr2); 1269#ifdef COUNT_XINVLTLB_HITS 1270 ipi_range++; 1271 ipi_range_size += (addr2 - addr1) / PAGE_SIZE; 1272#endif 1273 } 1274} 1275 1276void 
1277smp_masked_invltlb(cpuset_t mask, pmap_t pmap) 1278{ 1279 1280 if (smp_started) { 1281 smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, pmap, 0, 0); 1282#ifdef COUNT_XINVLTLB_HITS 1283 ipi_masked_global++; 1284#endif 1285 } 1286} 1287 1288void 1289smp_masked_invlpg(cpuset_t mask, pmap_t pmap, vm_offset_t addr) 1290{ 1291 1292 if (smp_started) { 1293 smp_targeted_tlb_shootdown(mask, IPI_INVLPG, pmap, addr, 0); 1294#ifdef COUNT_XINVLTLB_HITS 1295 ipi_masked_page++; 1296#endif 1297 } 1298} 1299 1300void 1301smp_masked_invlpg_range(cpuset_t mask, pmap_t pmap, vm_offset_t addr1, 1302 vm_offset_t addr2) 1303{ 1304 1305 if (smp_started) { 1306 smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, pmap, addr1, 1307 addr2); 1308#ifdef COUNT_XINVLTLB_HITS 1309 ipi_masked_range++; 1310 ipi_masked_range_size += (addr2 - addr1) / PAGE_SIZE; 1311#endif 1312 } 1313} 1314 1315void 1316ipi_bitmap_handler(struct trapframe frame) 1317{ 1318 struct trapframe *oldframe; 1319 struct thread *td; 1320 int cpu = PCPU_GET(cpuid); 1321 u_int ipi_bitmap; 1322 1323 critical_enter(); 1324 td = curthread; 1325 td->td_intr_nesting_level++; 1326 oldframe = td->td_intr_frame; 1327 td->td_intr_frame = &frame; 1328 ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]); 1329 if (ipi_bitmap & (1 << IPI_PREEMPT)) { 1330#ifdef COUNT_IPIS 1331 (*ipi_preempt_counts[cpu])++; 1332#endif 1333 sched_preempt(td); 1334 } 1335 if (ipi_bitmap & (1 << IPI_AST)) { 1336#ifdef COUNT_IPIS 1337 (*ipi_ast_counts[cpu])++; 1338#endif 1339 /* Nothing to do for AST */ 1340 } 1341 if (ipi_bitmap & (1 << IPI_HARDCLOCK)) { 1342#ifdef COUNT_IPIS 1343 (*ipi_hardclock_counts[cpu])++; 1344#endif 1345 hardclockintr(); 1346 } 1347 td->td_intr_frame = oldframe; 1348 td->td_intr_nesting_level--; 1349 critical_exit(); 1350} 1351 1352/* 1353 * send an IPI to a set of cpus. 
1354 */ 1355void 1356ipi_selected(cpuset_t cpus, u_int ipi) 1357{ 1358 int cpu; 1359 1360 /* 1361 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit 1362 * of help in order to understand what is the source. 1363 * Set the mask of receiving CPUs for this purpose. 1364 */ 1365 if (ipi == IPI_STOP_HARD) 1366 CPU_OR_ATOMIC(&ipi_nmi_pending, &cpus); 1367 1368 while ((cpu = CPU_FFS(&cpus)) != 0) { 1369 cpu--; 1370 CPU_CLR(cpu, &cpus); 1371 CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); 1372 ipi_send_cpu(cpu, ipi); 1373 } 1374} 1375 1376/* 1377 * send an IPI to a specific CPU. 1378 */ 1379void 1380ipi_cpu(int cpu, u_int ipi) 1381{ 1382 1383 /* 1384 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit 1385 * of help in order to understand what is the source. 1386 * Set the mask of receiving CPUs for this purpose. 1387 */ 1388 if (ipi == IPI_STOP_HARD) 1389 CPU_SET_ATOMIC(cpu, &ipi_nmi_pending); 1390 1391 CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); 1392 ipi_send_cpu(cpu, ipi); 1393} 1394 1395/* 1396 * send an IPI to all CPUs EXCEPT myself 1397 */ 1398void 1399ipi_all_but_self(u_int ipi) 1400{ 1401 cpuset_t other_cpus; 1402 1403 other_cpus = all_cpus; 1404 CPU_CLR(PCPU_GET(cpuid), &other_cpus); 1405 1406 if (IPI_IS_BITMAPED(ipi)) { 1407 ipi_selected(other_cpus, ipi); 1408 return; 1409 } 1410 1411 /* 1412 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit 1413 * of help in order to understand what is the source. 1414 * Set the mask of receiving CPUs for this purpose. 
1415 */ 1416 if (ipi == IPI_STOP_HARD) 1417 CPU_OR_ATOMIC(&ipi_nmi_pending, &other_cpus); 1418 1419 CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); 1420 cpu_ops.ipi_vectored(ipi, APIC_IPI_DEST_OTHERS); 1421} 1422 1423int 1424ipi_nmi_handler() 1425{ 1426 u_int cpuid; 1427 1428 /* 1429 * As long as there is not a simple way to know about a NMI's 1430 * source, if the bitmask for the current CPU is present in 1431 * the global pending bitword an IPI_STOP_HARD has been issued 1432 * and should be handled. 1433 */ 1434 cpuid = PCPU_GET(cpuid); 1435 if (!CPU_ISSET(cpuid, &ipi_nmi_pending)) 1436 return (1); 1437 1438 CPU_CLR_ATOMIC(cpuid, &ipi_nmi_pending); 1439 cpustop_handler(); 1440 return (0); 1441} 1442 1443/* 1444 * Handle an IPI_STOP by saving our current context and spinning until we 1445 * are resumed. 1446 */ 1447void 1448cpustop_handler(void) 1449{ 1450 u_int cpu; 1451 1452 cpu = PCPU_GET(cpuid); 1453 1454 savectx(&stoppcbs[cpu]); 1455 1456 /* Indicate that we are stopped */ 1457 CPU_SET_ATOMIC(cpu, &stopped_cpus); 1458 1459 /* Wait for restart */ 1460 while (!CPU_ISSET(cpu, &started_cpus)) 1461 ia32_pause(); 1462 1463 CPU_CLR_ATOMIC(cpu, &started_cpus); 1464 CPU_CLR_ATOMIC(cpu, &stopped_cpus); 1465 1466#ifdef DDB 1467 amd64_db_resume_dbreg(); 1468#endif 1469 1470 if (cpu == 0 && cpustop_restartfunc != NULL) { 1471 cpustop_restartfunc(); 1472 cpustop_restartfunc = NULL; 1473 } 1474} 1475 1476/* 1477 * Handle an IPI_SUSPEND by saving our current context and spinning until we 1478 * are resumed. 
1479 */ 1480void 1481cpususpend_handler(void) 1482{ 1483 u_int cpu; 1484 1485 mtx_assert(&smp_ipi_mtx, MA_NOTOWNED); 1486 1487 cpu = PCPU_GET(cpuid); 1488 if (savectx(&susppcbs[cpu]->sp_pcb)) { 1489 fpususpend(susppcbs[cpu]->sp_fpususpend); 1490 wbinvd(); 1491 CPU_SET_ATOMIC(cpu, &suspended_cpus); 1492 /* 1493 * Hack for xen, which does not use resumectx() so never 1494 * uses the next clause: set resuming_cpus early so that 1495 * resume_cpus() can wait on the same bitmap for acpi and 1496 * xen. resuming_cpus now means eventually_resumable_cpus. 1497 */ 1498 CPU_SET_ATOMIC(cpu, &resuming_cpus); 1499 } else { 1500 fpuresume(susppcbs[cpu]->sp_fpususpend); 1501 pmap_init_pat(); 1502 initializecpu(); 1503 PCPU_SET(switchtime, 0); 1504 PCPU_SET(switchticks, ticks); 1505 1506 /* Indicate that we are resuming */ 1507 CPU_CLR_ATOMIC(cpu, &suspended_cpus); 1508 } 1509 1510 /* Wait for resume directive */ 1511 while (!CPU_ISSET(cpu, &toresume_cpus)) 1512 ia32_pause(); 1513 1514 if (cpu_ops.cpu_resume) 1515 cpu_ops.cpu_resume(); 1516 if (vmm_resume_p) 1517 vmm_resume_p(); 1518 1519 /* Resume MCA and local APIC */ 1520 mca_resume(); 1521 lapic_setup(0); 1522 1523 /* Indicate that we are resumed */ 1524 CPU_CLR_ATOMIC(cpu, &resuming_cpus); 1525 CPU_CLR_ATOMIC(cpu, &suspended_cpus); 1526 CPU_CLR_ATOMIC(cpu, &toresume_cpus); 1527} 1528 1529/* 1530 * Handlers for TLB related IPIs 1531 */ 1532void 1533invltlb_handler(void) 1534{ 1535#ifdef COUNT_XINVLTLB_HITS 1536 xhits_gbl[PCPU_GET(cpuid)]++; 1537#endif /* COUNT_XINVLTLB_HITS */ 1538#ifdef COUNT_IPIS 1539 (*ipi_invltlb_counts[PCPU_GET(cpuid)])++; 1540#endif /* COUNT_IPIS */ 1541 1542 invltlb(); 1543 atomic_add_int(&smp_tlb_wait, 1); 1544} 1545 1546void 1547invltlb_pcid_handler(void) 1548{ 1549 uint64_t cr3; 1550 u_int cpuid; 1551#ifdef COUNT_XINVLTLB_HITS 1552 xhits_gbl[PCPU_GET(cpuid)]++; 1553#endif /* COUNT_XINVLTLB_HITS */ 1554#ifdef COUNT_IPIS 1555 (*ipi_invltlb_counts[PCPU_GET(cpuid)])++; 1556#endif /* COUNT_IPIS */ 1557 
1558 if (smp_tlb_invpcid.pcid != (uint64_t)-1 && 1559 smp_tlb_invpcid.pcid != 0) { 1560 if (invpcid_works) { 1561 invpcid(&smp_tlb_invpcid, INVPCID_CTX); 1562 } else { 1563 /* Otherwise reload %cr3 twice. */ 1564 cr3 = rcr3(); 1565 if (cr3 != pcid_cr3) { 1566 load_cr3(pcid_cr3); 1567 cr3 |= CR3_PCID_SAVE; 1568 } 1569 load_cr3(cr3); 1570 } 1571 } else { 1572 invltlb_globpcid(); 1573 } 1574 if (smp_tlb_pmap != NULL) { 1575 cpuid = PCPU_GET(cpuid); 1576 if (!CPU_ISSET(cpuid, &smp_tlb_pmap->pm_active)) 1577 CPU_CLR_ATOMIC(cpuid, &smp_tlb_pmap->pm_save); 1578 } 1579 1580 atomic_add_int(&smp_tlb_wait, 1); 1581} 1582 1583void 1584invlpg_handler(void) 1585{ 1586#ifdef COUNT_XINVLTLB_HITS 1587 xhits_pg[PCPU_GET(cpuid)]++; 1588#endif /* COUNT_XINVLTLB_HITS */ 1589#ifdef COUNT_IPIS 1590 (*ipi_invlpg_counts[PCPU_GET(cpuid)])++; 1591#endif /* COUNT_IPIS */ 1592 1593 invlpg(smp_tlb_invpcid.addr); 1594 atomic_add_int(&smp_tlb_wait, 1); 1595} 1596 1597void 1598invlpg_pcid_handler(void) 1599{ 1600 uint64_t cr3; 1601#ifdef COUNT_XINVLTLB_HITS 1602 xhits_pg[PCPU_GET(cpuid)]++; 1603#endif /* COUNT_XINVLTLB_HITS */ 1604#ifdef COUNT_IPIS 1605 (*ipi_invlpg_counts[PCPU_GET(cpuid)])++; 1606#endif /* COUNT_IPIS */ 1607 1608 if (smp_tlb_invpcid.pcid == (uint64_t)-1) { 1609 invltlb_globpcid(); 1610 } else if (smp_tlb_invpcid.pcid == 0) { 1611 invlpg(smp_tlb_invpcid.addr); 1612 } else if (invpcid_works) { 1613 invpcid(&smp_tlb_invpcid, INVPCID_ADDR); 1614 } else { 1615 /* 1616 * PCID supported, but INVPCID is not. 1617 * Temporarily switch to the target address 1618 * space and do INVLPG. 
1619 */ 1620 cr3 = rcr3(); 1621 if (cr3 != pcid_cr3) 1622 load_cr3(pcid_cr3 | CR3_PCID_SAVE); 1623 invlpg(smp_tlb_invpcid.addr); 1624 load_cr3(cr3 | CR3_PCID_SAVE); 1625 } 1626 1627 atomic_add_int(&smp_tlb_wait, 1); 1628} 1629 1630static inline void 1631invlpg_range(vm_offset_t start, vm_offset_t end) 1632{ 1633 1634 do { 1635 invlpg(start); 1636 start += PAGE_SIZE; 1637 } while (start < end); 1638} 1639 1640void 1641invlrng_handler(void) 1642{ 1643 struct invpcid_descr d; 1644 vm_offset_t addr; 1645 uint64_t cr3; 1646 u_int cpuid; 1647#ifdef COUNT_XINVLTLB_HITS 1648 xhits_rng[PCPU_GET(cpuid)]++; 1649#endif /* COUNT_XINVLTLB_HITS */ 1650#ifdef COUNT_IPIS 1651 (*ipi_invlrng_counts[PCPU_GET(cpuid)])++; 1652#endif /* COUNT_IPIS */ 1653 1654 addr = smp_tlb_invpcid.addr; 1655 if (pmap_pcid_enabled) { 1656 if (smp_tlb_invpcid.pcid == 0) { 1657 /* 1658 * kernel pmap - use invlpg to invalidate 1659 * global mapping. 1660 */ 1661 invlpg_range(addr, smp_tlb_addr2); 1662 } else if (smp_tlb_invpcid.pcid == (uint64_t)-1) { 1663 invltlb_globpcid(); 1664 if (smp_tlb_pmap != NULL) { 1665 cpuid = PCPU_GET(cpuid); 1666 if (!CPU_ISSET(cpuid, &smp_tlb_pmap->pm_active)) 1667 CPU_CLR_ATOMIC(cpuid, 1668 &smp_tlb_pmap->pm_save); 1669 } 1670 } else if (invpcid_works) { 1671 d = smp_tlb_invpcid; 1672 do { 1673 invpcid(&d, INVPCID_ADDR); 1674 d.addr += PAGE_SIZE; 1675 } while (d.addr <= smp_tlb_addr2); 1676 } else { 1677 cr3 = rcr3(); 1678 if (cr3 != pcid_cr3) 1679 load_cr3(pcid_cr3 | CR3_PCID_SAVE); 1680 invlpg_range(addr, smp_tlb_addr2); 1681 load_cr3(cr3 | CR3_PCID_SAVE); 1682 } 1683 } else { 1684 invlpg_range(addr, smp_tlb_addr2); 1685 } 1686 1687 atomic_add_int(&smp_tlb_wait, 1); 1688} 1689 1690void 1691invlcache_handler(void) 1692{ 1693#ifdef COUNT_IPIS 1694 (*ipi_invlcache_counts[PCPU_GET(cpuid)])++; 1695#endif /* COUNT_IPIS */ 1696 1697 wbinvd(); 1698 atomic_add_int(&smp_tlb_wait, 1); 1699} 1700 1701/* 1702 * This is called once the rest of the system is up and running and we're 1703 
* ready to let the AP's out of the pen.
 */
static void
release_aps(void *dummy __unused)
{

	/* Nothing to release on a uniprocessor system. */
	if (mp_ncpus == 1)
		return;
	atomic_store_rel_int(&aps_ready, 1);
	/* Spin until smp_started becomes non-zero. */
	while (smp_started == 0)
		ia32_pause();
}
SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);

#ifdef COUNT_IPIS
/*
 * Setup interrupt counters for IPI handlers.
 */
static void
mp_ipi_intrcnt(void *dummy __unused)
{
	char buf[64];
	int i;

	/* Register one named counter per IPI type for each CPU. */
	CPU_FOREACH(i) {
		snprintf(buf, sizeof(buf), "cpu%d:invltlb", i);
		intrcnt_add(buf, &ipi_invltlb_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:invlrng", i);
		intrcnt_add(buf, &ipi_invlrng_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:invlpg", i);
		intrcnt_add(buf, &ipi_invlpg_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:invlcache", i);
		intrcnt_add(buf, &ipi_invlcache_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:preempt", i);
		intrcnt_add(buf, &ipi_preempt_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:ast", i);
		intrcnt_add(buf, &ipi_ast_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i);
		intrcnt_add(buf, &ipi_rendezvous_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:hardclock", i);
		intrcnt_add(buf, &ipi_hardclock_counts[i]);
	}
}
SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL);
#endif