/* machdep.c revision 272913 */
1/*- 2 * Copyright (c) 1992 Terrence R. Lambert. 3 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. 4 * All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * William Jolitz. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 3. All advertising materials mentioning features or use of this software 18 * must display the following acknowledgement: 19 * This product includes software developed by the University of 20 * California, Berkeley and its contributors. 21 * 4. Neither the name of the University nor the names of its contributors 22 * may be used to endorse or promote products derived from this software 23 * without specific prior written permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 * SUCH DAMAGE. 36 * 37 * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 38 */ 39 40#include <sys/cdefs.h> 41__FBSDID("$FreeBSD: stable/10/sys/i386/i386/machdep.c 272913 2014-10-10 20:47:23Z jhb $"); 42 43#include "opt_apic.h" 44#include "opt_atalk.h" 45#include "opt_atpic.h" 46#include "opt_compat.h" 47#include "opt_cpu.h" 48#include "opt_ddb.h" 49#include "opt_inet.h" 50#include "opt_ipx.h" 51#include "opt_isa.h" 52#include "opt_kstack_pages.h" 53#include "opt_maxmem.h" 54#include "opt_mp_watchdog.h" 55#include "opt_npx.h" 56#include "opt_perfmon.h" 57#include "opt_platform.h" 58#include "opt_xbox.h" 59#include "opt_kdtrace.h" 60 61#include <sys/param.h> 62#include <sys/proc.h> 63#include <sys/systm.h> 64#include <sys/bio.h> 65#include <sys/buf.h> 66#include <sys/bus.h> 67#include <sys/callout.h> 68#include <sys/cons.h> 69#include <sys/cpu.h> 70#include <sys/eventhandler.h> 71#include <sys/exec.h> 72#include <sys/imgact.h> 73#include <sys/kdb.h> 74#include <sys/kernel.h> 75#include <sys/ktr.h> 76#include <sys/linker.h> 77#include <sys/lock.h> 78#include <sys/malloc.h> 79#include <sys/memrange.h> 80#include <sys/msgbuf.h> 81#include <sys/mutex.h> 82#include <sys/pcpu.h> 83#include <sys/ptrace.h> 84#include <sys/reboot.h> 85#include <sys/rwlock.h> 86#include <sys/sched.h> 87#include <sys/signalvar.h> 88#ifdef SMP 89#include <sys/smp.h> 90#endif 91#include <sys/syscallsubr.h> 92#include <sys/sysctl.h> 93#include <sys/sysent.h> 94#include <sys/sysproto.h> 
95#include <sys/ucontext.h> 96#include <sys/vmmeter.h> 97 98#include <vm/vm.h> 99#include <vm/vm_extern.h> 100#include <vm/vm_kern.h> 101#include <vm/vm_page.h> 102#include <vm/vm_map.h> 103#include <vm/vm_object.h> 104#include <vm/vm_pager.h> 105#include <vm/vm_param.h> 106 107#ifdef DDB 108#ifndef KDB 109#error KDB must be enabled in order for DDB to work! 110#endif 111#include <ddb/ddb.h> 112#include <ddb/db_sym.h> 113#endif 114 115#include <isa/rtc.h> 116 117#include <net/netisr.h> 118 119#include <machine/bootinfo.h> 120#include <machine/clock.h> 121#include <machine/cpu.h> 122#include <machine/cputypes.h> 123#include <machine/intr_machdep.h> 124#include <x86/mca.h> 125#include <machine/md_var.h> 126#include <machine/metadata.h> 127#include <machine/mp_watchdog.h> 128#include <machine/pc/bios.h> 129#include <machine/pcb.h> 130#include <machine/pcb_ext.h> 131#include <machine/proc.h> 132#include <machine/reg.h> 133#include <machine/sigframe.h> 134#include <machine/specialreg.h> 135#include <machine/vm86.h> 136#ifdef PERFMON 137#include <machine/perfmon.h> 138#endif 139#ifdef SMP 140#include <machine/smp.h> 141#endif 142#ifdef FDT 143#include <x86/fdt.h> 144#endif 145 146#ifdef DEV_APIC 147#include <machine/apicvar.h> 148#endif 149 150#ifdef DEV_ISA 151#include <x86/isa/icu.h> 152#endif 153 154#ifdef XBOX 155#include <machine/xbox.h> 156 157int arch_i386_is_xbox = 0; 158uint32_t arch_i386_xbox_memsize = 0; 159#endif 160 161#ifdef XEN 162/* XEN includes */ 163#include <xen/xen-os.h> 164#include <xen/hypervisor.h> 165#include <machine/xen/xenvar.h> 166#include <machine/xen/xenfunc.h> 167#include <xen/xen_intr.h> 168 169void Xhypervisor_callback(void); 170void failsafe_callback(void); 171 172extern trap_info_t trap_table[]; 173struct proc_ldt default_proc_ldt; 174extern int init_first; 175int running_xen = 1; 176extern unsigned long physfree; 177#endif /* XEN */ 178 179/* Sanity check for __curthread() */ 180CTASSERT(offsetof(struct pcpu, pc_curthread) == 0); 181 
182extern void init386(int first); 183extern void dblfault_handler(void); 184 185extern void printcpuinfo(void); /* XXX header file */ 186extern void finishidentcpu(void); 187extern void panicifcpuunsupported(void); 188 189#define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) 190#define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) 191 192#if !defined(CPU_DISABLE_SSE) && defined(I686_CPU) 193#define CPU_ENABLE_SSE 194#endif 195 196static void cpu_startup(void *); 197static void fpstate_drop(struct thread *td); 198static void get_fpcontext(struct thread *td, mcontext_t *mcp); 199static int set_fpcontext(struct thread *td, const mcontext_t *mcp); 200#ifdef CPU_ENABLE_SSE 201static void set_fpregs_xmm(struct save87 *, struct savexmm *); 202static void fill_fpregs_xmm(struct savexmm *, struct save87 *); 203#endif /* CPU_ENABLE_SSE */ 204SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL); 205 206#ifdef DDB 207extern vm_offset_t ksym_start, ksym_end; 208#endif 209 210/* Intel ICH registers */ 211#define ICH_PMBASE 0x400 212#define ICH_SMI_EN ICH_PMBASE + 0x30 213 214int _udatasel, _ucodesel; 215u_int basemem; 216 217int cold = 1; 218 219#ifdef COMPAT_43 220static void osendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask); 221#endif 222#ifdef COMPAT_FREEBSD4 223static void freebsd4_sendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask); 224#endif 225 226long Maxmem = 0; 227long realmem = 0; 228 229#ifdef PAE 230FEATURE(pae, "Physical Address Extensions"); 231#endif 232 233/* 234 * The number of PHYSMAP entries must be one less than the number of 235 * PHYSSEG entries because the PHYSMAP entry that spans the largest 236 * physical address that is accessible by ISA DMA is split into two 237 * PHYSSEG entries. 
238 */ 239#define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1)) 240 241vm_paddr_t phys_avail[PHYSMAP_SIZE + 2]; 242vm_paddr_t dump_avail[PHYSMAP_SIZE + 2]; 243 244/* must be 2 less so 0 0 can signal end of chunks */ 245#define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(phys_avail[0])) - 2) 246#define DUMP_AVAIL_ARRAY_END ((sizeof(dump_avail) / sizeof(dump_avail[0])) - 2) 247 248struct kva_md_info kmi; 249 250static struct trapframe proc0_tf; 251struct pcpu __pcpu[MAXCPU]; 252 253struct mtx icu_lock; 254 255struct mem_range_softc mem_range_softc; 256 257static void 258cpu_startup(dummy) 259 void *dummy; 260{ 261 uintmax_t memsize; 262 char *sysenv; 263 264 /* 265 * On MacBooks, we need to disallow the legacy USB circuit to 266 * generate an SMI# because this can cause several problems, 267 * namely: incorrect CPU frequency detection and failure to 268 * start the APs. 269 * We do this by disabling a bit in the SMI_EN (SMI Control and 270 * Enable register) of the Intel ICH LPC Interface Bridge. 271 */ 272 sysenv = getenv("smbios.system.product"); 273 if (sysenv != NULL) { 274 if (strncmp(sysenv, "MacBook1,1", 10) == 0 || 275 strncmp(sysenv, "MacBook3,1", 10) == 0 || 276 strncmp(sysenv, "MacBook4,1", 10) == 0 || 277 strncmp(sysenv, "MacBookPro1,1", 13) == 0 || 278 strncmp(sysenv, "MacBookPro1,2", 13) == 0 || 279 strncmp(sysenv, "MacBookPro3,1", 13) == 0 || 280 strncmp(sysenv, "MacBookPro4,1", 13) == 0 || 281 strncmp(sysenv, "Macmini1,1", 10) == 0) { 282 if (bootverbose) 283 printf("Disabling LEGACY_USB_EN bit on " 284 "Intel ICH.\n"); 285 outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8); 286 } 287 freeenv(sysenv); 288 } 289 290 /* 291 * Good {morning,afternoon,evening,night}. 292 */ 293 startrtclock(); 294 printcpuinfo(); 295 panicifcpuunsupported(); 296#ifdef PERFMON 297 perfmon_init(); 298#endif 299 300 /* 301 * Display physical memory if SMBIOS reports reasonable amount. 
302 */ 303 memsize = 0; 304 sysenv = getenv("smbios.memory.enabled"); 305 if (sysenv != NULL) { 306 memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10; 307 freeenv(sysenv); 308 } 309 if (memsize < ptoa((uintmax_t)cnt.v_free_count)) 310 memsize = ptoa((uintmax_t)Maxmem); 311 printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20); 312 realmem = atop(memsize); 313 314 /* 315 * Display any holes after the first chunk of extended memory. 316 */ 317 if (bootverbose) { 318 int indx; 319 320 printf("Physical memory chunk(s):\n"); 321 for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { 322 vm_paddr_t size; 323 324 size = phys_avail[indx + 1] - phys_avail[indx]; 325 printf( 326 "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n", 327 (uintmax_t)phys_avail[indx], 328 (uintmax_t)phys_avail[indx + 1] - 1, 329 (uintmax_t)size, (uintmax_t)size / PAGE_SIZE); 330 } 331 } 332 333 vm_ksubmap_init(&kmi); 334 335 printf("avail memory = %ju (%ju MB)\n", 336 ptoa((uintmax_t)cnt.v_free_count), 337 ptoa((uintmax_t)cnt.v_free_count) / 1048576); 338 339 /* 340 * Set up buffers, so they can be used to read disk labels. 341 */ 342 bufinit(); 343 vm_pager_bufferinit(); 344#ifndef XEN 345 cpu_setregs(); 346#endif 347} 348 349/* 350 * Send an interrupt to process. 351 * 352 * Stack is set up to allow sigcode stored 353 * at top to call routine, followed by kcall 354 * to sigreturn routine below. After sigreturn 355 * resets the signal mask, the stack, and the 356 * frame pointer, it returns to the user 357 * specified pc, psl. 
358 */ 359#ifdef COMPAT_43 360static void 361osendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) 362{ 363 struct osigframe sf, *fp; 364 struct proc *p; 365 struct thread *td; 366 struct sigacts *psp; 367 struct trapframe *regs; 368 int sig; 369 int oonstack; 370 371 td = curthread; 372 p = td->td_proc; 373 PROC_LOCK_ASSERT(p, MA_OWNED); 374 sig = ksi->ksi_signo; 375 psp = p->p_sigacts; 376 mtx_assert(&psp->ps_mtx, MA_OWNED); 377 regs = td->td_frame; 378 oonstack = sigonstack(regs->tf_esp); 379 380 /* Allocate space for the signal handler context. */ 381 if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && 382 SIGISMEMBER(psp->ps_sigonstack, sig)) { 383 fp = (struct osigframe *)(td->td_sigstk.ss_sp + 384 td->td_sigstk.ss_size - sizeof(struct osigframe)); 385#if defined(COMPAT_43) 386 td->td_sigstk.ss_flags |= SS_ONSTACK; 387#endif 388 } else 389 fp = (struct osigframe *)regs->tf_esp - 1; 390 391 /* Translate the signal if appropriate. */ 392 if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) 393 sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; 394 395 /* Build the argument list for the signal handler. */ 396 sf.sf_signum = sig; 397 sf.sf_scp = (register_t)&fp->sf_siginfo.si_sc; 398 bzero(&sf.sf_siginfo, sizeof(sf.sf_siginfo)); 399 if (SIGISMEMBER(psp->ps_siginfo, sig)) { 400 /* Signal handler installed with SA_SIGINFO. */ 401 sf.sf_arg2 = (register_t)&fp->sf_siginfo; 402 sf.sf_siginfo.si_signo = sig; 403 sf.sf_siginfo.si_code = ksi->ksi_code; 404 sf.sf_ahu.sf_action = (__osiginfohandler_t *)catcher; 405 sf.sf_addr = 0; 406 } else { 407 /* Old FreeBSD-style arguments. */ 408 sf.sf_arg2 = ksi->ksi_code; 409 sf.sf_addr = (register_t)ksi->ksi_addr; 410 sf.sf_ahu.sf_handler = catcher; 411 } 412 mtx_unlock(&psp->ps_mtx); 413 PROC_UNLOCK(p); 414 415 /* Save most if not all of trap frame. 
*/ 416 sf.sf_siginfo.si_sc.sc_eax = regs->tf_eax; 417 sf.sf_siginfo.si_sc.sc_ebx = regs->tf_ebx; 418 sf.sf_siginfo.si_sc.sc_ecx = regs->tf_ecx; 419 sf.sf_siginfo.si_sc.sc_edx = regs->tf_edx; 420 sf.sf_siginfo.si_sc.sc_esi = regs->tf_esi; 421 sf.sf_siginfo.si_sc.sc_edi = regs->tf_edi; 422 sf.sf_siginfo.si_sc.sc_cs = regs->tf_cs; 423 sf.sf_siginfo.si_sc.sc_ds = regs->tf_ds; 424 sf.sf_siginfo.si_sc.sc_ss = regs->tf_ss; 425 sf.sf_siginfo.si_sc.sc_es = regs->tf_es; 426 sf.sf_siginfo.si_sc.sc_fs = regs->tf_fs; 427 sf.sf_siginfo.si_sc.sc_gs = rgs(); 428 sf.sf_siginfo.si_sc.sc_isp = regs->tf_isp; 429 430 /* Build the signal context to be used by osigreturn(). */ 431 sf.sf_siginfo.si_sc.sc_onstack = (oonstack) ? 1 : 0; 432 SIG2OSIG(*mask, sf.sf_siginfo.si_sc.sc_mask); 433 sf.sf_siginfo.si_sc.sc_sp = regs->tf_esp; 434 sf.sf_siginfo.si_sc.sc_fp = regs->tf_ebp; 435 sf.sf_siginfo.si_sc.sc_pc = regs->tf_eip; 436 sf.sf_siginfo.si_sc.sc_ps = regs->tf_eflags; 437 sf.sf_siginfo.si_sc.sc_trapno = regs->tf_trapno; 438 sf.sf_siginfo.si_sc.sc_err = regs->tf_err; 439 440 /* 441 * If we're a vm86 process, we want to save the segment registers. 442 * We also change eflags to be our emulated eflags, not the actual 443 * eflags. 444 */ 445 if (regs->tf_eflags & PSL_VM) { 446 /* XXX confusing names: `tf' isn't a trapframe; `regs' is. */ 447 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 448 struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; 449 450 sf.sf_siginfo.si_sc.sc_gs = tf->tf_vm86_gs; 451 sf.sf_siginfo.si_sc.sc_fs = tf->tf_vm86_fs; 452 sf.sf_siginfo.si_sc.sc_es = tf->tf_vm86_es; 453 sf.sf_siginfo.si_sc.sc_ds = tf->tf_vm86_ds; 454 455 if (vm86->vm86_has_vme == 0) 456 sf.sf_siginfo.si_sc.sc_ps = 457 (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | 458 (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); 459 460 /* See sendsig() for comments. */ 461 tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); 462 } 463 464 /* 465 * Copy the sigframe out to the user's stack. 
466 */ 467 if (copyout(&sf, fp, sizeof(*fp)) != 0) { 468#ifdef DEBUG 469 printf("process %ld has trashed its stack\n", (long)p->p_pid); 470#endif 471 PROC_LOCK(p); 472 sigexit(td, SIGILL); 473 } 474 475 regs->tf_esp = (int)fp; 476 if (p->p_sysent->sv_sigcode_base != 0) { 477 regs->tf_eip = p->p_sysent->sv_sigcode_base + szsigcode - 478 szosigcode; 479 } else { 480 /* a.out sysentvec does not use shared page */ 481 regs->tf_eip = p->p_sysent->sv_psstrings - szosigcode; 482 } 483 regs->tf_eflags &= ~(PSL_T | PSL_D); 484 regs->tf_cs = _ucodesel; 485 regs->tf_ds = _udatasel; 486 regs->tf_es = _udatasel; 487 regs->tf_fs = _udatasel; 488 load_gs(_udatasel); 489 regs->tf_ss = _udatasel; 490 PROC_LOCK(p); 491 mtx_lock(&psp->ps_mtx); 492} 493#endif /* COMPAT_43 */ 494 495#ifdef COMPAT_FREEBSD4 496static void 497freebsd4_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) 498{ 499 struct sigframe4 sf, *sfp; 500 struct proc *p; 501 struct thread *td; 502 struct sigacts *psp; 503 struct trapframe *regs; 504 int sig; 505 int oonstack; 506 507 td = curthread; 508 p = td->td_proc; 509 PROC_LOCK_ASSERT(p, MA_OWNED); 510 sig = ksi->ksi_signo; 511 psp = p->p_sigacts; 512 mtx_assert(&psp->ps_mtx, MA_OWNED); 513 regs = td->td_frame; 514 oonstack = sigonstack(regs->tf_esp); 515 516 /* Save user context. */ 517 bzero(&sf, sizeof(sf)); 518 sf.sf_uc.uc_sigmask = *mask; 519 sf.sf_uc.uc_stack = td->td_sigstk; 520 sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) 521 ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; 522 sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; 523 sf.sf_uc.uc_mcontext.mc_gs = rgs(); 524 bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs)); 525 bzero(sf.sf_uc.uc_mcontext.mc_fpregs, 526 sizeof(sf.sf_uc.uc_mcontext.mc_fpregs)); 527 bzero(sf.sf_uc.uc_mcontext.__spare__, 528 sizeof(sf.sf_uc.uc_mcontext.__spare__)); 529 bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__)); 530 531 /* Allocate space for the signal handler context. 
*/ 532 if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && 533 SIGISMEMBER(psp->ps_sigonstack, sig)) { 534 sfp = (struct sigframe4 *)(td->td_sigstk.ss_sp + 535 td->td_sigstk.ss_size - sizeof(struct sigframe4)); 536#if defined(COMPAT_43) 537 td->td_sigstk.ss_flags |= SS_ONSTACK; 538#endif 539 } else 540 sfp = (struct sigframe4 *)regs->tf_esp - 1; 541 542 /* Translate the signal if appropriate. */ 543 if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) 544 sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; 545 546 /* Build the argument list for the signal handler. */ 547 sf.sf_signum = sig; 548 sf.sf_ucontext = (register_t)&sfp->sf_uc; 549 bzero(&sf.sf_si, sizeof(sf.sf_si)); 550 if (SIGISMEMBER(psp->ps_siginfo, sig)) { 551 /* Signal handler installed with SA_SIGINFO. */ 552 sf.sf_siginfo = (register_t)&sfp->sf_si; 553 sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; 554 555 /* Fill in POSIX parts */ 556 sf.sf_si.si_signo = sig; 557 sf.sf_si.si_code = ksi->ksi_code; 558 sf.sf_si.si_addr = ksi->ksi_addr; 559 } else { 560 /* Old FreeBSD-style arguments. */ 561 sf.sf_siginfo = ksi->ksi_code; 562 sf.sf_addr = (register_t)ksi->ksi_addr; 563 sf.sf_ahu.sf_handler = catcher; 564 } 565 mtx_unlock(&psp->ps_mtx); 566 PROC_UNLOCK(p); 567 568 /* 569 * If we're a vm86 process, we want to save the segment registers. 570 * We also change eflags to be our emulated eflags, not the actual 571 * eflags. 
572 */ 573 if (regs->tf_eflags & PSL_VM) { 574 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 575 struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; 576 577 sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs; 578 sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs; 579 sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es; 580 sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds; 581 582 if (vm86->vm86_has_vme == 0) 583 sf.sf_uc.uc_mcontext.mc_eflags = 584 (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | 585 (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); 586 587 /* 588 * Clear PSL_NT to inhibit T_TSSFLT faults on return from 589 * syscalls made by the signal handler. This just avoids 590 * wasting time for our lazy fixup of such faults. PSL_NT 591 * does nothing in vm86 mode, but vm86 programs can set it 592 * almost legitimately in probes for old cpu types. 593 */ 594 tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); 595 } 596 597 /* 598 * Copy the sigframe out to the user's stack. 599 */ 600 if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { 601#ifdef DEBUG 602 printf("process %ld has trashed its stack\n", (long)p->p_pid); 603#endif 604 PROC_LOCK(p); 605 sigexit(td, SIGILL); 606 } 607 608 regs->tf_esp = (int)sfp; 609 regs->tf_eip = p->p_sysent->sv_sigcode_base + szsigcode - 610 szfreebsd4_sigcode; 611 regs->tf_eflags &= ~(PSL_T | PSL_D); 612 regs->tf_cs = _ucodesel; 613 regs->tf_ds = _udatasel; 614 regs->tf_es = _udatasel; 615 regs->tf_fs = _udatasel; 616 regs->tf_ss = _udatasel; 617 PROC_LOCK(p); 618 mtx_lock(&psp->ps_mtx); 619} 620#endif /* COMPAT_FREEBSD4 */ 621 622void 623sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) 624{ 625 struct sigframe sf, *sfp; 626 struct proc *p; 627 struct thread *td; 628 struct sigacts *psp; 629 char *sp; 630 struct trapframe *regs; 631 struct segment_descriptor *sdp; 632 int sig; 633 int oonstack; 634 635 td = curthread; 636 p = td->td_proc; 637 PROC_LOCK_ASSERT(p, MA_OWNED); 638 sig = ksi->ksi_signo; 639 psp = p->p_sigacts; 640 
mtx_assert(&psp->ps_mtx, MA_OWNED); 641#ifdef COMPAT_FREEBSD4 642 if (SIGISMEMBER(psp->ps_freebsd4, sig)) { 643 freebsd4_sendsig(catcher, ksi, mask); 644 return; 645 } 646#endif 647#ifdef COMPAT_43 648 if (SIGISMEMBER(psp->ps_osigset, sig)) { 649 osendsig(catcher, ksi, mask); 650 return; 651 } 652#endif 653 regs = td->td_frame; 654 oonstack = sigonstack(regs->tf_esp); 655 656 /* Save user context. */ 657 bzero(&sf, sizeof(sf)); 658 sf.sf_uc.uc_sigmask = *mask; 659 sf.sf_uc.uc_stack = td->td_sigstk; 660 sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) 661 ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; 662 sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; 663 sf.sf_uc.uc_mcontext.mc_gs = rgs(); 664 bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs)); 665 sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */ 666 get_fpcontext(td, &sf.sf_uc.uc_mcontext); 667 fpstate_drop(td); 668 /* 669 * Unconditionally fill the fsbase and gsbase into the mcontext. 670 */ 671 sdp = &td->td_pcb->pcb_fsd; 672 sf.sf_uc.uc_mcontext.mc_fsbase = sdp->sd_hibase << 24 | 673 sdp->sd_lobase; 674 sdp = &td->td_pcb->pcb_gsd; 675 sf.sf_uc.uc_mcontext.mc_gsbase = sdp->sd_hibase << 24 | 676 sdp->sd_lobase; 677 sf.sf_uc.uc_mcontext.mc_flags = 0; 678 bzero(sf.sf_uc.uc_mcontext.mc_spare2, 679 sizeof(sf.sf_uc.uc_mcontext.mc_spare2)); 680 bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__)); 681 682 /* Allocate space for the signal handler context. */ 683 if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && 684 SIGISMEMBER(psp->ps_sigonstack, sig)) { 685 sp = td->td_sigstk.ss_sp + 686 td->td_sigstk.ss_size - sizeof(struct sigframe); 687#if defined(COMPAT_43) 688 td->td_sigstk.ss_flags |= SS_ONSTACK; 689#endif 690 } else 691 sp = (char *)regs->tf_esp - sizeof(struct sigframe); 692 /* Align to 16 bytes. */ 693 sfp = (struct sigframe *)((unsigned int)sp & ~0xF); 694 695 /* Translate the signal if appropriate. 
*/ 696 if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) 697 sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; 698 699 /* Build the argument list for the signal handler. */ 700 sf.sf_signum = sig; 701 sf.sf_ucontext = (register_t)&sfp->sf_uc; 702 bzero(&sf.sf_si, sizeof(sf.sf_si)); 703 if (SIGISMEMBER(psp->ps_siginfo, sig)) { 704 /* Signal handler installed with SA_SIGINFO. */ 705 sf.sf_siginfo = (register_t)&sfp->sf_si; 706 sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; 707 708 /* Fill in POSIX parts */ 709 sf.sf_si = ksi->ksi_info; 710 sf.sf_si.si_signo = sig; /* maybe a translated signal */ 711 } else { 712 /* Old FreeBSD-style arguments. */ 713 sf.sf_siginfo = ksi->ksi_code; 714 sf.sf_addr = (register_t)ksi->ksi_addr; 715 sf.sf_ahu.sf_handler = catcher; 716 } 717 mtx_unlock(&psp->ps_mtx); 718 PROC_UNLOCK(p); 719 720 /* 721 * If we're a vm86 process, we want to save the segment registers. 722 * We also change eflags to be our emulated eflags, not the actual 723 * eflags. 724 */ 725 if (regs->tf_eflags & PSL_VM) { 726 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 727 struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; 728 729 sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs; 730 sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs; 731 sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es; 732 sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds; 733 734 if (vm86->vm86_has_vme == 0) 735 sf.sf_uc.uc_mcontext.mc_eflags = 736 (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | 737 (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); 738 739 /* 740 * Clear PSL_NT to inhibit T_TSSFLT faults on return from 741 * syscalls made by the signal handler. This just avoids 742 * wasting time for our lazy fixup of such faults. PSL_NT 743 * does nothing in vm86 mode, but vm86 programs can set it 744 * almost legitimately in probes for old cpu types. 745 */ 746 tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); 747 } 748 749 /* 750 * Copy the sigframe out to the user's stack. 
751 */ 752 if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { 753#ifdef DEBUG 754 printf("process %ld has trashed its stack\n", (long)p->p_pid); 755#endif 756 PROC_LOCK(p); 757 sigexit(td, SIGILL); 758 } 759 760 regs->tf_esp = (int)sfp; 761 regs->tf_eip = p->p_sysent->sv_sigcode_base; 762 if (regs->tf_eip == 0) 763 regs->tf_eip = p->p_sysent->sv_psstrings - szsigcode; 764 regs->tf_eflags &= ~(PSL_T | PSL_D); 765 regs->tf_cs = _ucodesel; 766 regs->tf_ds = _udatasel; 767 regs->tf_es = _udatasel; 768 regs->tf_fs = _udatasel; 769 regs->tf_ss = _udatasel; 770 PROC_LOCK(p); 771 mtx_lock(&psp->ps_mtx); 772} 773 774/* 775 * System call to cleanup state after a signal 776 * has been taken. Reset signal mask and 777 * stack state from context left by sendsig (above). 778 * Return to previous pc and psl as specified by 779 * context left by sendsig. Check carefully to 780 * make sure that the user has not modified the 781 * state to gain improper privileges. 782 * 783 * MPSAFE 784 */ 785#ifdef COMPAT_43 786int 787osigreturn(td, uap) 788 struct thread *td; 789 struct osigreturn_args /* { 790 struct osigcontext *sigcntxp; 791 } */ *uap; 792{ 793 struct osigcontext sc; 794 struct trapframe *regs; 795 struct osigcontext *scp; 796 int eflags, error; 797 ksiginfo_t ksi; 798 799 regs = td->td_frame; 800 error = copyin(uap->sigcntxp, &sc, sizeof(sc)); 801 if (error != 0) 802 return (error); 803 scp = ≻ 804 eflags = scp->sc_ps; 805 if (eflags & PSL_VM) { 806 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 807 struct vm86_kernel *vm86; 808 809 /* 810 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't 811 * set up the vm86 area, and we can't enter vm86 mode. 812 */ 813 if (td->td_pcb->pcb_ext == 0) 814 return (EINVAL); 815 vm86 = &td->td_pcb->pcb_ext->ext_vm86; 816 if (vm86->vm86_inited == 0) 817 return (EINVAL); 818 819 /* Go back to user mode if both flags are set. 
*/ 820 if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { 821 ksiginfo_init_trap(&ksi); 822 ksi.ksi_signo = SIGBUS; 823 ksi.ksi_code = BUS_OBJERR; 824 ksi.ksi_addr = (void *)regs->tf_eip; 825 trapsignal(td, &ksi); 826 } 827 828 if (vm86->vm86_has_vme) { 829 eflags = (tf->tf_eflags & ~VME_USERCHANGE) | 830 (eflags & VME_USERCHANGE) | PSL_VM; 831 } else { 832 vm86->vm86_eflags = eflags; /* save VIF, VIP */ 833 eflags = (tf->tf_eflags & ~VM_USERCHANGE) | 834 (eflags & VM_USERCHANGE) | PSL_VM; 835 } 836 tf->tf_vm86_ds = scp->sc_ds; 837 tf->tf_vm86_es = scp->sc_es; 838 tf->tf_vm86_fs = scp->sc_fs; 839 tf->tf_vm86_gs = scp->sc_gs; 840 tf->tf_ds = _udatasel; 841 tf->tf_es = _udatasel; 842 tf->tf_fs = _udatasel; 843 } else { 844 /* 845 * Don't allow users to change privileged or reserved flags. 846 */ 847 if (!EFL_SECURE(eflags, regs->tf_eflags)) { 848 return (EINVAL); 849 } 850 851 /* 852 * Don't allow users to load a valid privileged %cs. Let the 853 * hardware check for invalid selectors, excess privilege in 854 * other selectors, invalid %eip's and invalid %esp's. 855 */ 856 if (!CS_SECURE(scp->sc_cs)) { 857 ksiginfo_init_trap(&ksi); 858 ksi.ksi_signo = SIGBUS; 859 ksi.ksi_code = BUS_OBJERR; 860 ksi.ksi_trapno = T_PROTFLT; 861 ksi.ksi_addr = (void *)regs->tf_eip; 862 trapsignal(td, &ksi); 863 return (EINVAL); 864 } 865 regs->tf_ds = scp->sc_ds; 866 regs->tf_es = scp->sc_es; 867 regs->tf_fs = scp->sc_fs; 868 } 869 870 /* Restore remaining registers. 
*/ 871 regs->tf_eax = scp->sc_eax; 872 regs->tf_ebx = scp->sc_ebx; 873 regs->tf_ecx = scp->sc_ecx; 874 regs->tf_edx = scp->sc_edx; 875 regs->tf_esi = scp->sc_esi; 876 regs->tf_edi = scp->sc_edi; 877 regs->tf_cs = scp->sc_cs; 878 regs->tf_ss = scp->sc_ss; 879 regs->tf_isp = scp->sc_isp; 880 regs->tf_ebp = scp->sc_fp; 881 regs->tf_esp = scp->sc_sp; 882 regs->tf_eip = scp->sc_pc; 883 regs->tf_eflags = eflags; 884 885#if defined(COMPAT_43) 886 if (scp->sc_onstack & 1) 887 td->td_sigstk.ss_flags |= SS_ONSTACK; 888 else 889 td->td_sigstk.ss_flags &= ~SS_ONSTACK; 890#endif 891 kern_sigprocmask(td, SIG_SETMASK, (sigset_t *)&scp->sc_mask, NULL, 892 SIGPROCMASK_OLD); 893 return (EJUSTRETURN); 894} 895#endif /* COMPAT_43 */ 896 897#ifdef COMPAT_FREEBSD4 898/* 899 * MPSAFE 900 */ 901int 902freebsd4_sigreturn(td, uap) 903 struct thread *td; 904 struct freebsd4_sigreturn_args /* { 905 const ucontext4 *sigcntxp; 906 } */ *uap; 907{ 908 struct ucontext4 uc; 909 struct trapframe *regs; 910 struct ucontext4 *ucp; 911 int cs, eflags, error; 912 ksiginfo_t ksi; 913 914 error = copyin(uap->sigcntxp, &uc, sizeof(uc)); 915 if (error != 0) 916 return (error); 917 ucp = &uc; 918 regs = td->td_frame; 919 eflags = ucp->uc_mcontext.mc_eflags; 920 if (eflags & PSL_VM) { 921 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 922 struct vm86_kernel *vm86; 923 924 /* 925 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't 926 * set up the vm86 area, and we can't enter vm86 mode. 927 */ 928 if (td->td_pcb->pcb_ext == 0) 929 return (EINVAL); 930 vm86 = &td->td_pcb->pcb_ext->ext_vm86; 931 if (vm86->vm86_inited == 0) 932 return (EINVAL); 933 934 /* Go back to user mode if both flags are set. 
*/ 935 if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { 936 ksiginfo_init_trap(&ksi); 937 ksi.ksi_signo = SIGBUS; 938 ksi.ksi_code = BUS_OBJERR; 939 ksi.ksi_addr = (void *)regs->tf_eip; 940 trapsignal(td, &ksi); 941 } 942 if (vm86->vm86_has_vme) { 943 eflags = (tf->tf_eflags & ~VME_USERCHANGE) | 944 (eflags & VME_USERCHANGE) | PSL_VM; 945 } else { 946 vm86->vm86_eflags = eflags; /* save VIF, VIP */ 947 eflags = (tf->tf_eflags & ~VM_USERCHANGE) | 948 (eflags & VM_USERCHANGE) | PSL_VM; 949 } 950 bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe)); 951 tf->tf_eflags = eflags; 952 tf->tf_vm86_ds = tf->tf_ds; 953 tf->tf_vm86_es = tf->tf_es; 954 tf->tf_vm86_fs = tf->tf_fs; 955 tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs; 956 tf->tf_ds = _udatasel; 957 tf->tf_es = _udatasel; 958 tf->tf_fs = _udatasel; 959 } else { 960 /* 961 * Don't allow users to change privileged or reserved flags. 962 */ 963 if (!EFL_SECURE(eflags, regs->tf_eflags)) { 964 uprintf("pid %d (%s): freebsd4_sigreturn eflags = 0x%x\n", 965 td->td_proc->p_pid, td->td_name, eflags); 966 return (EINVAL); 967 } 968 969 /* 970 * Don't allow users to load a valid privileged %cs. Let the 971 * hardware check for invalid selectors, excess privilege in 972 * other selectors, invalid %eip's and invalid %esp's. 
973 */ 974 cs = ucp->uc_mcontext.mc_cs; 975 if (!CS_SECURE(cs)) { 976 uprintf("pid %d (%s): freebsd4_sigreturn cs = 0x%x\n", 977 td->td_proc->p_pid, td->td_name, cs); 978 ksiginfo_init_trap(&ksi); 979 ksi.ksi_signo = SIGBUS; 980 ksi.ksi_code = BUS_OBJERR; 981 ksi.ksi_trapno = T_PROTFLT; 982 ksi.ksi_addr = (void *)regs->tf_eip; 983 trapsignal(td, &ksi); 984 return (EINVAL); 985 } 986 987 bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs)); 988 } 989 990#if defined(COMPAT_43) 991 if (ucp->uc_mcontext.mc_onstack & 1) 992 td->td_sigstk.ss_flags |= SS_ONSTACK; 993 else 994 td->td_sigstk.ss_flags &= ~SS_ONSTACK; 995#endif 996 kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0); 997 return (EJUSTRETURN); 998} 999#endif /* COMPAT_FREEBSD4 */ 1000 1001/* 1002 * MPSAFE 1003 */ 1004int 1005sys_sigreturn(td, uap) 1006 struct thread *td; 1007 struct sigreturn_args /* { 1008 const struct __ucontext *sigcntxp; 1009 } */ *uap; 1010{ 1011 ucontext_t uc; 1012 struct trapframe *regs; 1013 ucontext_t *ucp; 1014 int cs, eflags, error, ret; 1015 ksiginfo_t ksi; 1016 1017 error = copyin(uap->sigcntxp, &uc, sizeof(uc)); 1018 if (error != 0) 1019 return (error); 1020 ucp = &uc; 1021 regs = td->td_frame; 1022 eflags = ucp->uc_mcontext.mc_eflags; 1023 if (eflags & PSL_VM) { 1024 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 1025 struct vm86_kernel *vm86; 1026 1027 /* 1028 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't 1029 * set up the vm86 area, and we can't enter vm86 mode. 1030 */ 1031 if (td->td_pcb->pcb_ext == 0) 1032 return (EINVAL); 1033 vm86 = &td->td_pcb->pcb_ext->ext_vm86; 1034 if (vm86->vm86_inited == 0) 1035 return (EINVAL); 1036 1037 /* Go back to user mode if both flags are set. 
*/ 1038 if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { 1039 ksiginfo_init_trap(&ksi); 1040 ksi.ksi_signo = SIGBUS; 1041 ksi.ksi_code = BUS_OBJERR; 1042 ksi.ksi_addr = (void *)regs->tf_eip; 1043 trapsignal(td, &ksi); 1044 } 1045 1046 if (vm86->vm86_has_vme) { 1047 eflags = (tf->tf_eflags & ~VME_USERCHANGE) | 1048 (eflags & VME_USERCHANGE) | PSL_VM; 1049 } else { 1050 vm86->vm86_eflags = eflags; /* save VIF, VIP */ 1051 eflags = (tf->tf_eflags & ~VM_USERCHANGE) | 1052 (eflags & VM_USERCHANGE) | PSL_VM; 1053 } 1054 bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe)); 1055 tf->tf_eflags = eflags; 1056 tf->tf_vm86_ds = tf->tf_ds; 1057 tf->tf_vm86_es = tf->tf_es; 1058 tf->tf_vm86_fs = tf->tf_fs; 1059 tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs; 1060 tf->tf_ds = _udatasel; 1061 tf->tf_es = _udatasel; 1062 tf->tf_fs = _udatasel; 1063 } else { 1064 /* 1065 * Don't allow users to change privileged or reserved flags. 1066 */ 1067 if (!EFL_SECURE(eflags, regs->tf_eflags)) { 1068 uprintf("pid %d (%s): sigreturn eflags = 0x%x\n", 1069 td->td_proc->p_pid, td->td_name, eflags); 1070 return (EINVAL); 1071 } 1072 1073 /* 1074 * Don't allow users to load a valid privileged %cs. Let the 1075 * hardware check for invalid selectors, excess privilege in 1076 * other selectors, invalid %eip's and invalid %esp's. 
1077 */ 1078 cs = ucp->uc_mcontext.mc_cs; 1079 if (!CS_SECURE(cs)) { 1080 uprintf("pid %d (%s): sigreturn cs = 0x%x\n", 1081 td->td_proc->p_pid, td->td_name, cs); 1082 ksiginfo_init_trap(&ksi); 1083 ksi.ksi_signo = SIGBUS; 1084 ksi.ksi_code = BUS_OBJERR; 1085 ksi.ksi_trapno = T_PROTFLT; 1086 ksi.ksi_addr = (void *)regs->tf_eip; 1087 trapsignal(td, &ksi); 1088 return (EINVAL); 1089 } 1090 1091 ret = set_fpcontext(td, &ucp->uc_mcontext); 1092 if (ret != 0) 1093 return (ret); 1094 bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs)); 1095 } 1096 1097#if defined(COMPAT_43) 1098 if (ucp->uc_mcontext.mc_onstack & 1) 1099 td->td_sigstk.ss_flags |= SS_ONSTACK; 1100 else 1101 td->td_sigstk.ss_flags &= ~SS_ONSTACK; 1102#endif 1103 1104 kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0); 1105 return (EJUSTRETURN); 1106} 1107 1108/* 1109 * Machine dependent boot() routine 1110 * 1111 * I haven't seen anything to put here yet 1112 * Possibly some stuff might be grafted back here from boot() 1113 */ 1114void 1115cpu_boot(int howto) 1116{ 1117} 1118 1119/* 1120 * Flush the D-cache for non-DMA I/O so that the I-cache can 1121 * be made coherent later. 1122 */ 1123void 1124cpu_flush_dcache(void *ptr, size_t len) 1125{ 1126 /* Not applicable */ 1127} 1128 1129/* Get current clock frequency for the given cpu id. */ 1130int 1131cpu_est_clockrate(int cpu_id, uint64_t *rate) 1132{ 1133 uint64_t tsc1, tsc2; 1134 uint64_t acnt, mcnt, perf; 1135 register_t reg; 1136 1137 if (pcpu_find(cpu_id) == NULL || rate == NULL) 1138 return (EINVAL); 1139 if ((cpu_feature & CPUID_TSC) == 0) 1140 return (EOPNOTSUPP); 1141 1142 /* 1143 * If TSC is P-state invariant and APERF/MPERF MSRs do not exist, 1144 * DELAY(9) based logic fails. 1145 */ 1146 if (tsc_is_invariant && !tsc_perf_stat) 1147 return (EOPNOTSUPP); 1148 1149#ifdef SMP 1150 if (smp_cpus > 1) { 1151 /* Schedule ourselves on the indicated cpu. 
*/ 1152 thread_lock(curthread); 1153 sched_bind(curthread, cpu_id); 1154 thread_unlock(curthread); 1155 } 1156#endif 1157 1158 /* Calibrate by measuring a short delay. */ 1159 reg = intr_disable(); 1160 if (tsc_is_invariant) { 1161 wrmsr(MSR_MPERF, 0); 1162 wrmsr(MSR_APERF, 0); 1163 tsc1 = rdtsc(); 1164 DELAY(1000); 1165 mcnt = rdmsr(MSR_MPERF); 1166 acnt = rdmsr(MSR_APERF); 1167 tsc2 = rdtsc(); 1168 intr_restore(reg); 1169 perf = 1000 * acnt / mcnt; 1170 *rate = (tsc2 - tsc1) * perf; 1171 } else { 1172 tsc1 = rdtsc(); 1173 DELAY(1000); 1174 tsc2 = rdtsc(); 1175 intr_restore(reg); 1176 *rate = (tsc2 - tsc1) * 1000; 1177 } 1178 1179#ifdef SMP 1180 if (smp_cpus > 1) { 1181 thread_lock(curthread); 1182 sched_unbind(curthread); 1183 thread_unlock(curthread); 1184 } 1185#endif 1186 1187 return (0); 1188} 1189 1190#ifdef XEN 1191 1192static void 1193idle_block(void) 1194{ 1195 1196 HYPERVISOR_sched_op(SCHEDOP_block, 0); 1197} 1198 1199void 1200cpu_halt(void) 1201{ 1202 HYPERVISOR_shutdown(SHUTDOWN_poweroff); 1203} 1204 1205int scheduler_running; 1206 1207static void 1208cpu_idle_hlt(sbintime_t sbt) 1209{ 1210 1211 scheduler_running = 1; 1212 enable_intr(); 1213 idle_block(); 1214} 1215 1216#else 1217/* 1218 * Shutdown the CPU as much as possible 1219 */ 1220void 1221cpu_halt(void) 1222{ 1223 for (;;) 1224 halt(); 1225} 1226 1227#endif 1228 1229void (*cpu_idle_hook)(sbintime_t) = NULL; /* ACPI idle hook. */ 1230static int cpu_ident_amdc1e = 0; /* AMD C1E supported. */ 1231static int idle_mwait = 1; /* Use MONITOR/MWAIT for short idle. 
*/ 1232TUNABLE_INT("machdep.idle_mwait", &idle_mwait); 1233SYSCTL_INT(_machdep, OID_AUTO, idle_mwait, CTLFLAG_RW, &idle_mwait, 1234 0, "Use MONITOR/MWAIT for short idle"); 1235 1236#define STATE_RUNNING 0x0 1237#define STATE_MWAIT 0x1 1238#define STATE_SLEEPING 0x2 1239 1240static void 1241cpu_idle_acpi(sbintime_t sbt) 1242{ 1243 int *state; 1244 1245 state = (int *)PCPU_PTR(monitorbuf); 1246 *state = STATE_SLEEPING; 1247 1248 /* See comments in cpu_idle_hlt(). */ 1249 disable_intr(); 1250 if (sched_runnable()) 1251 enable_intr(); 1252 else if (cpu_idle_hook) 1253 cpu_idle_hook(sbt); 1254 else 1255 __asm __volatile("sti; hlt"); 1256 *state = STATE_RUNNING; 1257} 1258 1259#ifndef XEN 1260static void 1261cpu_idle_hlt(sbintime_t sbt) 1262{ 1263 int *state; 1264 1265 state = (int *)PCPU_PTR(monitorbuf); 1266 *state = STATE_SLEEPING; 1267 1268 /* 1269 * Since we may be in a critical section from cpu_idle(), if 1270 * an interrupt fires during that critical section we may have 1271 * a pending preemption. If the CPU halts, then that thread 1272 * may not execute until a later interrupt awakens the CPU. 1273 * To handle this race, check for a runnable thread after 1274 * disabling interrupts and immediately return if one is 1275 * found. Also, we must absolutely guarentee that hlt is 1276 * the next instruction after sti. This ensures that any 1277 * interrupt that fires after the call to disable_intr() will 1278 * immediately awaken the CPU from hlt. Finally, please note 1279 * that on x86 this works fine because of interrupts enabled only 1280 * after the instruction following sti takes place, while IF is set 1281 * to 1 immediately, allowing hlt instruction to acknowledge the 1282 * interrupt. 1283 */ 1284 disable_intr(); 1285 if (sched_runnable()) 1286 enable_intr(); 1287 else 1288 __asm __volatile("sti; hlt"); 1289 *state = STATE_RUNNING; 1290} 1291#endif 1292 1293/* 1294 * MWAIT cpu power states. Lower 4 bits are sub-states. 
1295 */ 1296#define MWAIT_C0 0xf0 1297#define MWAIT_C1 0x00 1298#define MWAIT_C2 0x10 1299#define MWAIT_C3 0x20 1300#define MWAIT_C4 0x30 1301 1302static void 1303cpu_idle_mwait(sbintime_t sbt) 1304{ 1305 int *state; 1306 1307 state = (int *)PCPU_PTR(monitorbuf); 1308 *state = STATE_MWAIT; 1309 1310 /* See comments in cpu_idle_hlt(). */ 1311 disable_intr(); 1312 if (sched_runnable()) { 1313 enable_intr(); 1314 *state = STATE_RUNNING; 1315 return; 1316 } 1317 cpu_monitor(state, 0, 0); 1318 if (*state == STATE_MWAIT) 1319 __asm __volatile("sti; mwait" : : "a" (MWAIT_C1), "c" (0)); 1320 else 1321 enable_intr(); 1322 *state = STATE_RUNNING; 1323} 1324 1325static void 1326cpu_idle_spin(sbintime_t sbt) 1327{ 1328 int *state; 1329 int i; 1330 1331 state = (int *)PCPU_PTR(monitorbuf); 1332 *state = STATE_RUNNING; 1333 1334 /* 1335 * The sched_runnable() call is racy but as long as there is 1336 * a loop missing it one time will have just a little impact if any 1337 * (and it is much better than missing the check at all). 1338 */ 1339 for (i = 0; i < 1000; i++) { 1340 if (sched_runnable()) 1341 return; 1342 cpu_spinwait(); 1343 } 1344} 1345 1346/* 1347 * C1E renders the local APIC timer dead, so we disable it by 1348 * reading the Interrupt Pending Message register and clearing 1349 * both C1eOnCmpHalt (bit 28) and SmiOnCmpHalt (bit 27). 1350 * 1351 * Reference: 1352 * "BIOS and Kernel Developer's Guide for AMD NPT Family 0Fh Processors" 1353 * #32559 revision 3.00+ 1354 */ 1355#define MSR_AMDK8_IPM 0xc0010055 1356#define AMDK8_SMIONCMPHALT (1ULL << 27) 1357#define AMDK8_C1EONCMPHALT (1ULL << 28) 1358#define AMDK8_CMPHALT (AMDK8_SMIONCMPHALT | AMDK8_C1EONCMPHALT) 1359 1360static void 1361cpu_probe_amdc1e(void) 1362{ 1363 1364 /* 1365 * Detect the presence of C1E capability mostly on latest 1366 * dual-cores (or future) k8 family. 
1367 */ 1368 if (cpu_vendor_id == CPU_VENDOR_AMD && 1369 (cpu_id & 0x00000f00) == 0x00000f00 && 1370 (cpu_id & 0x0fff0000) >= 0x00040000) { 1371 cpu_ident_amdc1e = 1; 1372 } 1373} 1374 1375#ifdef XEN 1376void (*cpu_idle_fn)(sbintime_t) = cpu_idle_hlt; 1377#else 1378void (*cpu_idle_fn)(sbintime_t) = cpu_idle_acpi; 1379#endif 1380 1381void 1382cpu_idle(int busy) 1383{ 1384#ifndef XEN 1385 uint64_t msr; 1386#endif 1387 sbintime_t sbt = -1; 1388 1389 CTR2(KTR_SPARE2, "cpu_idle(%d) at %d", 1390 busy, curcpu); 1391#if defined(MP_WATCHDOG) && !defined(XEN) 1392 ap_watchdog(PCPU_GET(cpuid)); 1393#endif 1394#ifndef XEN 1395 /* If we are busy - try to use fast methods. */ 1396 if (busy) { 1397 if ((cpu_feature2 & CPUID2_MON) && idle_mwait) { 1398 cpu_idle_mwait(busy); 1399 goto out; 1400 } 1401 } 1402#endif 1403 1404 /* If we have time - switch timers into idle mode. */ 1405 if (!busy) { 1406 critical_enter(); 1407 sbt = cpu_idleclock(); 1408 } 1409 1410#ifndef XEN 1411 /* Apply AMD APIC timer C1E workaround. */ 1412 if (cpu_ident_amdc1e && cpu_disable_deep_sleep) { 1413 msr = rdmsr(MSR_AMDK8_IPM); 1414 if (msr & AMDK8_CMPHALT) 1415 wrmsr(MSR_AMDK8_IPM, msr & ~AMDK8_CMPHALT); 1416 } 1417#endif 1418 1419 /* Call main idle method. */ 1420 cpu_idle_fn(sbt); 1421 1422 /* Switch timers mack into active mode. */ 1423 if (!busy) { 1424 cpu_activeclock(); 1425 critical_exit(); 1426 } 1427#ifndef XEN 1428out: 1429#endif 1430 CTR2(KTR_SPARE2, "cpu_idle(%d) at %d done", 1431 busy, curcpu); 1432} 1433 1434int 1435cpu_idle_wakeup(int cpu) 1436{ 1437 struct pcpu *pcpu; 1438 int *state; 1439 1440 pcpu = pcpu_find(cpu); 1441 state = (int *)pcpu->pc_monitorbuf; 1442 /* 1443 * This doesn't need to be atomic since missing the race will 1444 * simply result in unnecessary IPIs. 1445 */ 1446 if (*state == STATE_SLEEPING) 1447 return (0); 1448 if (*state == STATE_MWAIT) 1449 *state = STATE_RUNNING; 1450 return (1); 1451} 1452 1453/* 1454 * Ordered by speed/power consumption. 
1455 */ 1456struct { 1457 void *id_fn; 1458 char *id_name; 1459} idle_tbl[] = { 1460 { cpu_idle_spin, "spin" }, 1461 { cpu_idle_mwait, "mwait" }, 1462 { cpu_idle_hlt, "hlt" }, 1463 { cpu_idle_acpi, "acpi" }, 1464 { NULL, NULL } 1465}; 1466 1467static int 1468idle_sysctl_available(SYSCTL_HANDLER_ARGS) 1469{ 1470 char *avail, *p; 1471 int error; 1472 int i; 1473 1474 avail = malloc(256, M_TEMP, M_WAITOK); 1475 p = avail; 1476 for (i = 0; idle_tbl[i].id_name != NULL; i++) { 1477 if (strstr(idle_tbl[i].id_name, "mwait") && 1478 (cpu_feature2 & CPUID2_MON) == 0) 1479 continue; 1480 if (strcmp(idle_tbl[i].id_name, "acpi") == 0 && 1481 cpu_idle_hook == NULL) 1482 continue; 1483 p += sprintf(p, "%s%s", p != avail ? ", " : "", 1484 idle_tbl[i].id_name); 1485 } 1486 error = sysctl_handle_string(oidp, avail, 0, req); 1487 free(avail, M_TEMP); 1488 return (error); 1489} 1490 1491SYSCTL_PROC(_machdep, OID_AUTO, idle_available, CTLTYPE_STRING | CTLFLAG_RD, 1492 0, 0, idle_sysctl_available, "A", "list of available idle functions"); 1493 1494static int 1495idle_sysctl(SYSCTL_HANDLER_ARGS) 1496{ 1497 char buf[16]; 1498 int error; 1499 char *p; 1500 int i; 1501 1502 p = "unknown"; 1503 for (i = 0; idle_tbl[i].id_name != NULL; i++) { 1504 if (idle_tbl[i].id_fn == cpu_idle_fn) { 1505 p = idle_tbl[i].id_name; 1506 break; 1507 } 1508 } 1509 strncpy(buf, p, sizeof(buf)); 1510 error = sysctl_handle_string(oidp, buf, sizeof(buf), req); 1511 if (error != 0 || req->newptr == NULL) 1512 return (error); 1513 for (i = 0; idle_tbl[i].id_name != NULL; i++) { 1514 if (strstr(idle_tbl[i].id_name, "mwait") && 1515 (cpu_feature2 & CPUID2_MON) == 0) 1516 continue; 1517 if (strcmp(idle_tbl[i].id_name, "acpi") == 0 && 1518 cpu_idle_hook == NULL) 1519 continue; 1520 if (strcmp(idle_tbl[i].id_name, buf)) 1521 continue; 1522 cpu_idle_fn = idle_tbl[i].id_fn; 1523 return (0); 1524 } 1525 return (EINVAL); 1526} 1527 1528SYSCTL_PROC(_machdep, OID_AUTO, idle, CTLTYPE_STRING | CTLFLAG_RW, 0, 0, 1529 idle_sysctl, 
"A", "currently selected idle function"); 1530 1531/* 1532 * Reset registers to default values on exec. 1533 */ 1534void 1535exec_setregs(struct thread *td, struct image_params *imgp, u_long stack) 1536{ 1537 struct trapframe *regs = td->td_frame; 1538 struct pcb *pcb = td->td_pcb; 1539 1540 /* Reset pc->pcb_gs and %gs before possibly invalidating it. */ 1541 pcb->pcb_gs = _udatasel; 1542 load_gs(_udatasel); 1543 1544 mtx_lock_spin(&dt_lock); 1545 if (td->td_proc->p_md.md_ldt) 1546 user_ldt_free(td); 1547 else 1548 mtx_unlock_spin(&dt_lock); 1549 1550 bzero((char *)regs, sizeof(struct trapframe)); 1551 regs->tf_eip = imgp->entry_addr; 1552 regs->tf_esp = stack; 1553 regs->tf_eflags = PSL_USER | (regs->tf_eflags & PSL_T); 1554 regs->tf_ss = _udatasel; 1555 regs->tf_ds = _udatasel; 1556 regs->tf_es = _udatasel; 1557 regs->tf_fs = _udatasel; 1558 regs->tf_cs = _ucodesel; 1559 1560 /* PS_STRINGS value for BSD/OS binaries. It is 0 for non-BSD/OS. */ 1561 regs->tf_ebx = imgp->ps_strings; 1562 1563 /* 1564 * Reset the hardware debug registers if they were in use. 1565 * They won't have any meaning for the newly exec'd process. 1566 */ 1567 if (pcb->pcb_flags & PCB_DBREGS) { 1568 pcb->pcb_dr0 = 0; 1569 pcb->pcb_dr1 = 0; 1570 pcb->pcb_dr2 = 0; 1571 pcb->pcb_dr3 = 0; 1572 pcb->pcb_dr6 = 0; 1573 pcb->pcb_dr7 = 0; 1574 if (pcb == curpcb) { 1575 /* 1576 * Clear the debug registers on the running 1577 * CPU, otherwise they will end up affecting 1578 * the next process we switch to. 1579 */ 1580 reset_dbregs(); 1581 } 1582 pcb->pcb_flags &= ~PCB_DBREGS; 1583 } 1584 1585 /* 1586 * Initialize the math emulator (if any) for the current process. 1587 * Actually, just clear the bit that says that the emulator has 1588 * been initialized. Initialization is delayed until the process 1589 * traps to the emulator (if it is done at all) mainly because 1590 * emulators don't provide an entry point for initialization. 
1591 */ 1592 td->td_pcb->pcb_flags &= ~FP_SOFTFP; 1593 pcb->pcb_initial_npxcw = __INITIAL_NPXCW__; 1594 1595 /* 1596 * Drop the FP state if we hold it, so that the process gets a 1597 * clean FP state if it uses the FPU again. 1598 */ 1599 fpstate_drop(td); 1600 1601 /* 1602 * XXX - Linux emulator 1603 * Make sure sure edx is 0x0 on entry. Linux binaries depend 1604 * on it. 1605 */ 1606 td->td_retval[1] = 0; 1607} 1608 1609void 1610cpu_setregs(void) 1611{ 1612 unsigned int cr0; 1613 1614 cr0 = rcr0(); 1615 1616 /* 1617 * CR0_MP, CR0_NE and CR0_TS are set for NPX (FPU) support: 1618 * 1619 * Prepare to trap all ESC (i.e., NPX) instructions and all WAIT 1620 * instructions. We must set the CR0_MP bit and use the CR0_TS 1621 * bit to control the trap, because setting the CR0_EM bit does 1622 * not cause WAIT instructions to trap. It's important to trap 1623 * WAIT instructions - otherwise the "wait" variants of no-wait 1624 * control instructions would degenerate to the "no-wait" variants 1625 * after FP context switches but work correctly otherwise. It's 1626 * particularly important to trap WAITs when there is no NPX - 1627 * otherwise the "wait" variants would always degenerate. 1628 * 1629 * Try setting CR0_NE to get correct error reporting on 486DX's. 1630 * Setting it should fail or do nothing on lesser processors. 
1631 */ 1632 cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM; 1633 load_cr0(cr0); 1634 load_gs(_udatasel); 1635} 1636 1637u_long bootdev; /* not a struct cdev *- encoding is different */ 1638SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev, 1639 CTLFLAG_RD, &bootdev, 0, "Maybe the Boot device (not in struct cdev *format)"); 1640 1641static char bootmethod[16] = "BIOS"; 1642SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0, 1643 "System firmware boot method"); 1644 1645/* 1646 * Initialize 386 and configure to run kernel 1647 */ 1648 1649/* 1650 * Initialize segments & interrupt table 1651 */ 1652 1653int _default_ldt; 1654 1655#ifdef XEN 1656union descriptor *gdt; 1657union descriptor *ldt; 1658#else 1659union descriptor gdt[NGDT * MAXCPU]; /* global descriptor table */ 1660union descriptor ldt[NLDT]; /* local descriptor table */ 1661#endif 1662static struct gate_descriptor idt0[NIDT]; 1663struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ 1664struct region_descriptor r_gdt, r_idt; /* table descriptors */ 1665struct mtx dt_lock; /* lock for GDT and LDT */ 1666 1667#if defined(I586_CPU) && !defined(NO_F00F_HACK) 1668extern int has_f00f_bug; 1669#endif 1670 1671static struct i386tss dblfault_tss; 1672static char dblfault_stack[PAGE_SIZE]; 1673 1674extern vm_offset_t proc0kstack; 1675 1676 1677/* 1678 * software prototypes -- in more palatable form. 
1679 * 1680 * GCODE_SEL through GUDATA_SEL must be in this order for syscall/sysret 1681 * GUFS_SEL and GUGS_SEL must be in this order (swtch.s knows it) 1682 */ 1683struct soft_segment_descriptor gdt_segs[] = { 1684/* GNULL_SEL 0 Null Descriptor */ 1685{ .ssd_base = 0x0, 1686 .ssd_limit = 0x0, 1687 .ssd_type = 0, 1688 .ssd_dpl = SEL_KPL, 1689 .ssd_p = 0, 1690 .ssd_xx = 0, .ssd_xx1 = 0, 1691 .ssd_def32 = 0, 1692 .ssd_gran = 0 }, 1693/* GPRIV_SEL 1 SMP Per-Processor Private Data Descriptor */ 1694{ .ssd_base = 0x0, 1695 .ssd_limit = 0xfffff, 1696 .ssd_type = SDT_MEMRWA, 1697 .ssd_dpl = SEL_KPL, 1698 .ssd_p = 1, 1699 .ssd_xx = 0, .ssd_xx1 = 0, 1700 .ssd_def32 = 1, 1701 .ssd_gran = 1 }, 1702/* GUFS_SEL 2 %fs Descriptor for user */ 1703{ .ssd_base = 0x0, 1704 .ssd_limit = 0xfffff, 1705 .ssd_type = SDT_MEMRWA, 1706 .ssd_dpl = SEL_UPL, 1707 .ssd_p = 1, 1708 .ssd_xx = 0, .ssd_xx1 = 0, 1709 .ssd_def32 = 1, 1710 .ssd_gran = 1 }, 1711/* GUGS_SEL 3 %gs Descriptor for user */ 1712{ .ssd_base = 0x0, 1713 .ssd_limit = 0xfffff, 1714 .ssd_type = SDT_MEMRWA, 1715 .ssd_dpl = SEL_UPL, 1716 .ssd_p = 1, 1717 .ssd_xx = 0, .ssd_xx1 = 0, 1718 .ssd_def32 = 1, 1719 .ssd_gran = 1 }, 1720/* GCODE_SEL 4 Code Descriptor for kernel */ 1721{ .ssd_base = 0x0, 1722 .ssd_limit = 0xfffff, 1723 .ssd_type = SDT_MEMERA, 1724 .ssd_dpl = SEL_KPL, 1725 .ssd_p = 1, 1726 .ssd_xx = 0, .ssd_xx1 = 0, 1727 .ssd_def32 = 1, 1728 .ssd_gran = 1 }, 1729/* GDATA_SEL 5 Data Descriptor for kernel */ 1730{ .ssd_base = 0x0, 1731 .ssd_limit = 0xfffff, 1732 .ssd_type = SDT_MEMRWA, 1733 .ssd_dpl = SEL_KPL, 1734 .ssd_p = 1, 1735 .ssd_xx = 0, .ssd_xx1 = 0, 1736 .ssd_def32 = 1, 1737 .ssd_gran = 1 }, 1738/* GUCODE_SEL 6 Code Descriptor for user */ 1739{ .ssd_base = 0x0, 1740 .ssd_limit = 0xfffff, 1741 .ssd_type = SDT_MEMERA, 1742 .ssd_dpl = SEL_UPL, 1743 .ssd_p = 1, 1744 .ssd_xx = 0, .ssd_xx1 = 0, 1745 .ssd_def32 = 1, 1746 .ssd_gran = 1 }, 1747/* GUDATA_SEL 7 Data Descriptor for user */ 1748{ .ssd_base = 0x0, 1749 .ssd_limit = 
0xfffff, 1750 .ssd_type = SDT_MEMRWA, 1751 .ssd_dpl = SEL_UPL, 1752 .ssd_p = 1, 1753 .ssd_xx = 0, .ssd_xx1 = 0, 1754 .ssd_def32 = 1, 1755 .ssd_gran = 1 }, 1756/* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */ 1757{ .ssd_base = 0x400, 1758 .ssd_limit = 0xfffff, 1759 .ssd_type = SDT_MEMRWA, 1760 .ssd_dpl = SEL_KPL, 1761 .ssd_p = 1, 1762 .ssd_xx = 0, .ssd_xx1 = 0, 1763 .ssd_def32 = 1, 1764 .ssd_gran = 1 }, 1765#ifndef XEN 1766/* GPROC0_SEL 9 Proc 0 Tss Descriptor */ 1767{ 1768 .ssd_base = 0x0, 1769 .ssd_limit = sizeof(struct i386tss)-1, 1770 .ssd_type = SDT_SYS386TSS, 1771 .ssd_dpl = 0, 1772 .ssd_p = 1, 1773 .ssd_xx = 0, .ssd_xx1 = 0, 1774 .ssd_def32 = 0, 1775 .ssd_gran = 0 }, 1776/* GLDT_SEL 10 LDT Descriptor */ 1777{ .ssd_base = (int) ldt, 1778 .ssd_limit = sizeof(ldt)-1, 1779 .ssd_type = SDT_SYSLDT, 1780 .ssd_dpl = SEL_UPL, 1781 .ssd_p = 1, 1782 .ssd_xx = 0, .ssd_xx1 = 0, 1783 .ssd_def32 = 0, 1784 .ssd_gran = 0 }, 1785/* GUSERLDT_SEL 11 User LDT Descriptor per process */ 1786{ .ssd_base = (int) ldt, 1787 .ssd_limit = (512 * sizeof(union descriptor)-1), 1788 .ssd_type = SDT_SYSLDT, 1789 .ssd_dpl = 0, 1790 .ssd_p = 1, 1791 .ssd_xx = 0, .ssd_xx1 = 0, 1792 .ssd_def32 = 0, 1793 .ssd_gran = 0 }, 1794/* GPANIC_SEL 12 Panic Tss Descriptor */ 1795{ .ssd_base = (int) &dblfault_tss, 1796 .ssd_limit = sizeof(struct i386tss)-1, 1797 .ssd_type = SDT_SYS386TSS, 1798 .ssd_dpl = 0, 1799 .ssd_p = 1, 1800 .ssd_xx = 0, .ssd_xx1 = 0, 1801 .ssd_def32 = 0, 1802 .ssd_gran = 0 }, 1803/* GBIOSCODE32_SEL 13 BIOS 32-bit interface (32bit Code) */ 1804{ .ssd_base = 0, 1805 .ssd_limit = 0xfffff, 1806 .ssd_type = SDT_MEMERA, 1807 .ssd_dpl = 0, 1808 .ssd_p = 1, 1809 .ssd_xx = 0, .ssd_xx1 = 0, 1810 .ssd_def32 = 0, 1811 .ssd_gran = 1 }, 1812/* GBIOSCODE16_SEL 14 BIOS 32-bit interface (16bit Code) */ 1813{ .ssd_base = 0, 1814 .ssd_limit = 0xfffff, 1815 .ssd_type = SDT_MEMERA, 1816 .ssd_dpl = 0, 1817 .ssd_p = 1, 1818 .ssd_xx = 0, .ssd_xx1 = 0, 1819 .ssd_def32 = 0, 1820 
.ssd_gran = 1 }, 1821/* GBIOSDATA_SEL 15 BIOS 32-bit interface (Data) */ 1822{ .ssd_base = 0, 1823 .ssd_limit = 0xfffff, 1824 .ssd_type = SDT_MEMRWA, 1825 .ssd_dpl = 0, 1826 .ssd_p = 1, 1827 .ssd_xx = 0, .ssd_xx1 = 0, 1828 .ssd_def32 = 1, 1829 .ssd_gran = 1 }, 1830/* GBIOSUTIL_SEL 16 BIOS 16-bit interface (Utility) */ 1831{ .ssd_base = 0, 1832 .ssd_limit = 0xfffff, 1833 .ssd_type = SDT_MEMRWA, 1834 .ssd_dpl = 0, 1835 .ssd_p = 1, 1836 .ssd_xx = 0, .ssd_xx1 = 0, 1837 .ssd_def32 = 0, 1838 .ssd_gran = 1 }, 1839/* GBIOSARGS_SEL 17 BIOS 16-bit interface (Arguments) */ 1840{ .ssd_base = 0, 1841 .ssd_limit = 0xfffff, 1842 .ssd_type = SDT_MEMRWA, 1843 .ssd_dpl = 0, 1844 .ssd_p = 1, 1845 .ssd_xx = 0, .ssd_xx1 = 0, 1846 .ssd_def32 = 0, 1847 .ssd_gran = 1 }, 1848/* GNDIS_SEL 18 NDIS Descriptor */ 1849{ .ssd_base = 0x0, 1850 .ssd_limit = 0x0, 1851 .ssd_type = 0, 1852 .ssd_dpl = 0, 1853 .ssd_p = 0, 1854 .ssd_xx = 0, .ssd_xx1 = 0, 1855 .ssd_def32 = 0, 1856 .ssd_gran = 0 }, 1857#endif /* !XEN */ 1858}; 1859 1860static struct soft_segment_descriptor ldt_segs[] = { 1861 /* Null Descriptor - overwritten by call gate */ 1862{ .ssd_base = 0x0, 1863 .ssd_limit = 0x0, 1864 .ssd_type = 0, 1865 .ssd_dpl = 0, 1866 .ssd_p = 0, 1867 .ssd_xx = 0, .ssd_xx1 = 0, 1868 .ssd_def32 = 0, 1869 .ssd_gran = 0 }, 1870 /* Null Descriptor - overwritten by call gate */ 1871{ .ssd_base = 0x0, 1872 .ssd_limit = 0x0, 1873 .ssd_type = 0, 1874 .ssd_dpl = 0, 1875 .ssd_p = 0, 1876 .ssd_xx = 0, .ssd_xx1 = 0, 1877 .ssd_def32 = 0, 1878 .ssd_gran = 0 }, 1879 /* Null Descriptor - overwritten by call gate */ 1880{ .ssd_base = 0x0, 1881 .ssd_limit = 0x0, 1882 .ssd_type = 0, 1883 .ssd_dpl = 0, 1884 .ssd_p = 0, 1885 .ssd_xx = 0, .ssd_xx1 = 0, 1886 .ssd_def32 = 0, 1887 .ssd_gran = 0 }, 1888 /* Code Descriptor for user */ 1889{ .ssd_base = 0x0, 1890 .ssd_limit = 0xfffff, 1891 .ssd_type = SDT_MEMERA, 1892 .ssd_dpl = SEL_UPL, 1893 .ssd_p = 1, 1894 .ssd_xx = 0, .ssd_xx1 = 0, 1895 .ssd_def32 = 1, 1896 .ssd_gran = 1 }, 1897 /* 
Null Descriptor - overwritten by call gate */ 1898{ .ssd_base = 0x0, 1899 .ssd_limit = 0x0, 1900 .ssd_type = 0, 1901 .ssd_dpl = 0, 1902 .ssd_p = 0, 1903 .ssd_xx = 0, .ssd_xx1 = 0, 1904 .ssd_def32 = 0, 1905 .ssd_gran = 0 }, 1906 /* Data Descriptor for user */ 1907{ .ssd_base = 0x0, 1908 .ssd_limit = 0xfffff, 1909 .ssd_type = SDT_MEMRWA, 1910 .ssd_dpl = SEL_UPL, 1911 .ssd_p = 1, 1912 .ssd_xx = 0, .ssd_xx1 = 0, 1913 .ssd_def32 = 1, 1914 .ssd_gran = 1 }, 1915}; 1916 1917void 1918setidt(idx, func, typ, dpl, selec) 1919 int idx; 1920 inthand_t *func; 1921 int typ; 1922 int dpl; 1923 int selec; 1924{ 1925 struct gate_descriptor *ip; 1926 1927 ip = idt + idx; 1928 ip->gd_looffset = (int)func; 1929 ip->gd_selector = selec; 1930 ip->gd_stkcpy = 0; 1931 ip->gd_xx = 0; 1932 ip->gd_type = typ; 1933 ip->gd_dpl = dpl; 1934 ip->gd_p = 1; 1935 ip->gd_hioffset = ((int)func)>>16 ; 1936} 1937 1938extern inthand_t 1939 IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), 1940 IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), 1941 IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), 1942 IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), 1943 IDTVEC(xmm), 1944#ifdef KDTRACE_HOOKS 1945 IDTVEC(dtrace_ret), 1946#endif 1947#ifdef XENHVM 1948 IDTVEC(xen_intr_upcall), 1949#endif 1950 IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall); 1951 1952#ifdef DDB 1953/* 1954 * Display the index and function name of any IDT entries that don't use 1955 * the default 'rsvd' entry point. 1956 */ 1957DB_SHOW_COMMAND(idt, db_show_idt) 1958{ 1959 struct gate_descriptor *ip; 1960 int idx; 1961 uintptr_t func; 1962 1963 ip = idt; 1964 for (idx = 0; idx < NIDT && !db_pager_quit; idx++) { 1965 func = (ip->gd_hioffset << 16 | ip->gd_looffset); 1966 if (func != (uintptr_t)&IDTVEC(rsvd)) { 1967 db_printf("%3d\t", idx); 1968 db_printsym(func, DB_STGY_PROC); 1969 db_printf("\n"); 1970 } 1971 ip++; 1972 } 1973} 1974 1975/* Show privileged registers. 
*/ 1976DB_SHOW_COMMAND(sysregs, db_show_sysregs) 1977{ 1978 uint64_t idtr, gdtr; 1979 1980 idtr = ridt(); 1981 db_printf("idtr\t0x%08x/%04x\n", 1982 (u_int)(idtr >> 16), (u_int)idtr & 0xffff); 1983 gdtr = rgdt(); 1984 db_printf("gdtr\t0x%08x/%04x\n", 1985 (u_int)(gdtr >> 16), (u_int)gdtr & 0xffff); 1986 db_printf("ldtr\t0x%04x\n", rldt()); 1987 db_printf("tr\t0x%04x\n", rtr()); 1988 db_printf("cr0\t0x%08x\n", rcr0()); 1989 db_printf("cr2\t0x%08x\n", rcr2()); 1990 db_printf("cr3\t0x%08x\n", rcr3()); 1991 db_printf("cr4\t0x%08x\n", rcr4()); 1992} 1993#endif 1994 1995void 1996sdtossd(sd, ssd) 1997 struct segment_descriptor *sd; 1998 struct soft_segment_descriptor *ssd; 1999{ 2000 ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; 2001 ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; 2002 ssd->ssd_type = sd->sd_type; 2003 ssd->ssd_dpl = sd->sd_dpl; 2004 ssd->ssd_p = sd->sd_p; 2005 ssd->ssd_def32 = sd->sd_def32; 2006 ssd->ssd_gran = sd->sd_gran; 2007} 2008 2009#ifndef XEN 2010static int 2011add_smap_entry(struct bios_smap *smap, vm_paddr_t *physmap, int *physmap_idxp) 2012{ 2013 int i, insert_idx, physmap_idx; 2014 2015 physmap_idx = *physmap_idxp; 2016 2017 if (boothowto & RB_VERBOSE) 2018 printf("SMAP type=%02x base=%016llx len=%016llx\n", 2019 smap->type, smap->base, smap->length); 2020 2021 if (smap->type != SMAP_TYPE_MEMORY) 2022 return (1); 2023 2024 if (smap->length == 0) 2025 return (1); 2026 2027#ifndef PAE 2028 if (smap->base > 0xffffffff) { 2029 printf("%uK of memory above 4GB ignored\n", 2030 (u_int)(smap->length / 1024)); 2031 return (1); 2032 } 2033#endif 2034 2035 /* 2036 * Find insertion point while checking for overlap. Start off by 2037 * assuming the new entry will be added to the end. 
2038 */ 2039 insert_idx = physmap_idx + 2; 2040 for (i = 0; i <= physmap_idx; i += 2) { 2041 if (smap->base < physmap[i + 1]) { 2042 if (smap->base + smap->length <= physmap[i]) { 2043 insert_idx = i; 2044 break; 2045 } 2046 if (boothowto & RB_VERBOSE) 2047 printf( 2048 "Overlapping memory regions, ignoring second region\n"); 2049 return (1); 2050 } 2051 } 2052 2053 /* See if we can prepend to the next entry. */ 2054 if (insert_idx <= physmap_idx && 2055 smap->base + smap->length == physmap[insert_idx]) { 2056 physmap[insert_idx] = smap->base; 2057 return (1); 2058 } 2059 2060 /* See if we can append to the previous entry. */ 2061 if (insert_idx > 0 && smap->base == physmap[insert_idx - 1]) { 2062 physmap[insert_idx - 1] += smap->length; 2063 return (1); 2064 } 2065 2066 physmap_idx += 2; 2067 *physmap_idxp = physmap_idx; 2068 if (physmap_idx == PHYSMAP_SIZE) { 2069 printf( 2070 "Too many segments in the physical address map, giving up\n"); 2071 return (0); 2072 } 2073 2074 /* 2075 * Move the last 'N' entries down to make room for the new 2076 * entry if needed. 2077 */ 2078 for (i = physmap_idx; i > insert_idx; i -= 2) { 2079 physmap[i] = physmap[i - 2]; 2080 physmap[i + 1] = physmap[i - 1]; 2081 } 2082 2083 /* Insert the new entry. */ 2084 physmap[insert_idx] = smap->base; 2085 physmap[insert_idx + 1] = smap->base + smap->length; 2086 return (1); 2087} 2088 2089static void 2090basemem_setup(void) 2091{ 2092 vm_paddr_t pa; 2093 pt_entry_t *pte; 2094 int i; 2095 2096 if (basemem > 640) { 2097 printf("Preposterous BIOS basemem of %uK, truncating to 640K\n", 2098 basemem); 2099 basemem = 640; 2100 } 2101 2102 /* 2103 * XXX if biosbasemem is now < 640, there is a `hole' 2104 * between the end of base memory and the start of 2105 * ISA memory. The hole may be empty or it may 2106 * contain BIOS code or data. Map it read/write so 2107 * that the BIOS can write to it. 
(Memory from 0 to 2108 * the physical end of the kernel is mapped read-only 2109 * to begin with and then parts of it are remapped. 2110 * The parts that aren't remapped form holes that 2111 * remain read-only and are unused by the kernel. 2112 * The base memory area is below the physical end of 2113 * the kernel and right now forms a read-only hole. 2114 * The part of it from PAGE_SIZE to 2115 * (trunc_page(biosbasemem * 1024) - 1) will be 2116 * remapped and used by the kernel later.) 2117 * 2118 * This code is similar to the code used in 2119 * pmap_mapdev, but since no memory needs to be 2120 * allocated we simply change the mapping. 2121 */ 2122 for (pa = trunc_page(basemem * 1024); 2123 pa < ISA_HOLE_START; pa += PAGE_SIZE) 2124 pmap_kenter(KERNBASE + pa, pa); 2125 2126 /* 2127 * Map pages between basemem and ISA_HOLE_START, if any, r/w into 2128 * the vm86 page table so that vm86 can scribble on them using 2129 * the vm86 map too. XXX: why 2 ways for this and only 1 way for 2130 * page 0, at least as initialized here? 2131 */ 2132 pte = (pt_entry_t *)vm86paddr; 2133 for (i = basemem / 4; i < 160; i++) 2134 pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U; 2135} 2136#endif 2137 2138/* 2139 * Populate the (physmap) array with base/bound pairs describing the 2140 * available physical memory in the system, then test this memory and 2141 * build the phys_avail array describing the actually-available memory. 2142 * 2143 * If we cannot accurately determine the physical memory map, then use 2144 * value from the 0xE801 call, and failing that, the RTC. 2145 * 2146 * Total memory size may be set by the kernel environment variable 2147 * hw.physmem or the compile-time define MAXMEM. 2148 * 2149 * XXX first should be vm_paddr_t. 
 */
static void
getmemsize(int first)
{
	int has_smap, off, physmap_idx, pa_indx, da_indx;
	u_long physmem_tunable, memtest;
	vm_paddr_t physmap[PHYSMAP_SIZE];	/* (start, end) byte-address pairs */
	pt_entry_t *pte;
	quad_t dcons_addr, dcons_size;
#ifndef XEN
	int hasbrokenint12, i, res;
	u_int extmem;
	struct vm86frame vmf;
	struct vm86context vmc;
	vm_paddr_t pa;
	struct bios_smap *smap, *smapbase, *smapend;
	u_int32_t smapsize;
	caddr_t kmdp;
#endif

	has_smap = 0;
#if defined(XEN)
	/* Under Xen PV the hypervisor tells us exactly how many pages we own. */
	Maxmem = xen_start_info->nr_pages - init_first;
	physmem = Maxmem;
	basemem = 0;
	physmap[0] = init_first << PAGE_SHIFT;
	physmap[1] = ptoa(Maxmem) - round_page(msgbufsize);
	physmap_idx = 0;
#else
#ifdef XBOX
	if (arch_i386_is_xbox) {
		/*
		 * We queried the memory size before, so chop off 4MB for
		 * the framebuffer and inform the OS of this.
		 */
		physmap[0] = 0;
		physmap[1] = (arch_i386_xbox_memsize * 1024 * 1024) - XBOX_FB_SIZE;
		physmap_idx = 0;
		goto physmap_done;
	}
#endif
	bzero(&vmf, sizeof(vmf));
	bzero(physmap, sizeof(physmap));
	basemem = 0;

	/*
	 * Check if the loader supplied an SMAP memory map.  If so,
	 * use that and do not make any VM86 calls.
	 */
	physmap_idx = 0;
	smapbase = NULL;
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf32 kernel");
	if (kmdp != NULL)
		smapbase = (struct bios_smap *)preload_search_info(kmdp,
		    MODINFO_METADATA | MODINFOMD_SMAP);
	if (smapbase != NULL) {
		/*
		 * subr_module.c says:
		 * "Consumer may safely assume that size value precedes data."
		 * ie: an int32_t immediately precedes SMAP.
		 */
		smapsize = *((u_int32_t *)smapbase - 1);
		smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
		has_smap = 1;

		for (smap = smapbase; smap < smapend; smap++)
			if (!add_smap_entry(smap, physmap, &physmap_idx))
				break;
		goto have_smap;
	}

	/*
	 * Some newer BIOSes have a broken INT 12H implementation
	 * which causes a kernel panic immediately.  In this case, we
	 * need use the SMAP to determine the base memory size.
	 */
	hasbrokenint12 = 0;
	TUNABLE_INT_FETCH("hw.hasbrokenint12", &hasbrokenint12);
	if (hasbrokenint12 == 0) {
		/* Use INT12 to determine base memory size. */
		vm86_intcall(0x12, &vmf);
		basemem = vmf.vmf_ax;
		basemem_setup();
	}

	/*
	 * Fetch the memory map with INT 15:E820.  Map page 1 R/W into
	 * the kernel page table so we can use it as a buffer.  The
	 * kernel will unmap this page later.
	 */
	pmap_kenter(KERNBASE + (1 << PAGE_SHIFT), 1 << PAGE_SHIFT);
	vmc.npages = 0;
	smap = (void *)vm86_addpage(&vmc, 1, KERNBASE + (1 << PAGE_SHIFT));
	res = vm86_getptr(&vmc, (vm_offset_t)smap, &vmf.vmf_es, &vmf.vmf_di);
	KASSERT(res != 0, ("vm86_getptr() failed: address not found"));

	vmf.vmf_ebx = 0;
	do {
		vmf.vmf_eax = 0xE820;
		vmf.vmf_edx = SMAP_SIG;
		vmf.vmf_ecx = sizeof(struct bios_smap);
		i = vm86_datacall(0x15, &vmf, &vmc);
		if (i || vmf.vmf_eax != SMAP_SIG)
			break;
		has_smap = 1;
		if (!add_smap_entry(smap, physmap, &physmap_idx))
			break;
	} while (vmf.vmf_ebx != 0);	/* %ebx == 0 marks the last E820 entry */

have_smap:
	/*
	 * If we didn't fetch the "base memory" size from INT12,
	 * figure it out from the SMAP (or just guess).
	 */
	if (basemem == 0) {
		for (i = 0; i <= physmap_idx; i += 2) {
			if (physmap[i] == 0x00000000) {
				basemem = physmap[i + 1] / 1024;
				break;
			}
		}

		/* XXX: If we couldn't find basemem from SMAP, just guess. */
		if (basemem == 0)
			basemem = 640;
		basemem_setup();
	}

	if (physmap[1] != 0)
		goto physmap_done;

	/*
	 * If we failed to find an SMAP, figure out the extended
	 * memory size.  We will then build a simple memory map with
	 * two segments, one for "base memory" and the second for
	 * "extended memory".  Note that "extended memory" starts at a
	 * physical address of 1MB and that both basemem and extmem
	 * are in units of 1KB.
	 *
	 * First, try to fetch the extended memory size via INT 15:E801.
	 */
	vmf.vmf_ax = 0xE801;
	if (vm86_intcall(0x15, &vmf) == 0) {
		extmem = vmf.vmf_cx + vmf.vmf_dx * 64;
	} else {
		/*
		 * If INT15:E801 fails, this is our last ditch effort
		 * to determine the extended memory size.  Currently
		 * we prefer the RTC value over INT15:88.
		 */
#if 0
		vmf.vmf_ah = 0x88;
		vm86_intcall(0x15, &vmf);
		extmem = vmf.vmf_ax;
#else
		extmem = rtcin(RTC_EXTLO) + (rtcin(RTC_EXTHI) << 8);
#endif
	}

	/*
	 * Special hack for chipsets that still remap the 384k hole when
	 * there's 16MB of memory - this really confuses people that
	 * are trying to use bus mastering ISA controllers with the
	 * "16MB limit"; they only have 16MB, but the remapping puts
	 * them beyond the limit.
	 *
	 * If extended memory is between 15-16MB (16-17MB phys address range),
	 * chop it to 15MB.
	 */
	if ((extmem > 15 * 1024) && (extmem < 16 * 1024))
		extmem = 15 * 1024;

	physmap[0] = 0;
	physmap[1] = basemem * 1024;
	physmap_idx = 2;
	physmap[physmap_idx] = 0x100000;
	physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024;

physmap_done:
#endif
	/*
	 * Now, physmap contains a map of physical memory.
	 */

#ifdef SMP
	/* make hole for AP bootstrap code */
	physmap[1] = mp_bootaddress(physmap[1]);
#endif

	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".  We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
	Maxmem = MAXMEM / 4;
#endif

	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
		Maxmem = atop(physmem_tunable);

	/*
	 * If we have an SMAP, don't allow MAXMEM or hw.physmem to extend
	 * the amount of memory in the system.
	 */
	if (has_smap && Maxmem > atop(physmap[physmap_idx + 1]))
		Maxmem = atop(physmap[physmap_idx + 1]);

	/*
	 * By default enable the memory test on real hardware, and disable
	 * it if we appear to be running in a VM.  This avoids touching all
	 * pages unnecessarily, which doesn't matter on real hardware but is
	 * bad for shared VM hosts.  Use a general name so that
	 * one could eventually do more with the code than just disable it.
	 */
	memtest = (vm_guest > VM_GUEST_NO) ? 0 : 1;
	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE))
		printf("Physical memory use set to %ldK\n", Maxmem * 4);

	/*
	 * If Maxmem has been increased beyond what the system has detected,
	 * extend the last memory segment to the new limit.
	 */
	if (atop(physmap[physmap_idx + 1]) < Maxmem)
		physmap[physmap_idx + 1] = ptoa((vm_paddr_t)Maxmem);

	/* call pmap initialization to make new kernel address space */
	pmap_bootstrap(first);

	/*
	 * Size up each available chunk of physical memory.
	 */
	physmap[0] = PAGE_SIZE;		/* mask off page 0 */
	pa_indx = 0;
	da_indx = 1;
	phys_avail[pa_indx++] = physmap[0];
	phys_avail[pa_indx] = physmap[0];
	dump_avail[da_indx] = physmap[0];
	pte = CMAP3;

	/*
	 * Get dcons buffer address
	 */
	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    getenv_quad("dcons.size", &dcons_size) == 0)
		dcons_addr = 0;

#ifndef XEN
	/*
	 * physmap is in bytes, so when converting to page boundaries,
	 * round up the start address and round down the end address.
	 */
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t end;

		end = ptoa((vm_paddr_t)Maxmem);
		if (physmap[i + 1] < end)
			end = trunc_page(physmap[i + 1]);
		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
			int tmp, page_bad, full;
			int *ptr = (int *)CADDR3;

			full = FALSE;
			/*
			 * block out kernel memory as not available.
			 */
			if (pa >= KERNLOAD && pa < first)
				goto do_dump_avail;

			/*
			 * block out dcons buffer
			 */
			if (dcons_addr > 0
			    && pa >= trunc_page(dcons_addr)
			    && pa < dcons_addr + dcons_size)
				goto do_dump_avail;

			page_bad = FALSE;
			if (memtest == 0)
				goto skip_memtest;

			/*
			 * map page into kernel: valid, read/write,non-cacheable
			 */
			*pte = pa | PG_V | PG_RW | PG_N;
			invltlb();

			tmp = *(int *)ptr;
			/*
			 * Test for alternating 1's and 0's
			 */
			*(volatile int *)ptr = 0xaaaaaaaa;
			if (*(volatile int *)ptr != 0xaaaaaaaa)
				page_bad = TRUE;
			/*
			 * Test for alternating 0's and 1's
			 */
			*(volatile int *)ptr = 0x55555555;
			if (*(volatile int *)ptr != 0x55555555)
				page_bad = TRUE;
			/*
			 * Test for all 1's
			 */
			*(volatile int *)ptr = 0xffffffff;
			if (*(volatile int *)ptr != 0xffffffff)
				page_bad = TRUE;
			/*
			 * Test for all 0's
			 */
			*(volatile int *)ptr = 0x0;
			if (*(volatile int *)ptr != 0x0)
				page_bad = TRUE;
			/*
			 * Restore original value.
			 */
			*(int *)ptr = tmp;

skip_memtest:
			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == TRUE)
				continue;
			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer.  Otherwise start a new chunk.
			 * Note that "end" points one higher than end,
			 * making the range >= start and < end.
			 * If we're also doing a speculative memory
			 * test and we at or past the end, bump up Maxmem
			 * so that we keep going.  The first bad page
			 * will terminate the loop.
			 */
			if (phys_avail[pa_indx] == pa) {
				phys_avail[pa_indx] += PAGE_SIZE;
			} else {
				pa_indx++;
				if (pa_indx == PHYS_AVAIL_ARRAY_END) {
					printf(
		"Too many holes in the physical address space, giving up\n");
					pa_indx--;
					full = TRUE;
					goto do_dump_avail;
				}
				phys_avail[pa_indx++] = pa;	/* start */
				phys_avail[pa_indx] = pa + PAGE_SIZE;	/* end */
			}
			physmem++;
do_dump_avail:
			if (dump_avail[da_indx] == pa) {
				dump_avail[da_indx] += PAGE_SIZE;
			} else {
				da_indx++;
				if (da_indx == DUMP_AVAIL_ARRAY_END) {
					da_indx--;
					goto do_next;
				}
				dump_avail[da_indx++] = pa;	/* start */
				dump_avail[da_indx] = pa + PAGE_SIZE;	/* end */
			}
do_next:
			if (full)
				break;
		}
	}
	*pte = 0;
	invltlb();
#else
	phys_avail[0] = physfree;
	phys_avail[1] = xen_start_info->nr_pages*PAGE_SIZE;
	dump_avail[0] = 0;
	dump_avail[1] = xen_start_info->nr_pages*PAGE_SIZE;

#endif

	/*
	 * XXX
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}

	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= round_page(msgbufsize);

	/* Map the message buffer. */
	for (off = 0; off < round_page(msgbufsize); off += PAGE_SIZE)
		pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] +
		    off);

	PT_UPDATES_FLUSH();
}

#ifdef XEN
#define MTOPSIZE (1<<(14 + PAGE_SHIFT))

/*
 * Machine-dependent startup for the Xen PV kernel.
 * NOTE(review): 'first' is used below as the first free physical address
 * after the loaded kernel (it is handed to pmap_kenter/dpcpu_init and to
 * getmemsize()) -- confirm against the locore caller.
 */
void
init386(first)
	int first;
{
	unsigned long gdtmachpfn;
	int error, gsel_tss, metadata_missing, x, pa;
	size_t kstack0_sz;
	struct pcpu *pc;
	struct callback_register event = {
		.type = CALLBACKTYPE_event,
		.address = {GSEL(GCODE_SEL, SEL_KPL), (unsigned long)Xhypervisor_callback },
	};
	struct callback_register failsafe = {
		.type = CALLBACKTYPE_failsafe,
		.address = {GSEL(GCODE_SEL, SEL_KPL), (unsigned long)failsafe_callback },
	};

	thread0.td_kstack = proc0kstack;
	thread0.td_kstack_pages = KSTACK_PAGES;
	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
	thread0.td_pcb = (struct pcb *)(thread0.td_kstack + kstack0_sz) - 1;

	/*
	 * This may be done better later if it gets more high level
	 * components in it. If so just link td->td_proc here.
	 */
	proc_linkup0(&proc0, &thread0);

	metadata_missing = 0;
	if (xen_start_info->mod_start) {
		preload_metadata = (caddr_t)xen_start_info->mod_start;
		preload_bootstrap_relocate(KERNBASE);
	} else {
		metadata_missing = 1;
	}
	if (envmode == 1)
		kern_envp = static_env;
	else if ((caddr_t)xen_start_info->cmd_line)
		kern_envp = xen_setbootenv((caddr_t)xen_start_info->cmd_line);

	boothowto |= xen_boothowto(kern_envp);

	/* Init basic tunables, hz etc */
	init_param1();

	/*
	 * XEN occupies a portion of the upper virtual address space
	 * At its base it manages an array mapping machine page frames
	 * to physical page frames - hence we need to be able to
	 * access 4GB - (64MB  - 4MB + 64k)
	 */
	gdt_segs[GPRIV_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
	gdt_segs[GUFS_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
	gdt_segs[GUGS_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
	gdt_segs[GCODE_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
	gdt_segs[GDATA_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
	gdt_segs[GUCODE_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
	gdt_segs[GUDATA_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
	gdt_segs[GBIOSLOWMEM_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);

	pc = &__pcpu[0];
	gdt_segs[GPRIV_SEL].ssd_base = (int) pc;
	gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss;

	/* GDT page must be writable while we fill it in, read-only after. */
	PT_SET_MA(gdt, xpmap_ptom(VTOP(gdt)) | PG_V | PG_RW);
	bzero(gdt, PAGE_SIZE);
	for (x = 0; x < NGDT; x++)
		ssdtosd(&gdt_segs[x], &gdt[x].sd);

	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN);

	/* Hand the (now read-only) GDT frame to the hypervisor. */
	gdtmachpfn = vtomach(gdt) >> PAGE_SHIFT;
	PT_SET_MA(gdt, xpmap_ptom(VTOP(gdt)) | PG_V);
	PANIC_IF(HYPERVISOR_set_gdt(&gdtmachpfn, 512) != 0);
	lgdt(&r_gdt);
	gdtset = 1;

	if ((error = HYPERVISOR_set_trap_table(trap_table)) != 0) {
		panic("set_trap_table failed - error %d\n", error);
	}

	error = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
	if (error == 0)
		error = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
#if	CONFIG_XEN_COMPAT <= 0x030002
	/* Fall back to the pre-3.0.3 callback registration hypercall. */
	if (error == -ENOXENSYS)
		HYPERVISOR_set_callbacks(GSEL(GCODE_SEL, SEL_KPL),
		    (unsigned long)Xhypervisor_callback,
		    GSEL(GCODE_SEL, SEL_KPL), (unsigned long)failsafe_callback);
#endif
	pcpu_init(pc, 0, sizeof(struct pcpu));
	for (pa = first; pa < first + DPCPU_SIZE; pa += PAGE_SIZE)
		pmap_kenter(pa + KERNBASE, pa);
	dpcpu_init((void *)(first + KERNBASE), 0);
	first += DPCPU_SIZE;
	physfree += DPCPU_SIZE;
	init_first += DPCPU_SIZE / PAGE_SIZE;

	PCPU_SET(prvspace, pc);
	PCPU_SET(curthread, &thread0);
	PCPU_SET(curpcb, thread0.td_pcb);

	/*
	 * Initialize mutexes.
	 *
	 * icu_lock: in order to allow an interrupt to occur in a critical
	 *	     section, to set pcpu->ipending (etc...) properly, we
	 *	     must be able to get the icu lock, so it can't be
	 *	     under witness.
	 */
	mutex_init();
	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE);

	/* make ldt memory segments */
	PT_SET_MA(ldt, xpmap_ptom(VTOP(ldt)) | PG_V | PG_RW);
	bzero(ldt, PAGE_SIZE);
	ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1);
	ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1);
	for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++)
		ssdtosd(&ldt_segs[x], &ldt[x].sd);

	default_proc_ldt.ldt_base = (caddr_t)ldt;
	default_proc_ldt.ldt_len = 6;
	_default_ldt = (int)&default_proc_ldt;
	PCPU_SET(currentldt, _default_ldt);
	/* Xen requires the LDT page to be read-only before xen_set_ldt(). */
	PT_SET_MA(ldt, *vtopte((unsigned long)ldt) & ~PG_RW);
	xen_set_ldt((unsigned long) ldt, (sizeof ldt_segs / sizeof ldt_segs[0]));

#if defined(XEN_PRIVILEGED)
	/*
	 * Initialize the i8254 before the console so that console
	 * initialization can use DELAY().
	 */
	i8254_init();
#endif

	/*
	 * Initialize the console before we print anything out.
	 */
	cninit();

	if (metadata_missing)
		printf("WARNING: loader(8) metadata is missing!\n");

#ifdef DEV_ISA
#ifdef DEV_ATPIC
	elcr_probe();
	atpic_startup();
#else
	/* Reset and mask the atpics and leave them shut down. */
	atpic_reset();

	/*
	 * Point the ICU spurious interrupt vectors at the APIC spurious
	 * interrupt handler.
	 */
	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
#endif
#endif

#ifdef DDB
	ksym_start = bootinfo.bi_symtab;
	ksym_end = bootinfo.bi_esymtab;
#endif

	kdb_init();

#ifdef KDB
	if (boothowto & RB_KDB)
		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif

	finishidentcpu();	/* Final stage of CPU initialization */
	setidt(IDT_UD, &IDTVEC(ill),  SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_GP, &IDTVEC(prot),  SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	initializecpu();	/* Initialize CPU registers */

	/* make an initial tss so cpu can get interrupt stack on syscall! */
	/* Note: -16 is so we can grow the trapframe if we came from vm86 */
	PCPU_SET(common_tss.tss_esp0, thread0.td_kstack +
	    kstack0_sz - sizeof(struct pcb) - 16);
	PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	/* Under Xen the kernel stack switch is a hypercall, not an ltr. */
	HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL),
	    PCPU_GET(common_tss.tss_esp0));

	/* pointer to selector slot for %fs/%gs */
	PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd);

	dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 =
	    dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)];
	dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 =
	    dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL);
#ifdef PAE
	dblfault_tss.tss_cr3 = (int)IdlePDPT;
#else
	dblfault_tss.tss_cr3 = (int)IdlePTD;
#endif
	dblfault_tss.tss_eip = (int)dblfault_handler;
	dblfault_tss.tss_eflags = PSL_KERNEL;
	dblfault_tss.tss_ds = dblfault_tss.tss_es =
	    dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL);
	dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL);
	dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL);
	dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL);

	vm86_initialize();
	getmemsize(first);
	init_param2(physmem);

	/* now running on new page tables, configured,and u/iom is accessible */

	msgbufinit(msgbufp, msgbufsize);
	/* transfer to user mode */

	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);

	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0;
#ifdef PAE
	thread0.td_pcb->pcb_cr3 = (int)IdlePDPT;
#else
	thread0.td_pcb->pcb_cr3 = (int)IdlePTD;
#endif
	thread0.td_pcb->pcb_ext = 0;
	thread0.td_frame = &proc0_tf;
	thread0.td_pcb->pcb_fsd = PCPU_GET(fsgs_gdt)[0];
	thread0.td_pcb->pcb_gsd = PCPU_GET(fsgs_gdt)[1];

	cpu_probe_amdc1e();
}

#else
/*
 * Machine-dependent startup for the native (non-Xen) i386 kernel.
 * NOTE(review): 'first' is used as the first free physical address after
 * the loaded kernel (see the DPCPU mapping loop and getmemsize(first)) --
 * confirm against the locore caller.
 */
void
init386(first)
	int first;
{
	struct gate_descriptor *gdp;
	int gsel_tss, metadata_missing, x, pa;
	size_t kstack0_sz;
	struct pcpu *pc;

	thread0.td_kstack = proc0kstack;
	thread0.td_kstack_pages = KSTACK_PAGES;
	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
	thread0.td_pcb = (struct pcb *)(thread0.td_kstack + kstack0_sz) - 1;

	/*
	 * This may be done better later if it gets more high level
	 * components in it. If so just link td->td_proc here.
	 */
	proc_linkup0(&proc0, &thread0);

	metadata_missing = 0;
	if (bootinfo.bi_modulep) {
		preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE;
		preload_bootstrap_relocate(KERNBASE);
	} else {
		metadata_missing = 1;
	}
	if (envmode == 1)
		kern_envp = static_env;
	else if (bootinfo.bi_envp)
		kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE;

	/* Init basic tunables, hz etc */
	init_param1();

	/*
	 * Make gdt memory segments.
	 * All segments cover the full 4GB
	 * of address space and permissions are enforced at page level.
	 */
	gdt_segs[GCODE_SEL].ssd_limit = atop(0 - 1);
	gdt_segs[GDATA_SEL].ssd_limit = atop(0 - 1);
	gdt_segs[GUCODE_SEL].ssd_limit = atop(0 - 1);
	gdt_segs[GUDATA_SEL].ssd_limit = atop(0 - 1);
	gdt_segs[GUFS_SEL].ssd_limit = atop(0 - 1);
	gdt_segs[GUGS_SEL].ssd_limit = atop(0 - 1);

	pc = &__pcpu[0];
	gdt_segs[GPRIV_SEL].ssd_limit = atop(0 - 1);
	gdt_segs[GPRIV_SEL].ssd_base = (int) pc;
	gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss;

	for (x = 0; x < NGDT; x++)
		ssdtosd(&gdt_segs[x], &gdt[x].sd);

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (int) gdt;
	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN);
	lgdt(&r_gdt);

	pcpu_init(pc, 0, sizeof(struct pcpu));
	for (pa = first; pa < first + DPCPU_SIZE; pa += PAGE_SIZE)
		pmap_kenter(pa + KERNBASE, pa);
	dpcpu_init((void *)(first + KERNBASE), 0);
	first += DPCPU_SIZE;
	PCPU_SET(prvspace, pc);
	PCPU_SET(curthread, &thread0);
	PCPU_SET(curpcb, thread0.td_pcb);

	/*
	 * Initialize mutexes.
	 *
	 * icu_lock: in order to allow an interrupt to occur in a critical
	 *	     section, to set pcpu->ipending (etc...) properly, we
	 *	     must be able to get the icu lock, so it can't be
	 *	     under witness.
	 */
	mutex_init();
	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE);

	/* make ldt memory segments */
	ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1);
	ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1);
	for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++)
		ssdtosd(&ldt_segs[x], &ldt[x].sd);

	_default_ldt = GSEL(GLDT_SEL, SEL_KPL);
	lldt(_default_ldt);
	PCPU_SET(currentldt, _default_ldt);

	/* exceptions: first point every vector at "reserved", then fill in */
	for (x = 0; x < NIDT; x++)
		setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL,
		    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_DE, &IDTVEC(div), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_BP, &IDTVEC(bpt), SDT_SYS386IGT, SEL_UPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_OF, &IDTVEC(ofl), SDT_SYS386TGT, SEL_UPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_BR, &IDTVEC(bnd), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_NM, &IDTVEC(dna), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	/* Double fault goes through a separate task gate (see dblfault_tss). */
	setidt(IDT_DF, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL));
	setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_TS, &IDTVEC(tss), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_NP, &IDTVEC(missing), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_SS, &IDTVEC(stk), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_PF, &IDTVEC(page), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_MF, &IDTVEC(fpu), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_AC, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_XF, &IDTVEC(xmm), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), SDT_SYS386TGT, SEL_UPL,
	    GSEL(GCODE_SEL, SEL_KPL));
#ifdef KDTRACE_HOOKS
	setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret), SDT_SYS386TGT, SEL_UPL,
	    GSEL(GCODE_SEL, SEL_KPL));
#endif
#ifdef XENHVM
	setidt(IDT_EVTCHN, &IDTVEC(xen_intr_upcall), SDT_SYS386IGT, SEL_UPL,
	    GSEL(GCODE_SEL, SEL_KPL));
#endif

	r_idt.rd_limit = sizeof(idt0) - 1;
	r_idt.rd_base = (int) idt;
	lidt(&r_idt);

#ifdef XBOX
	/*
	 * The following code queries the PCI ID of 0:0:0. For the XBOX,
	 * This should be 0x10de / 0x02a5.
	 *
	 * This is exactly what Linux does.
	 */
	outl(0xcf8, 0x80000000);
	if (inl(0xcfc) == 0x02a510de) {
		arch_i386_is_xbox = 1;
		pic16l_setled(XBOX_LED_GREEN);

		/*
		 * We are an XBOX, but we may have either 64MB or 128MB of
		 * memory. The PCI host bridge should be programmed for this,
		 * so we just query it.
		 */
		outl(0xcf8, 0x80000084);
		arch_i386_xbox_memsize = (inl(0xcfc) == 0x7FFFFFF) ? 128 : 64;
	}
#endif /* XBOX */

	/*
	 * Initialize the i8254 before the console so that console
	 * initialization can use DELAY().
	 */
	i8254_init();

	/*
	 * Initialize the console before we print anything out.
	 */
	cninit();

	if (metadata_missing)
		printf("WARNING: loader(8) metadata is missing!\n");

#ifdef DEV_ISA
#ifdef DEV_ATPIC
	elcr_probe();
	atpic_startup();
#else
	/* Reset and mask the atpics and leave them shut down. */
	atpic_reset();

	/*
	 * Point the ICU spurious interrupt vectors at the APIC spurious
	 * interrupt handler.
	 */
	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
#endif
#endif

#ifdef DDB
	ksym_start = bootinfo.bi_symtab;
	ksym_end = bootinfo.bi_esymtab;
#endif

	kdb_init();

#ifdef KDB
	if (boothowto & RB_KDB)
		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif

	finishidentcpu();	/* Final stage of CPU initialization */
	setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	initializecpu();	/* Initialize CPU registers */

	/* make an initial tss so cpu can get interrupt stack on syscall! */
	/* Note: -16 is so we can grow the trapframe if we came from vm86 */
	PCPU_SET(common_tss.tss_esp0, thread0.td_kstack +
	    kstack0_sz - sizeof(struct pcb) - 16);
	PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd);
	PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
	PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16);
	ltr(gsel_tss);

	/* pointer to selector slot for %fs/%gs */
	PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd);

	dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 =
	    dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)];
	dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 =
	    dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL);
#ifdef PAE
	dblfault_tss.tss_cr3 = (int)IdlePDPT;
#else
	dblfault_tss.tss_cr3 = (int)IdlePTD;
#endif
	dblfault_tss.tss_eip = (int)dblfault_handler;
	dblfault_tss.tss_eflags = PSL_KERNEL;
	dblfault_tss.tss_ds = dblfault_tss.tss_es =
	    dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL);
	dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL);
	dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL);
	dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL);

	vm86_initialize();
	getmemsize(first);
	init_param2(physmem);

	/* now running on new page tables, configured,and u/iom is accessible */

	msgbufinit(msgbufp, msgbufsize);

	/* make a call gate to reenter kernel with */
	gdp = &ldt[LSYS5CALLS_SEL].gd;

	x = (int) &IDTVEC(lcall_syscall);
	gdp->gd_looffset = x;
	gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL);
	gdp->gd_stkcpy = 1;
	gdp->gd_type = SDT_SYS386CGT;
	gdp->gd_dpl = SEL_UPL;
	gdp->gd_p = 1;
	gdp->gd_hioffset = x >> 16;

	/* XXX does this work? */
	/* XXX yes! */
	ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL];
	ldt[LSOL26CALLS_SEL] = ldt[LSYS5CALLS_SEL];

	/* transfer to user mode */

	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);

	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0;
#ifdef PAE
	thread0.td_pcb->pcb_cr3 = (int)IdlePDPT;
#else
	thread0.td_pcb->pcb_cr3 = (int)IdlePTD;
#endif
	thread0.td_pcb->pcb_ext = 0;
	thread0.td_frame = &proc0_tf;

	cpu_probe_amdc1e();

#ifdef FDT
	x86_init_fdt();
#endif
}
#endif

/*
 * Per-CPU data initialization hook.  On i386 this only seeds pc_acpi_id
 * with a sentinel; presumably the real ACPI id is filled in later by the
 * ACPI code -- TODO confirm.
 */
void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{

	pcpu->pc_acpi_id = 0xffffffff;
}

/*
 * Sysctl handler that exports the loader-supplied BIOS SMAP (plus the
 * extended-attribute array, when the loader provided one) as an array of
 * struct bios_smap_xattr records.
 */
static int
smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct bios_smap *smapbase;
	struct bios_smap_xattr smap;
	caddr_t kmdp;
	uint32_t *smapattr;
	int count, error, i;

	/* Retrieve the system memory map from the loader. */
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf32 kernel");
	smapbase = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (smapbase == NULL)
		return (0);
	smapattr = (uint32_t *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
	/* A u_int32_t byte count immediately precedes the SMAP data. */
	count = *((u_int32_t *)smapbase - 1) / sizeof(*smapbase);
	error = 0;
	for (i = 0; i < count; i++) {
		smap.base = smapbase[i].base;
		smap.length = smapbase[i].length;
		smap.type = smapbase[i].type;
		if (smapattr != NULL)
			smap.xattr = smapattr[i];
		else
			smap.xattr = 0;
		error = SYSCTL_OUT(req, &smap, sizeof(smap));
	}
	return (error);
}
SYSCTL_PROC(_machdep, OID_AUTO, smap, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
    smap_sysctl_handler, "S,bios_smap_xattr", "Raw BIOS SMAP data");

/*
 * Enter a spinlock section: interrupts are disabled and the saved flags
 * recorded on the first (outermost) entry; nesting is tracked per-thread.
 */
void
spinlock_enter(void)
{
	struct thread *td;
	register_t
	    flags;

	td = curthread;
	if (td->td_md.md_spinlock_count == 0) {
		/* Outermost entry: disable interrupts and save the flags. */
		flags = intr_disable();
		td->td_md.md_spinlock_count = 1;
		td->td_md.md_saved_flags = flags;
	} else
		td->td_md.md_spinlock_count++;
	critical_enter();
}

/*
 * Leave a spinlock section; interrupts are restored only when the
 * outermost nesting level is exited.
 */
void
spinlock_exit(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	critical_exit();
	flags = td->td_md.md_saved_flags;
	td->td_md.md_spinlock_count--;
	if (td->td_md.md_spinlock_count == 0)
		intr_restore(flags);
}

#if defined(I586_CPU) && !defined(NO_F00F_HACK)
static void f00f_hack(void *unused);
SYSINIT(f00f_hack, SI_SUB_INTRINSIC, SI_ORDER_FIRST, f00f_hack, NULL);

/*
 * Work around the Pentium F00F erratum by moving the IDT so that the
 * vulnerable entry straddles a page boundary and the page is read-only.
 */
static void
f00f_hack(void *unused)
{
	struct gate_descriptor *new_idt;
	vm_offset_t tmp;

	if (!has_f00f_bug)
		return;

	GIANT_REQUIRED;

	printf("Intel Pentium detected, installing workaround for F00F bug\n");

	tmp = kmem_malloc(kernel_arena, PAGE_SIZE * 2, M_WAITOK | M_ZERO);
	if (tmp == 0)
		panic("kmem_malloc returned 0");

	/* Put the problematic entry (#6) at the end of the lower page. */
	new_idt = (struct gate_descriptor*)
	    (tmp + PAGE_SIZE - 7 * sizeof(struct gate_descriptor));
	bcopy(idt, new_idt, sizeof(idt0));
	r_idt.rd_base = (u_int)new_idt;
	lidt(&r_idt);
	idt = new_idt;
	pmap_protect(kernel_pmap, tmp, tmp + PAGE_SIZE, VM_PROT_READ);
}
#endif /* defined(I586_CPU) && !NO_F00F_HACK */

/*
 * Construct a PCB from a trapframe. This is called from kdb_trap() where
 * we want to start a backtrace from the function that caused us to enter
 * the debugger. We have the context in the trapframe, but base the trace
 * on the PCB. The PCB doesn't have to be perfect, as long as it contains
 * enough for a backtrace.
 */
void
makectx(struct trapframe *tf, struct pcb *pcb)
{

	pcb->pcb_edi = tf->tf_edi;
	pcb->pcb_esi = tf->tf_esi;
	pcb->pcb_ebp = tf->tf_ebp;
	pcb->pcb_ebx = tf->tf_ebx;
	pcb->pcb_eip = tf->tf_eip;
	/* Kernel-mode traps have no saved %esp; it sat just above the frame. */
	pcb->pcb_esp = (ISPL(tf->tf_cs)) ? tf->tf_esp : (int)(tf + 1) - 8;
}

/* Set the instruction pointer of a thread being debugged. */
int
ptrace_set_pc(struct thread *td, u_long addr)
{

	td->td_frame->tf_eip = addr;
	return (0);
}

/* Arm the trace flag (PSL_T) so the thread traps after one instruction. */
int
ptrace_single_step(struct thread *td)
{
	td->td_frame->tf_eflags |= PSL_T;
	return (0);
}

/* Clear the trace flag (PSL_T). */
int
ptrace_clear_single_step(struct thread *td)
{
	td->td_frame->tf_eflags &= ~PSL_T;
	return (0);
}

/*
 * Copy a thread's register state into *regs; %gs comes from the PCB,
 * everything else from the trapframe via fill_frame_regs().
 */
int
fill_regs(struct thread *td, struct reg *regs)
{
	struct pcb *pcb;
	struct trapframe *tp;

	tp = td->td_frame;
	pcb = td->td_pcb;
	regs->r_gs = pcb->pcb_gs;
	return (fill_frame_regs(tp, regs));
}

/* Copy a trapframe's register values into a struct reg. */
int
fill_frame_regs(struct trapframe *tp, struct reg *regs)
{
	regs->r_fs = tp->tf_fs;
	regs->r_es = tp->tf_es;
	regs->r_ds = tp->tf_ds;
	regs->r_edi = tp->tf_edi;
	regs->r_esi = tp->tf_esi;
	regs->r_ebp = tp->tf_ebp;
	regs->r_ebx = tp->tf_ebx;
	regs->r_edx = tp->tf_edx;
	regs->r_ecx = tp->tf_ecx;
	regs->r_eax = tp->tf_eax;
	regs->r_eip = tp->tf_eip;
	regs->r_cs = tp->tf_cs;
	regs->r_eflags = tp->tf_eflags;
	regs->r_esp = tp->tf_esp;
	regs->r_ss = tp->tf_ss;
	return (0);
}

/*
 * Install user-supplied register state into a thread, refusing values of
 * %eflags or %cs that would grant extra privilege (EFL_SECURE/CS_SECURE).
 */
int
set_regs(struct thread *td, struct reg *regs)
{
	struct pcb *pcb;
	struct trapframe *tp;

	tp = td->td_frame;
	if (!EFL_SECURE(regs->r_eflags, tp->tf_eflags) ||
	    !CS_SECURE(regs->r_cs))
		return (EINVAL);
	pcb = td->td_pcb;
	tp->tf_fs = regs->r_fs;
	tp->tf_es = regs->r_es;
	tp->tf_ds = regs->r_ds;
	tp->tf_edi = regs->r_edi;
	tp->tf_esi = regs->r_esi;
	tp->tf_ebp = regs->r_ebp;
	tp->tf_ebx = regs->r_ebx;
	tp->tf_edx = regs->r_edx;
	tp->tf_ecx = regs->r_ecx;
	tp->tf_eax = regs->r_eax;
	tp->tf_eip = regs->r_eip;
	tp->tf_cs = regs->r_cs;
	tp->tf_eflags = regs->r_eflags;
	tp->tf_esp = regs->r_esp;
	tp->tf_ss = regs->r_ss;
	pcb->pcb_gs = regs->r_gs;
	return (0);
}

#ifdef CPU_ENABLE_SSE
/*
 * Convert an FXSAVE-format (savexmm) FPU area into the legacy save87
 * layout: copy the control/status environment field by field and the
 * eight stack registers.
 */
static void
fill_fpregs_xmm(sv_xmm, sv_87)
	struct savexmm *sv_xmm;
	struct save87 *sv_87;
{
	register struct env87 *penv_87 = &sv_87->sv_env;
	register struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	bzero(sv_87, sizeof(*sv_87));

	/* FPU control/status */
	penv_87->en_cw = penv_xmm->en_cw;
	penv_87->en_sw = penv_xmm->en_sw;
	penv_87->en_tw = penv_xmm->en_tw;
	penv_87->en_fip = penv_xmm->en_fip;
	penv_87->en_fcs = penv_xmm->en_fcs;
	penv_87->en_opcode = penv_xmm->en_opcode;
	penv_87->en_foo = penv_xmm->en_foo;
	penv_87->en_fos = penv_xmm->en_fos;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc;
}

/*
 * Inverse of fill_fpregs_xmm(): copy a legacy save87 area into the
 * FXSAVE-format (savexmm) layout.
 */
static void
set_fpregs_xmm(sv_87, sv_xmm)
	struct save87 *sv_87;
	struct savexmm *sv_xmm;
{
	register struct env87 *penv_87 = &sv_87->sv_env;
	register struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* FPU control/status */
	penv_xmm->en_cw = penv_87->en_cw;
	penv_xmm->en_sw = penv_87->en_sw;
	penv_xmm->en_tw = penv_87->en_tw;
	penv_xmm->en_fip = penv_87->en_fip;
	penv_xmm->en_fcs = penv_87->en_fcs;
	penv_xmm->en_opcode = penv_87->en_opcode;
	penv_xmm->en_foo = penv_87->en_foo;
	penv_xmm->en_fos = penv_87->en_fos;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i];
}
#endif /* CPU_ENABLE_SSE */

/*
 * Copy a thread's FPU state into *fpregs (legacy save87 layout); the
 * thread must be the caller, suspended, or stopped.
 */
int
fill_fpregs(struct thread *td, struct fpreg *fpregs)
{

	KASSERT(td == curthread || TD_IS_SUSPENDED(td) ||
	    P_SHOULDSTOP(td->td_proc),
	    ("not suspended thread %p", td));
#ifdef DEV_NPX
	npxgetregs(td);
#else
	bzero(fpregs, sizeof(*fpregs));
#endif
#ifdef CPU_ENABLE_SSE
	if (cpu_fxsr)
		fill_fpregs_xmm(&td->td_pcb->pcb_user_save.sv_xmm,
		    (struct save87 *)fpregs);
	else
#endif /* CPU_ENABLE_SSE */
		bcopy(&td->td_pcb->pcb_user_save.sv_87, fpregs,
		    sizeof(*fpregs));
	return (0);
}

/* Install FPU state from *fpregs (legacy save87 layout) into a thread. */
int
set_fpregs(struct thread *td, struct fpreg *fpregs)
{

#ifdef CPU_ENABLE_SSE
	if (cpu_fxsr)
		set_fpregs_xmm((struct save87 *)fpregs,
		    &td->td_pcb->pcb_user_save.sv_xmm);
	else
#endif /* CPU_ENABLE_SSE */
		bcopy(fpregs, &td->td_pcb->pcb_user_save.sv_87,
		    sizeof(*fpregs));
#ifdef DEV_NPX
	npxuserinited(td);
#endif
	return (0);
}

/*
 * Get machine context.
 */
int
get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
{
	struct trapframe *tp;
	struct segment_descriptor *sdp;

	tp = td->td_frame;

	PROC_LOCK(curthread->td_proc);
	mcp->mc_onstack = sigonstack(tp->tf_esp);
	PROC_UNLOCK(curthread->td_proc);
	mcp->mc_gs = td->td_pcb->pcb_gs;
	mcp->mc_fs = tp->tf_fs;
	mcp->mc_es = tp->tf_es;
	mcp->mc_ds = tp->tf_ds;
	mcp->mc_edi = tp->tf_edi;
	mcp->mc_esi = tp->tf_esi;
	mcp->mc_ebp = tp->tf_ebp;
	mcp->mc_isp = tp->tf_isp;
	mcp->mc_eflags = tp->tf_eflags;
	if (flags & GET_MC_CLEAR_RET) {
		/* Hide the syscall return values and carry flag. */
		mcp->mc_eax = 0;
		mcp->mc_edx = 0;
		mcp->mc_eflags &= ~PSL_C;
	} else {
		mcp->mc_eax = tp->tf_eax;
		mcp->mc_edx = tp->tf_edx;
	}
	mcp->mc_ebx = tp->tf_ebx;
	mcp->mc_ecx = tp->tf_ecx;
	mcp->mc_eip = tp->tf_eip;
	mcp->mc_cs = tp->tf_cs;
	mcp->mc_esp = tp->tf_esp;
	mcp->mc_ss = tp->tf_ss;
	mcp->mc_len = sizeof(*mcp);
	get_fpcontext(td, mcp);
	/* Reassemble the %fs/%gs segment base from the descriptor fields. */
	sdp = &td->td_pcb->pcb_fsd;
	mcp->mc_fsbase = sdp->sd_hibase << 24 | sdp->sd_lobase;
	sdp = &td->td_pcb->pcb_gsd;
	mcp->mc_gsbase = sdp->sd_hibase << 24 | sdp->sd_lobase;
	mcp->mc_flags = 0;
	bzero(mcp->mc_spare2, sizeof(mcp->mc_spare2));
	return (0);
}

/*
 * Set machine context.
 *
 * However, we don't set any but the user modifiable flags, and we won't
 * touch the cs selector.
 */
int
set_mcontext(struct thread *td, const mcontext_t *mcp)
{
	struct trapframe *tp;
	int eflags, ret;

	tp = td->td_frame;
	if (mcp->mc_len != sizeof(*mcp))
		return (EINVAL);
	/* Only user-modifiable eflags bits may be taken from the context. */
	eflags = (mcp->mc_eflags & PSL_USERCHANGE) |
	    (tp->tf_eflags & ~PSL_USERCHANGE);
	if ((ret = set_fpcontext(td, mcp)) == 0) {
		tp->tf_fs = mcp->mc_fs;
		tp->tf_es = mcp->mc_es;
		tp->tf_ds = mcp->mc_ds;
		tp->tf_edi = mcp->mc_edi;
		tp->tf_esi = mcp->mc_esi;
		tp->tf_ebp = mcp->mc_ebp;
		tp->tf_ebx = mcp->mc_ebx;
		tp->tf_edx = mcp->mc_edx;
		tp->tf_ecx = mcp->mc_ecx;
		tp->tf_eax = mcp->mc_eax;
		tp->tf_eip = mcp->mc_eip;
		tp->tf_eflags = eflags;
		tp->tf_esp = mcp->mc_esp;
		tp->tf_ss = mcp->mc_ss;
		td->td_pcb->pcb_gs = mcp->mc_gs;
		ret = 0;
	}
	return (ret);
}

/* Snapshot the thread's FPU state and format into the mcontext. */
static void
get_fpcontext(struct thread *td, mcontext_t *mcp)
{

#ifndef DEV_NPX
	mcp->mc_fpformat = _MC_FPFMT_NODEV;
	mcp->mc_ownedfp = _MC_FPOWNED_NONE;
	bzero(mcp->mc_fpstate, sizeof(mcp->mc_fpstate));
#else
	mcp->mc_ownedfp = npxgetregs(td);
	bcopy(&td->td_pcb->pcb_user_save, &mcp->mc_fpstate[0],
	    sizeof(mcp->mc_fpstate));
	mcp->mc_fpformat = npxformat();
#endif
}

static int
set_fpcontext(struct thread *td, const mcontext_t *mcp)
{

	if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
		return (0);
	else if (mcp->mc_fpformat != _MC_FPFMT_387 &&
	    mcp->mc_fpformat != _MC_FPFMT_XMM)
		return (EINVAL);
	else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE)
		/* We don't
care what state is left in the FPU or PCB. */ 3515 fpstate_drop(td); 3516 else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU || 3517 mcp->mc_ownedfp == _MC_FPOWNED_PCB) { 3518#ifdef DEV_NPX 3519#ifdef CPU_ENABLE_SSE 3520 if (cpu_fxsr) 3521 ((union savefpu *)&mcp->mc_fpstate)->sv_xmm.sv_env. 3522 en_mxcsr &= cpu_mxcsr_mask; 3523#endif 3524 npxsetregs(td, (union savefpu *)&mcp->mc_fpstate); 3525#endif 3526 } else 3527 return (EINVAL); 3528 return (0); 3529} 3530 3531static void 3532fpstate_drop(struct thread *td) 3533{ 3534 3535 KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu")); 3536 critical_enter(); 3537#ifdef DEV_NPX 3538 if (PCPU_GET(fpcurthread) == td) 3539 npxdrop(); 3540#endif 3541 /* 3542 * XXX force a full drop of the npx. The above only drops it if we 3543 * owned it. npxgetregs() has the same bug in the !cpu_fxsr case. 3544 * 3545 * XXX I don't much like npxgetregs()'s semantics of doing a full 3546 * drop. Dropping only to the pcb matches fnsave's behaviour. 3547 * We only need to drop to !PCB_INITDONE in sendsig(). But 3548 * sendsig() is the only caller of npxgetregs()... perhaps we just 3549 * have too many layers. 
3550 */ 3551 curthread->td_pcb->pcb_flags &= ~(PCB_NPXINITDONE | 3552 PCB_NPXUSERINITDONE); 3553 critical_exit(); 3554} 3555 3556int 3557fill_dbregs(struct thread *td, struct dbreg *dbregs) 3558{ 3559 struct pcb *pcb; 3560 3561 if (td == NULL) { 3562 dbregs->dr[0] = rdr0(); 3563 dbregs->dr[1] = rdr1(); 3564 dbregs->dr[2] = rdr2(); 3565 dbregs->dr[3] = rdr3(); 3566 dbregs->dr[4] = rdr4(); 3567 dbregs->dr[5] = rdr5(); 3568 dbregs->dr[6] = rdr6(); 3569 dbregs->dr[7] = rdr7(); 3570 } else { 3571 pcb = td->td_pcb; 3572 dbregs->dr[0] = pcb->pcb_dr0; 3573 dbregs->dr[1] = pcb->pcb_dr1; 3574 dbregs->dr[2] = pcb->pcb_dr2; 3575 dbregs->dr[3] = pcb->pcb_dr3; 3576 dbregs->dr[4] = 0; 3577 dbregs->dr[5] = 0; 3578 dbregs->dr[6] = pcb->pcb_dr6; 3579 dbregs->dr[7] = pcb->pcb_dr7; 3580 } 3581 return (0); 3582} 3583 3584int 3585set_dbregs(struct thread *td, struct dbreg *dbregs) 3586{ 3587 struct pcb *pcb; 3588 int i; 3589 3590 if (td == NULL) { 3591 load_dr0(dbregs->dr[0]); 3592 load_dr1(dbregs->dr[1]); 3593 load_dr2(dbregs->dr[2]); 3594 load_dr3(dbregs->dr[3]); 3595 load_dr4(dbregs->dr[4]); 3596 load_dr5(dbregs->dr[5]); 3597 load_dr6(dbregs->dr[6]); 3598 load_dr7(dbregs->dr[7]); 3599 } else { 3600 /* 3601 * Don't let an illegal value for dr7 get set. Specifically, 3602 * check for undefined settings. Setting these bit patterns 3603 * result in undefined behaviour and can lead to an unexpected 3604 * TRCTRAP. 3605 */ 3606 for (i = 0; i < 4; i++) { 3607 if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02) 3608 return (EINVAL); 3609 if (DBREG_DR7_LEN(dbregs->dr[7], i) == 0x02) 3610 return (EINVAL); 3611 } 3612 3613 pcb = td->td_pcb; 3614 3615 /* 3616 * Don't let a process set a breakpoint that is not within the 3617 * process's address space. If a process could do this, it 3618 * could halt the system by setting a breakpoint in the kernel 3619 * (if ddb was enabled). 
Thus, we need to check to make sure 3620 * that no breakpoints are being enabled for addresses outside 3621 * process's address space. 3622 * 3623 * XXX - what about when the watched area of the user's 3624 * address space is written into from within the kernel 3625 * ... wouldn't that still cause a breakpoint to be generated 3626 * from within kernel mode? 3627 */ 3628 3629 if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) { 3630 /* dr0 is enabled */ 3631 if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS) 3632 return (EINVAL); 3633 } 3634 3635 if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) { 3636 /* dr1 is enabled */ 3637 if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS) 3638 return (EINVAL); 3639 } 3640 3641 if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) { 3642 /* dr2 is enabled */ 3643 if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS) 3644 return (EINVAL); 3645 } 3646 3647 if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) { 3648 /* dr3 is enabled */ 3649 if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS) 3650 return (EINVAL); 3651 } 3652 3653 pcb->pcb_dr0 = dbregs->dr[0]; 3654 pcb->pcb_dr1 = dbregs->dr[1]; 3655 pcb->pcb_dr2 = dbregs->dr[2]; 3656 pcb->pcb_dr3 = dbregs->dr[3]; 3657 pcb->pcb_dr6 = dbregs->dr[6]; 3658 pcb->pcb_dr7 = dbregs->dr[7]; 3659 3660 pcb->pcb_flags |= PCB_DBREGS; 3661 } 3662 3663 return (0); 3664} 3665 3666/* 3667 * Return > 0 if a hardware breakpoint has been hit, and the 3668 * breakpoint was in user space. Return 0, otherwise. 
3669 */ 3670int 3671user_dbreg_trap(void) 3672{ 3673 u_int32_t dr7, dr6; /* debug registers dr6 and dr7 */ 3674 u_int32_t bp; /* breakpoint bits extracted from dr6 */ 3675 int nbp; /* number of breakpoints that triggered */ 3676 caddr_t addr[4]; /* breakpoint addresses */ 3677 int i; 3678 3679 dr7 = rdr7(); 3680 if ((dr7 & 0x000000ff) == 0) { 3681 /* 3682 * all GE and LE bits in the dr7 register are zero, 3683 * thus the trap couldn't have been caused by the 3684 * hardware debug registers 3685 */ 3686 return 0; 3687 } 3688 3689 nbp = 0; 3690 dr6 = rdr6(); 3691 bp = dr6 & 0x0000000f; 3692 3693 if (!bp) { 3694 /* 3695 * None of the breakpoint bits are set meaning this 3696 * trap was not caused by any of the debug registers 3697 */ 3698 return 0; 3699 } 3700 3701 /* 3702 * at least one of the breakpoints were hit, check to see 3703 * which ones and if any of them are user space addresses 3704 */ 3705 3706 if (bp & 0x01) { 3707 addr[nbp++] = (caddr_t)rdr0(); 3708 } 3709 if (bp & 0x02) { 3710 addr[nbp++] = (caddr_t)rdr1(); 3711 } 3712 if (bp & 0x04) { 3713 addr[nbp++] = (caddr_t)rdr2(); 3714 } 3715 if (bp & 0x08) { 3716 addr[nbp++] = (caddr_t)rdr3(); 3717 } 3718 3719 for (i = 0; i < nbp; i++) { 3720 if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) { 3721 /* 3722 * addr[i] is in user space 3723 */ 3724 return nbp; 3725 } 3726 } 3727 3728 /* 3729 * None of the breakpoints are in user space. 3730 */ 3731 return 0; 3732} 3733 3734#ifdef KDB 3735 3736/* 3737 * Provide inb() and outb() as functions. They are normally only available as 3738 * inline functions, thus cannot be called from the debugger. 3739 */ 3740 3741/* silence compiler warnings */ 3742u_char inb_(u_short); 3743void outb_(u_short, u_char); 3744 3745u_char 3746inb_(u_short port) 3747{ 3748 return inb(port); 3749} 3750 3751void 3752outb_(u_short port, u_char data) 3753{ 3754 outb(port, data); 3755} 3756 3757#endif /* KDB */ 3758