/* machdep.c -- FreeBSD stable/10 sys/i386/i386/machdep.c, revision 276070 */
1/*- 2 * Copyright (c) 1992 Terrence R. Lambert. 3 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. 4 * All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * William Jolitz. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 3. All advertising materials mentioning features or use of this software 18 * must display the following acknowledgement: 19 * This product includes software developed by the University of 20 * California, Berkeley and its contributors. 21 * 4. Neither the name of the University nor the names of its contributors 22 * may be used to endorse or promote products derived from this software 23 * without specific prior written permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 * SUCH DAMAGE. 36 * 37 * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 38 */ 39 40#include <sys/cdefs.h> 41__FBSDID("$FreeBSD: stable/10/sys/i386/i386/machdep.c 276070 2014-12-22 18:40:59Z jhb $"); 42 43#include "opt_apic.h" 44#include "opt_atalk.h" 45#include "opt_atpic.h" 46#include "opt_compat.h" 47#include "opt_cpu.h" 48#include "opt_ddb.h" 49#include "opt_inet.h" 50#include "opt_ipx.h" 51#include "opt_isa.h" 52#include "opt_kstack_pages.h" 53#include "opt_maxmem.h" 54#include "opt_mp_watchdog.h" 55#include "opt_npx.h" 56#include "opt_perfmon.h" 57#include "opt_platform.h" 58#include "opt_xbox.h" 59#include "opt_kdtrace.h" 60 61#include <sys/param.h> 62#include <sys/proc.h> 63#include <sys/systm.h> 64#include <sys/bio.h> 65#include <sys/buf.h> 66#include <sys/bus.h> 67#include <sys/callout.h> 68#include <sys/cons.h> 69#include <sys/cpu.h> 70#include <sys/eventhandler.h> 71#include <sys/exec.h> 72#include <sys/imgact.h> 73#include <sys/kdb.h> 74#include <sys/kernel.h> 75#include <sys/ktr.h> 76#include <sys/linker.h> 77#include <sys/lock.h> 78#include <sys/malloc.h> 79#include <sys/memrange.h> 80#include <sys/msgbuf.h> 81#include <sys/mutex.h> 82#include <sys/pcpu.h> 83#include <sys/ptrace.h> 84#include <sys/reboot.h> 85#include <sys/rwlock.h> 86#include <sys/sched.h> 87#include <sys/signalvar.h> 88#ifdef SMP 89#include <sys/smp.h> 90#endif 91#include <sys/syscallsubr.h> 92#include <sys/sysctl.h> 93#include <sys/sysent.h> 94#include <sys/sysproto.h> 
95#include <sys/ucontext.h> 96#include <sys/vmmeter.h> 97 98#include <vm/vm.h> 99#include <vm/vm_extern.h> 100#include <vm/vm_kern.h> 101#include <vm/vm_page.h> 102#include <vm/vm_map.h> 103#include <vm/vm_object.h> 104#include <vm/vm_pager.h> 105#include <vm/vm_param.h> 106 107#ifdef DDB 108#ifndef KDB 109#error KDB must be enabled in order for DDB to work! 110#endif 111#include <ddb/ddb.h> 112#include <ddb/db_sym.h> 113#endif 114 115#include <isa/rtc.h> 116 117#include <net/netisr.h> 118 119#include <machine/bootinfo.h> 120#include <machine/clock.h> 121#include <machine/cpu.h> 122#include <machine/cputypes.h> 123#include <machine/intr_machdep.h> 124#include <x86/mca.h> 125#include <machine/md_var.h> 126#include <machine/metadata.h> 127#include <machine/mp_watchdog.h> 128#include <machine/pc/bios.h> 129#include <machine/pcb.h> 130#include <machine/pcb_ext.h> 131#include <machine/proc.h> 132#include <machine/reg.h> 133#include <machine/sigframe.h> 134#include <machine/specialreg.h> 135#include <machine/vm86.h> 136#ifdef PERFMON 137#include <machine/perfmon.h> 138#endif 139#ifdef SMP 140#include <machine/smp.h> 141#endif 142#ifdef FDT 143#include <x86/fdt.h> 144#endif 145 146#ifdef DEV_APIC 147#include <machine/apicvar.h> 148#endif 149 150#ifdef DEV_ISA 151#include <x86/isa/icu.h> 152#endif 153 154#ifdef XBOX 155#include <machine/xbox.h> 156 157int arch_i386_is_xbox = 0; 158uint32_t arch_i386_xbox_memsize = 0; 159#endif 160 161#ifdef XEN 162/* XEN includes */ 163#include <xen/xen-os.h> 164#include <xen/hypervisor.h> 165#include <machine/xen/xenvar.h> 166#include <machine/xen/xenfunc.h> 167#include <xen/xen_intr.h> 168 169void Xhypervisor_callback(void); 170void failsafe_callback(void); 171 172extern trap_info_t trap_table[]; 173struct proc_ldt default_proc_ldt; 174extern int init_first; 175int running_xen = 1; 176extern unsigned long physfree; 177#endif /* XEN */ 178 179/* Sanity check for __curthread() */ 180CTASSERT(offsetof(struct pcpu, pc_curthread) == 0); 181 
182extern void init386(int first); 183extern void dblfault_handler(void); 184 185#define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) 186#define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) 187 188#if !defined(CPU_DISABLE_SSE) && defined(I686_CPU) 189#define CPU_ENABLE_SSE 190#endif 191 192static void cpu_startup(void *); 193static void fpstate_drop(struct thread *td); 194static void get_fpcontext(struct thread *td, mcontext_t *mcp); 195static int set_fpcontext(struct thread *td, const mcontext_t *mcp); 196#ifdef CPU_ENABLE_SSE 197static void set_fpregs_xmm(struct save87 *, struct savexmm *); 198static void fill_fpregs_xmm(struct savexmm *, struct save87 *); 199#endif /* CPU_ENABLE_SSE */ 200SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL); 201 202#ifdef DDB 203extern vm_offset_t ksym_start, ksym_end; 204#endif 205 206/* Intel ICH registers */ 207#define ICH_PMBASE 0x400 208#define ICH_SMI_EN ICH_PMBASE + 0x30 209 210int _udatasel, _ucodesel; 211u_int basemem; 212 213int cold = 1; 214 215#ifdef COMPAT_43 216static void osendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask); 217#endif 218#ifdef COMPAT_FREEBSD4 219static void freebsd4_sendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask); 220#endif 221 222long Maxmem = 0; 223long realmem = 0; 224 225#ifdef PAE 226FEATURE(pae, "Physical Address Extensions"); 227#endif 228 229/* 230 * The number of PHYSMAP entries must be one less than the number of 231 * PHYSSEG entries because the PHYSMAP entry that spans the largest 232 * physical address that is accessible by ISA DMA is split into two 233 * PHYSSEG entries. 
234 */ 235#define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1)) 236 237vm_paddr_t phys_avail[PHYSMAP_SIZE + 2]; 238vm_paddr_t dump_avail[PHYSMAP_SIZE + 2]; 239 240/* must be 2 less so 0 0 can signal end of chunks */ 241#define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(phys_avail[0])) - 2) 242#define DUMP_AVAIL_ARRAY_END ((sizeof(dump_avail) / sizeof(dump_avail[0])) - 2) 243 244struct kva_md_info kmi; 245 246static struct trapframe proc0_tf; 247struct pcpu __pcpu[MAXCPU]; 248 249struct mtx icu_lock; 250 251struct mem_range_softc mem_range_softc; 252 253static void 254cpu_startup(dummy) 255 void *dummy; 256{ 257 uintmax_t memsize; 258 char *sysenv; 259 260 /* 261 * On MacBooks, we need to disallow the legacy USB circuit to 262 * generate an SMI# because this can cause several problems, 263 * namely: incorrect CPU frequency detection and failure to 264 * start the APs. 265 * We do this by disabling a bit in the SMI_EN (SMI Control and 266 * Enable register) of the Intel ICH LPC Interface Bridge. 267 */ 268 sysenv = getenv("smbios.system.product"); 269 if (sysenv != NULL) { 270 if (strncmp(sysenv, "MacBook1,1", 10) == 0 || 271 strncmp(sysenv, "MacBook3,1", 10) == 0 || 272 strncmp(sysenv, "MacBook4,1", 10) == 0 || 273 strncmp(sysenv, "MacBookPro1,1", 13) == 0 || 274 strncmp(sysenv, "MacBookPro1,2", 13) == 0 || 275 strncmp(sysenv, "MacBookPro3,1", 13) == 0 || 276 strncmp(sysenv, "MacBookPro4,1", 13) == 0 || 277 strncmp(sysenv, "Macmini1,1", 10) == 0) { 278 if (bootverbose) 279 printf("Disabling LEGACY_USB_EN bit on " 280 "Intel ICH.\n"); 281 outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8); 282 } 283 freeenv(sysenv); 284 } 285 286 /* 287 * Good {morning,afternoon,evening,night}. 288 */ 289 startrtclock(); 290 printcpuinfo(); 291 panicifcpuunsupported(); 292#ifdef PERFMON 293 perfmon_init(); 294#endif 295 296 /* 297 * Display physical memory if SMBIOS reports reasonable amount. 
298 */ 299 memsize = 0; 300 sysenv = getenv("smbios.memory.enabled"); 301 if (sysenv != NULL) { 302 memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10; 303 freeenv(sysenv); 304 } 305 if (memsize < ptoa((uintmax_t)cnt.v_free_count)) 306 memsize = ptoa((uintmax_t)Maxmem); 307 printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20); 308 realmem = atop(memsize); 309 310 /* 311 * Display any holes after the first chunk of extended memory. 312 */ 313 if (bootverbose) { 314 int indx; 315 316 printf("Physical memory chunk(s):\n"); 317 for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { 318 vm_paddr_t size; 319 320 size = phys_avail[indx + 1] - phys_avail[indx]; 321 printf( 322 "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n", 323 (uintmax_t)phys_avail[indx], 324 (uintmax_t)phys_avail[indx + 1] - 1, 325 (uintmax_t)size, (uintmax_t)size / PAGE_SIZE); 326 } 327 } 328 329 vm_ksubmap_init(&kmi); 330 331 printf("avail memory = %ju (%ju MB)\n", 332 ptoa((uintmax_t)cnt.v_free_count), 333 ptoa((uintmax_t)cnt.v_free_count) / 1048576); 334 335 /* 336 * Set up buffers, so they can be used to read disk labels. 337 */ 338 bufinit(); 339 vm_pager_bufferinit(); 340#ifndef XEN 341 cpu_setregs(); 342#endif 343} 344 345/* 346 * Send an interrupt to process. 347 * 348 * Stack is set up to allow sigcode stored 349 * at top to call routine, followed by kcall 350 * to sigreturn routine below. After sigreturn 351 * resets the signal mask, the stack, and the 352 * frame pointer, it returns to the user 353 * specified pc, psl. 
354 */ 355#ifdef COMPAT_43 356static void 357osendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) 358{ 359 struct osigframe sf, *fp; 360 struct proc *p; 361 struct thread *td; 362 struct sigacts *psp; 363 struct trapframe *regs; 364 int sig; 365 int oonstack; 366 367 td = curthread; 368 p = td->td_proc; 369 PROC_LOCK_ASSERT(p, MA_OWNED); 370 sig = ksi->ksi_signo; 371 psp = p->p_sigacts; 372 mtx_assert(&psp->ps_mtx, MA_OWNED); 373 regs = td->td_frame; 374 oonstack = sigonstack(regs->tf_esp); 375 376 /* Allocate space for the signal handler context. */ 377 if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && 378 SIGISMEMBER(psp->ps_sigonstack, sig)) { 379 fp = (struct osigframe *)(td->td_sigstk.ss_sp + 380 td->td_sigstk.ss_size - sizeof(struct osigframe)); 381#if defined(COMPAT_43) 382 td->td_sigstk.ss_flags |= SS_ONSTACK; 383#endif 384 } else 385 fp = (struct osigframe *)regs->tf_esp - 1; 386 387 /* Translate the signal if appropriate. */ 388 if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) 389 sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; 390 391 /* Build the argument list for the signal handler. */ 392 sf.sf_signum = sig; 393 sf.sf_scp = (register_t)&fp->sf_siginfo.si_sc; 394 bzero(&sf.sf_siginfo, sizeof(sf.sf_siginfo)); 395 if (SIGISMEMBER(psp->ps_siginfo, sig)) { 396 /* Signal handler installed with SA_SIGINFO. */ 397 sf.sf_arg2 = (register_t)&fp->sf_siginfo; 398 sf.sf_siginfo.si_signo = sig; 399 sf.sf_siginfo.si_code = ksi->ksi_code; 400 sf.sf_ahu.sf_action = (__osiginfohandler_t *)catcher; 401 sf.sf_addr = 0; 402 } else { 403 /* Old FreeBSD-style arguments. */ 404 sf.sf_arg2 = ksi->ksi_code; 405 sf.sf_addr = (register_t)ksi->ksi_addr; 406 sf.sf_ahu.sf_handler = catcher; 407 } 408 mtx_unlock(&psp->ps_mtx); 409 PROC_UNLOCK(p); 410 411 /* Save most if not all of trap frame. 
*/ 412 sf.sf_siginfo.si_sc.sc_eax = regs->tf_eax; 413 sf.sf_siginfo.si_sc.sc_ebx = regs->tf_ebx; 414 sf.sf_siginfo.si_sc.sc_ecx = regs->tf_ecx; 415 sf.sf_siginfo.si_sc.sc_edx = regs->tf_edx; 416 sf.sf_siginfo.si_sc.sc_esi = regs->tf_esi; 417 sf.sf_siginfo.si_sc.sc_edi = regs->tf_edi; 418 sf.sf_siginfo.si_sc.sc_cs = regs->tf_cs; 419 sf.sf_siginfo.si_sc.sc_ds = regs->tf_ds; 420 sf.sf_siginfo.si_sc.sc_ss = regs->tf_ss; 421 sf.sf_siginfo.si_sc.sc_es = regs->tf_es; 422 sf.sf_siginfo.si_sc.sc_fs = regs->tf_fs; 423 sf.sf_siginfo.si_sc.sc_gs = rgs(); 424 sf.sf_siginfo.si_sc.sc_isp = regs->tf_isp; 425 426 /* Build the signal context to be used by osigreturn(). */ 427 sf.sf_siginfo.si_sc.sc_onstack = (oonstack) ? 1 : 0; 428 SIG2OSIG(*mask, sf.sf_siginfo.si_sc.sc_mask); 429 sf.sf_siginfo.si_sc.sc_sp = regs->tf_esp; 430 sf.sf_siginfo.si_sc.sc_fp = regs->tf_ebp; 431 sf.sf_siginfo.si_sc.sc_pc = regs->tf_eip; 432 sf.sf_siginfo.si_sc.sc_ps = regs->tf_eflags; 433 sf.sf_siginfo.si_sc.sc_trapno = regs->tf_trapno; 434 sf.sf_siginfo.si_sc.sc_err = regs->tf_err; 435 436 /* 437 * If we're a vm86 process, we want to save the segment registers. 438 * We also change eflags to be our emulated eflags, not the actual 439 * eflags. 440 */ 441 if (regs->tf_eflags & PSL_VM) { 442 /* XXX confusing names: `tf' isn't a trapframe; `regs' is. */ 443 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 444 struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; 445 446 sf.sf_siginfo.si_sc.sc_gs = tf->tf_vm86_gs; 447 sf.sf_siginfo.si_sc.sc_fs = tf->tf_vm86_fs; 448 sf.sf_siginfo.si_sc.sc_es = tf->tf_vm86_es; 449 sf.sf_siginfo.si_sc.sc_ds = tf->tf_vm86_ds; 450 451 if (vm86->vm86_has_vme == 0) 452 sf.sf_siginfo.si_sc.sc_ps = 453 (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | 454 (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); 455 456 /* See sendsig() for comments. */ 457 tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); 458 } 459 460 /* 461 * Copy the sigframe out to the user's stack. 
462 */ 463 if (copyout(&sf, fp, sizeof(*fp)) != 0) { 464#ifdef DEBUG 465 printf("process %ld has trashed its stack\n", (long)p->p_pid); 466#endif 467 PROC_LOCK(p); 468 sigexit(td, SIGILL); 469 } 470 471 regs->tf_esp = (int)fp; 472 if (p->p_sysent->sv_sigcode_base != 0) { 473 regs->tf_eip = p->p_sysent->sv_sigcode_base + szsigcode - 474 szosigcode; 475 } else { 476 /* a.out sysentvec does not use shared page */ 477 regs->tf_eip = p->p_sysent->sv_psstrings - szosigcode; 478 } 479 regs->tf_eflags &= ~(PSL_T | PSL_D); 480 regs->tf_cs = _ucodesel; 481 regs->tf_ds = _udatasel; 482 regs->tf_es = _udatasel; 483 regs->tf_fs = _udatasel; 484 load_gs(_udatasel); 485 regs->tf_ss = _udatasel; 486 PROC_LOCK(p); 487 mtx_lock(&psp->ps_mtx); 488} 489#endif /* COMPAT_43 */ 490 491#ifdef COMPAT_FREEBSD4 492static void 493freebsd4_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) 494{ 495 struct sigframe4 sf, *sfp; 496 struct proc *p; 497 struct thread *td; 498 struct sigacts *psp; 499 struct trapframe *regs; 500 int sig; 501 int oonstack; 502 503 td = curthread; 504 p = td->td_proc; 505 PROC_LOCK_ASSERT(p, MA_OWNED); 506 sig = ksi->ksi_signo; 507 psp = p->p_sigacts; 508 mtx_assert(&psp->ps_mtx, MA_OWNED); 509 regs = td->td_frame; 510 oonstack = sigonstack(regs->tf_esp); 511 512 /* Save user context. */ 513 bzero(&sf, sizeof(sf)); 514 sf.sf_uc.uc_sigmask = *mask; 515 sf.sf_uc.uc_stack = td->td_sigstk; 516 sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) 517 ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; 518 sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; 519 sf.sf_uc.uc_mcontext.mc_gs = rgs(); 520 bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs)); 521 bzero(sf.sf_uc.uc_mcontext.mc_fpregs, 522 sizeof(sf.sf_uc.uc_mcontext.mc_fpregs)); 523 bzero(sf.sf_uc.uc_mcontext.__spare__, 524 sizeof(sf.sf_uc.uc_mcontext.__spare__)); 525 bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__)); 526 527 /* Allocate space for the signal handler context. 
*/ 528 if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && 529 SIGISMEMBER(psp->ps_sigonstack, sig)) { 530 sfp = (struct sigframe4 *)(td->td_sigstk.ss_sp + 531 td->td_sigstk.ss_size - sizeof(struct sigframe4)); 532#if defined(COMPAT_43) 533 td->td_sigstk.ss_flags |= SS_ONSTACK; 534#endif 535 } else 536 sfp = (struct sigframe4 *)regs->tf_esp - 1; 537 538 /* Translate the signal if appropriate. */ 539 if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) 540 sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; 541 542 /* Build the argument list for the signal handler. */ 543 sf.sf_signum = sig; 544 sf.sf_ucontext = (register_t)&sfp->sf_uc; 545 bzero(&sf.sf_si, sizeof(sf.sf_si)); 546 if (SIGISMEMBER(psp->ps_siginfo, sig)) { 547 /* Signal handler installed with SA_SIGINFO. */ 548 sf.sf_siginfo = (register_t)&sfp->sf_si; 549 sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; 550 551 /* Fill in POSIX parts */ 552 sf.sf_si.si_signo = sig; 553 sf.sf_si.si_code = ksi->ksi_code; 554 sf.sf_si.si_addr = ksi->ksi_addr; 555 } else { 556 /* Old FreeBSD-style arguments. */ 557 sf.sf_siginfo = ksi->ksi_code; 558 sf.sf_addr = (register_t)ksi->ksi_addr; 559 sf.sf_ahu.sf_handler = catcher; 560 } 561 mtx_unlock(&psp->ps_mtx); 562 PROC_UNLOCK(p); 563 564 /* 565 * If we're a vm86 process, we want to save the segment registers. 566 * We also change eflags to be our emulated eflags, not the actual 567 * eflags. 
568 */ 569 if (regs->tf_eflags & PSL_VM) { 570 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 571 struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; 572 573 sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs; 574 sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs; 575 sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es; 576 sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds; 577 578 if (vm86->vm86_has_vme == 0) 579 sf.sf_uc.uc_mcontext.mc_eflags = 580 (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | 581 (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); 582 583 /* 584 * Clear PSL_NT to inhibit T_TSSFLT faults on return from 585 * syscalls made by the signal handler. This just avoids 586 * wasting time for our lazy fixup of such faults. PSL_NT 587 * does nothing in vm86 mode, but vm86 programs can set it 588 * almost legitimately in probes for old cpu types. 589 */ 590 tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); 591 } 592 593 /* 594 * Copy the sigframe out to the user's stack. 595 */ 596 if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { 597#ifdef DEBUG 598 printf("process %ld has trashed its stack\n", (long)p->p_pid); 599#endif 600 PROC_LOCK(p); 601 sigexit(td, SIGILL); 602 } 603 604 regs->tf_esp = (int)sfp; 605 regs->tf_eip = p->p_sysent->sv_sigcode_base + szsigcode - 606 szfreebsd4_sigcode; 607 regs->tf_eflags &= ~(PSL_T | PSL_D); 608 regs->tf_cs = _ucodesel; 609 regs->tf_ds = _udatasel; 610 regs->tf_es = _udatasel; 611 regs->tf_fs = _udatasel; 612 regs->tf_ss = _udatasel; 613 PROC_LOCK(p); 614 mtx_lock(&psp->ps_mtx); 615} 616#endif /* COMPAT_FREEBSD4 */ 617 618void 619sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) 620{ 621 struct sigframe sf, *sfp; 622 struct proc *p; 623 struct thread *td; 624 struct sigacts *psp; 625 char *sp; 626 struct trapframe *regs; 627 struct segment_descriptor *sdp; 628 int sig; 629 int oonstack; 630 631 td = curthread; 632 p = td->td_proc; 633 PROC_LOCK_ASSERT(p, MA_OWNED); 634 sig = ksi->ksi_signo; 635 psp = p->p_sigacts; 636 
mtx_assert(&psp->ps_mtx, MA_OWNED); 637#ifdef COMPAT_FREEBSD4 638 if (SIGISMEMBER(psp->ps_freebsd4, sig)) { 639 freebsd4_sendsig(catcher, ksi, mask); 640 return; 641 } 642#endif 643#ifdef COMPAT_43 644 if (SIGISMEMBER(psp->ps_osigset, sig)) { 645 osendsig(catcher, ksi, mask); 646 return; 647 } 648#endif 649 regs = td->td_frame; 650 oonstack = sigonstack(regs->tf_esp); 651 652 /* Save user context. */ 653 bzero(&sf, sizeof(sf)); 654 sf.sf_uc.uc_sigmask = *mask; 655 sf.sf_uc.uc_stack = td->td_sigstk; 656 sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) 657 ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; 658 sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; 659 sf.sf_uc.uc_mcontext.mc_gs = rgs(); 660 bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs)); 661 sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */ 662 get_fpcontext(td, &sf.sf_uc.uc_mcontext); 663 fpstate_drop(td); 664 /* 665 * Unconditionally fill the fsbase and gsbase into the mcontext. 666 */ 667 sdp = &td->td_pcb->pcb_fsd; 668 sf.sf_uc.uc_mcontext.mc_fsbase = sdp->sd_hibase << 24 | 669 sdp->sd_lobase; 670 sdp = &td->td_pcb->pcb_gsd; 671 sf.sf_uc.uc_mcontext.mc_gsbase = sdp->sd_hibase << 24 | 672 sdp->sd_lobase; 673 sf.sf_uc.uc_mcontext.mc_flags = 0; 674 bzero(sf.sf_uc.uc_mcontext.mc_spare2, 675 sizeof(sf.sf_uc.uc_mcontext.mc_spare2)); 676 bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__)); 677 678 /* Allocate space for the signal handler context. */ 679 if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && 680 SIGISMEMBER(psp->ps_sigonstack, sig)) { 681 sp = td->td_sigstk.ss_sp + 682 td->td_sigstk.ss_size - sizeof(struct sigframe); 683#if defined(COMPAT_43) 684 td->td_sigstk.ss_flags |= SS_ONSTACK; 685#endif 686 } else 687 sp = (char *)regs->tf_esp - sizeof(struct sigframe); 688 /* Align to 16 bytes. */ 689 sfp = (struct sigframe *)((unsigned int)sp & ~0xF); 690 691 /* Translate the signal if appropriate. 
*/ 692 if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) 693 sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; 694 695 /* Build the argument list for the signal handler. */ 696 sf.sf_signum = sig; 697 sf.sf_ucontext = (register_t)&sfp->sf_uc; 698 bzero(&sf.sf_si, sizeof(sf.sf_si)); 699 if (SIGISMEMBER(psp->ps_siginfo, sig)) { 700 /* Signal handler installed with SA_SIGINFO. */ 701 sf.sf_siginfo = (register_t)&sfp->sf_si; 702 sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; 703 704 /* Fill in POSIX parts */ 705 sf.sf_si = ksi->ksi_info; 706 sf.sf_si.si_signo = sig; /* maybe a translated signal */ 707 } else { 708 /* Old FreeBSD-style arguments. */ 709 sf.sf_siginfo = ksi->ksi_code; 710 sf.sf_addr = (register_t)ksi->ksi_addr; 711 sf.sf_ahu.sf_handler = catcher; 712 } 713 mtx_unlock(&psp->ps_mtx); 714 PROC_UNLOCK(p); 715 716 /* 717 * If we're a vm86 process, we want to save the segment registers. 718 * We also change eflags to be our emulated eflags, not the actual 719 * eflags. 720 */ 721 if (regs->tf_eflags & PSL_VM) { 722 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 723 struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; 724 725 sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs; 726 sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs; 727 sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es; 728 sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds; 729 730 if (vm86->vm86_has_vme == 0) 731 sf.sf_uc.uc_mcontext.mc_eflags = 732 (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | 733 (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); 734 735 /* 736 * Clear PSL_NT to inhibit T_TSSFLT faults on return from 737 * syscalls made by the signal handler. This just avoids 738 * wasting time for our lazy fixup of such faults. PSL_NT 739 * does nothing in vm86 mode, but vm86 programs can set it 740 * almost legitimately in probes for old cpu types. 741 */ 742 tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); 743 } 744 745 /* 746 * Copy the sigframe out to the user's stack. 
747 */ 748 if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { 749#ifdef DEBUG 750 printf("process %ld has trashed its stack\n", (long)p->p_pid); 751#endif 752 PROC_LOCK(p); 753 sigexit(td, SIGILL); 754 } 755 756 regs->tf_esp = (int)sfp; 757 regs->tf_eip = p->p_sysent->sv_sigcode_base; 758 if (regs->tf_eip == 0) 759 regs->tf_eip = p->p_sysent->sv_psstrings - szsigcode; 760 regs->tf_eflags &= ~(PSL_T | PSL_D); 761 regs->tf_cs = _ucodesel; 762 regs->tf_ds = _udatasel; 763 regs->tf_es = _udatasel; 764 regs->tf_fs = _udatasel; 765 regs->tf_ss = _udatasel; 766 PROC_LOCK(p); 767 mtx_lock(&psp->ps_mtx); 768} 769 770/* 771 * System call to cleanup state after a signal 772 * has been taken. Reset signal mask and 773 * stack state from context left by sendsig (above). 774 * Return to previous pc and psl as specified by 775 * context left by sendsig. Check carefully to 776 * make sure that the user has not modified the 777 * state to gain improper privileges. 778 * 779 * MPSAFE 780 */ 781#ifdef COMPAT_43 782int 783osigreturn(td, uap) 784 struct thread *td; 785 struct osigreturn_args /* { 786 struct osigcontext *sigcntxp; 787 } */ *uap; 788{ 789 struct osigcontext sc; 790 struct trapframe *regs; 791 struct osigcontext *scp; 792 int eflags, error; 793 ksiginfo_t ksi; 794 795 regs = td->td_frame; 796 error = copyin(uap->sigcntxp, &sc, sizeof(sc)); 797 if (error != 0) 798 return (error); 799 scp = ≻ 800 eflags = scp->sc_ps; 801 if (eflags & PSL_VM) { 802 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 803 struct vm86_kernel *vm86; 804 805 /* 806 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't 807 * set up the vm86 area, and we can't enter vm86 mode. 808 */ 809 if (td->td_pcb->pcb_ext == 0) 810 return (EINVAL); 811 vm86 = &td->td_pcb->pcb_ext->ext_vm86; 812 if (vm86->vm86_inited == 0) 813 return (EINVAL); 814 815 /* Go back to user mode if both flags are set. 
*/ 816 if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { 817 ksiginfo_init_trap(&ksi); 818 ksi.ksi_signo = SIGBUS; 819 ksi.ksi_code = BUS_OBJERR; 820 ksi.ksi_addr = (void *)regs->tf_eip; 821 trapsignal(td, &ksi); 822 } 823 824 if (vm86->vm86_has_vme) { 825 eflags = (tf->tf_eflags & ~VME_USERCHANGE) | 826 (eflags & VME_USERCHANGE) | PSL_VM; 827 } else { 828 vm86->vm86_eflags = eflags; /* save VIF, VIP */ 829 eflags = (tf->tf_eflags & ~VM_USERCHANGE) | 830 (eflags & VM_USERCHANGE) | PSL_VM; 831 } 832 tf->tf_vm86_ds = scp->sc_ds; 833 tf->tf_vm86_es = scp->sc_es; 834 tf->tf_vm86_fs = scp->sc_fs; 835 tf->tf_vm86_gs = scp->sc_gs; 836 tf->tf_ds = _udatasel; 837 tf->tf_es = _udatasel; 838 tf->tf_fs = _udatasel; 839 } else { 840 /* 841 * Don't allow users to change privileged or reserved flags. 842 */ 843 if (!EFL_SECURE(eflags, regs->tf_eflags)) { 844 return (EINVAL); 845 } 846 847 /* 848 * Don't allow users to load a valid privileged %cs. Let the 849 * hardware check for invalid selectors, excess privilege in 850 * other selectors, invalid %eip's and invalid %esp's. 851 */ 852 if (!CS_SECURE(scp->sc_cs)) { 853 ksiginfo_init_trap(&ksi); 854 ksi.ksi_signo = SIGBUS; 855 ksi.ksi_code = BUS_OBJERR; 856 ksi.ksi_trapno = T_PROTFLT; 857 ksi.ksi_addr = (void *)regs->tf_eip; 858 trapsignal(td, &ksi); 859 return (EINVAL); 860 } 861 regs->tf_ds = scp->sc_ds; 862 regs->tf_es = scp->sc_es; 863 regs->tf_fs = scp->sc_fs; 864 } 865 866 /* Restore remaining registers. 
*/ 867 regs->tf_eax = scp->sc_eax; 868 regs->tf_ebx = scp->sc_ebx; 869 regs->tf_ecx = scp->sc_ecx; 870 regs->tf_edx = scp->sc_edx; 871 regs->tf_esi = scp->sc_esi; 872 regs->tf_edi = scp->sc_edi; 873 regs->tf_cs = scp->sc_cs; 874 regs->tf_ss = scp->sc_ss; 875 regs->tf_isp = scp->sc_isp; 876 regs->tf_ebp = scp->sc_fp; 877 regs->tf_esp = scp->sc_sp; 878 regs->tf_eip = scp->sc_pc; 879 regs->tf_eflags = eflags; 880 881#if defined(COMPAT_43) 882 if (scp->sc_onstack & 1) 883 td->td_sigstk.ss_flags |= SS_ONSTACK; 884 else 885 td->td_sigstk.ss_flags &= ~SS_ONSTACK; 886#endif 887 kern_sigprocmask(td, SIG_SETMASK, (sigset_t *)&scp->sc_mask, NULL, 888 SIGPROCMASK_OLD); 889 return (EJUSTRETURN); 890} 891#endif /* COMPAT_43 */ 892 893#ifdef COMPAT_FREEBSD4 894/* 895 * MPSAFE 896 */ 897int 898freebsd4_sigreturn(td, uap) 899 struct thread *td; 900 struct freebsd4_sigreturn_args /* { 901 const ucontext4 *sigcntxp; 902 } */ *uap; 903{ 904 struct ucontext4 uc; 905 struct trapframe *regs; 906 struct ucontext4 *ucp; 907 int cs, eflags, error; 908 ksiginfo_t ksi; 909 910 error = copyin(uap->sigcntxp, &uc, sizeof(uc)); 911 if (error != 0) 912 return (error); 913 ucp = &uc; 914 regs = td->td_frame; 915 eflags = ucp->uc_mcontext.mc_eflags; 916 if (eflags & PSL_VM) { 917 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 918 struct vm86_kernel *vm86; 919 920 /* 921 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't 922 * set up the vm86 area, and we can't enter vm86 mode. 923 */ 924 if (td->td_pcb->pcb_ext == 0) 925 return (EINVAL); 926 vm86 = &td->td_pcb->pcb_ext->ext_vm86; 927 if (vm86->vm86_inited == 0) 928 return (EINVAL); 929 930 /* Go back to user mode if both flags are set. 
*/ 931 if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { 932 ksiginfo_init_trap(&ksi); 933 ksi.ksi_signo = SIGBUS; 934 ksi.ksi_code = BUS_OBJERR; 935 ksi.ksi_addr = (void *)regs->tf_eip; 936 trapsignal(td, &ksi); 937 } 938 if (vm86->vm86_has_vme) { 939 eflags = (tf->tf_eflags & ~VME_USERCHANGE) | 940 (eflags & VME_USERCHANGE) | PSL_VM; 941 } else { 942 vm86->vm86_eflags = eflags; /* save VIF, VIP */ 943 eflags = (tf->tf_eflags & ~VM_USERCHANGE) | 944 (eflags & VM_USERCHANGE) | PSL_VM; 945 } 946 bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe)); 947 tf->tf_eflags = eflags; 948 tf->tf_vm86_ds = tf->tf_ds; 949 tf->tf_vm86_es = tf->tf_es; 950 tf->tf_vm86_fs = tf->tf_fs; 951 tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs; 952 tf->tf_ds = _udatasel; 953 tf->tf_es = _udatasel; 954 tf->tf_fs = _udatasel; 955 } else { 956 /* 957 * Don't allow users to change privileged or reserved flags. 958 */ 959 if (!EFL_SECURE(eflags, regs->tf_eflags)) { 960 uprintf("pid %d (%s): freebsd4_sigreturn eflags = 0x%x\n", 961 td->td_proc->p_pid, td->td_name, eflags); 962 return (EINVAL); 963 } 964 965 /* 966 * Don't allow users to load a valid privileged %cs. Let the 967 * hardware check for invalid selectors, excess privilege in 968 * other selectors, invalid %eip's and invalid %esp's. 
 */
	cs = ucp->uc_mcontext.mc_cs;
	if (!CS_SECURE(cs)) {
		uprintf("pid %d (%s): freebsd4_sigreturn cs = 0x%x\n",
		    td->td_proc->p_pid, td->td_name, cs);
		ksiginfo_init_trap(&ksi);
		ksi.ksi_signo = SIGBUS;
		ksi.ksi_code = BUS_OBJERR;
		ksi.ksi_trapno = T_PROTFLT;
		ksi.ksi_addr = (void *)regs->tf_eip;
		trapsignal(td, &ksi);
		return (EINVAL);
	}

		bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs));
	}

#if defined(COMPAT_43)
	if (ucp->uc_mcontext.mc_onstack & 1)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
	else
		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
#endif
	kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
	return (EJUSTRETURN);
}
#endif	/* COMPAT_FREEBSD4 */

/*
 * sigreturn(2): restore the machine context that sendsig() pushed on the
 * user stack.  The context is copied in and validated before any of it is
 * installed in the trapframe: user-controlled %eflags and %cs must not be
 * able to grant privilege.  Returns EJUSTRETURN on success so the syscall
 * code does not clobber the restored registers.
 *
 * MPSAFE
 */
int
sys_sigreturn(td, uap)
	struct thread *td;
	struct sigreturn_args /* {
		const struct __ucontext *sigcntxp;
	} */ *uap;
{
	ucontext_t uc;
	struct trapframe *regs;
	ucontext_t *ucp;
	int cs, eflags, error, ret;
	ksiginfo_t ksi;

	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
	if (error != 0)
		return (error);
	ucp = &uc;
	regs = td->td_frame;
	eflags = ucp->uc_mcontext.mc_eflags;
	if (eflags & PSL_VM) {
		/* Returning into vm86 mode: the frame layout differs. */
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86;

		/*
		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
		 * set up the vm86 area, and we can't enter vm86 mode.
		 */
		if (td->td_pcb->pcb_ext == 0)
			return (EINVAL);
		vm86 = &td->td_pcb->pcb_ext->ext_vm86;
		if (vm86->vm86_inited == 0)
			return (EINVAL);

		/* Go back to user mode if both flags are set. */
		if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) {
			ksiginfo_init_trap(&ksi);
			ksi.ksi_signo = SIGBUS;
			ksi.ksi_code = BUS_OBJERR;
			ksi.ksi_addr = (void *)regs->tf_eip;
			trapsignal(td, &ksi);
		}

		if (vm86->vm86_has_vme) {
			eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
			    (eflags & VME_USERCHANGE) | PSL_VM;
		} else {
			vm86->vm86_eflags = eflags;	/* save VIF, VIP */
			eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
			    (eflags & VM_USERCHANGE) | PSL_VM;
		}
		bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe));
		tf->tf_eflags = eflags;
		/* vm86 keeps its own copies of the data segment registers. */
		tf->tf_vm86_ds = tf->tf_ds;
		tf->tf_vm86_es = tf->tf_es;
		tf->tf_vm86_fs = tf->tf_fs;
		tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs;
		tf->tf_ds = _udatasel;
		tf->tf_es = _udatasel;
		tf->tf_fs = _udatasel;
	} else {
		/*
		 * Don't allow users to change privileged or reserved flags.
		 */
		if (!EFL_SECURE(eflags, regs->tf_eflags)) {
			uprintf("pid %d (%s): sigreturn eflags = 0x%x\n",
			    td->td_proc->p_pid, td->td_name, eflags);
			return (EINVAL);
		}

		/*
		 * Don't allow users to load a valid privileged %cs.  Let the
		 * hardware check for invalid selectors, excess privilege in
		 * other selectors, invalid %eip's and invalid %esp's.
		 */
		cs = ucp->uc_mcontext.mc_cs;
		if (!CS_SECURE(cs)) {
			uprintf("pid %d (%s): sigreturn cs = 0x%x\n",
			    td->td_proc->p_pid, td->td_name, cs);
			ksiginfo_init_trap(&ksi);
			ksi.ksi_signo = SIGBUS;
			ksi.ksi_code = BUS_OBJERR;
			ksi.ksi_trapno = T_PROTFLT;
			ksi.ksi_addr = (void *)regs->tf_eip;
			trapsignal(td, &ksi);
			return (EINVAL);
		}

		ret = set_fpcontext(td, &ucp->uc_mcontext);
		if (ret != 0)
			return (ret);
		bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs));
	}

#if defined(COMPAT_43)
	if (ucp->uc_mcontext.mc_onstack & 1)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
	else
		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
#endif

	kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
	return (EJUSTRETURN);
}

/*
 * Machine dependent boot() routine
 *
 * I haven't seen anything to put here yet
 * Possibly some stuff might be grafted back here from boot()
 */
void
cpu_boot(int howto)
{
}

/*
 * Flush the D-cache for non-DMA I/O so that the I-cache can
 * be made coherent later.
 */
void
cpu_flush_dcache(void *ptr, size_t len)
{
	/* Not applicable */
}

/*
 * Get current clock frequency for the given cpu id, in Hz, via *rate.
 * Returns EINVAL for a bad cpu id or NULL rate pointer, EOPNOTSUPP when
 * no usable TSC-based method exists.  On SMP the thread is temporarily
 * bound to the target CPU so the TSC read happens there.
 */
int
cpu_est_clockrate(int cpu_id, uint64_t *rate)
{
	uint64_t tsc1, tsc2;
	uint64_t acnt, mcnt, perf;
	register_t reg;

	if (pcpu_find(cpu_id) == NULL || rate == NULL)
		return (EINVAL);
	if ((cpu_feature & CPUID_TSC) == 0)
		return (EOPNOTSUPP);

	/*
	 * If TSC is P-state invariant and APERF/MPERF MSRs do not exist,
	 * DELAY(9) based logic fails.
	 */
	if (tsc_is_invariant && !tsc_perf_stat)
		return (EOPNOTSUPP);

#ifdef SMP
	if (smp_cpus > 1) {
		/* Schedule ourselves on the indicated cpu. */
		thread_lock(curthread);
		sched_bind(curthread, cpu_id);
		thread_unlock(curthread);
	}
#endif

	/* Calibrate by measuring a short delay with interrupts off. */
	reg = intr_disable();
	if (tsc_is_invariant) {
		/* Scale the invariant TSC by the APERF/MPERF ratio. */
		wrmsr(MSR_MPERF, 0);
		wrmsr(MSR_APERF, 0);
		tsc1 = rdtsc();
		DELAY(1000);
		mcnt = rdmsr(MSR_MPERF);
		acnt = rdmsr(MSR_APERF);
		tsc2 = rdtsc();
		intr_restore(reg);
		perf = 1000 * acnt / mcnt;
		*rate = (tsc2 - tsc1) * perf;
	} else {
		tsc1 = rdtsc();
		DELAY(1000);
		tsc2 = rdtsc();
		intr_restore(reg);
		*rate = (tsc2 - tsc1) * 1000;
	}

#ifdef SMP
	if (smp_cpus > 1) {
		thread_lock(curthread);
		sched_unbind(curthread);
		thread_unlock(curthread);
	}
#endif

	return (0);
}

#ifdef XEN

static void
idle_block(void)
{

	/* Yield to the hypervisor until an event is pending. */
	HYPERVISOR_sched_op(SCHEDOP_block, 0);
}

void
cpu_halt(void)
{
	HYPERVISOR_shutdown(SHUTDOWN_poweroff);
}

int scheduler_running;

static void
cpu_idle_hlt(sbintime_t sbt)
{

	scheduler_running = 1;
	enable_intr();
	idle_block();
}

#else
/*
 * Shutdown the CPU as much as possible
 */
void
cpu_halt(void)
{
	for (;;)
		halt();
}

#endif

void (*cpu_idle_hook)(sbintime_t) = NULL;	/* ACPI idle hook. */
static int	cpu_ident_amdc1e = 0;	/* AMD C1E supported. */
static int	idle_mwait = 1;		/* Use MONITOR/MWAIT for short idle.
 */
TUNABLE_INT("machdep.idle_mwait", &idle_mwait);
SYSCTL_INT(_machdep, OID_AUTO, idle_mwait, CTLFLAG_RW, &idle_mwait,
    0, "Use MONITOR/MWAIT for short idle");

/* Per-CPU idle state, kept in the PCPU monitorbuf so MWAIT can watch it. */
#define	STATE_RUNNING	0x0
#define	STATE_MWAIT	0x1
#define	STATE_SLEEPING	0x2

/* Idle via the ACPI hook when present, otherwise plain sti;hlt. */
static void
cpu_idle_acpi(sbintime_t sbt)
{
	int *state;

	state = (int *)PCPU_PTR(monitorbuf);
	*state = STATE_SLEEPING;

	/* See comments in cpu_idle_hlt(). */
	disable_intr();
	if (sched_runnable())
		enable_intr();
	else if (cpu_idle_hook)
		cpu_idle_hook(sbt);
	else
		__asm __volatile("sti; hlt");
	*state = STATE_RUNNING;
}

#ifndef XEN
static void
cpu_idle_hlt(sbintime_t sbt)
{
	int *state;

	state = (int *)PCPU_PTR(monitorbuf);
	*state = STATE_SLEEPING;

	/*
	 * Since we may be in a critical section from cpu_idle(), if
	 * an interrupt fires during that critical section we may have
	 * a pending preemption.  If the CPU halts, then that thread
	 * may not execute until a later interrupt awakens the CPU.
	 * To handle this race, check for a runnable thread after
	 * disabling interrupts and immediately return if one is
	 * found.  Also, we must absolutely guarantee that hlt is
	 * the next instruction after sti.  This ensures that any
	 * interrupt that fires after the call to disable_intr() will
	 * immediately awaken the CPU from hlt.  Finally, please note
	 * that on x86 this works fine because of interrupts enabled only
	 * after the instruction following sti takes place, while IF is set
	 * to 1 immediately, allowing hlt instruction to acknowledge the
	 * interrupt.
	 */
	disable_intr();
	if (sched_runnable())
		enable_intr();
	else
		__asm __volatile("sti; hlt");
	*state = STATE_RUNNING;
}
#endif

/*
 * MWAIT cpu power states.  Lower 4 bits are sub-states.
 */
#define	MWAIT_C0	0xf0
#define	MWAIT_C1	0x00
#define	MWAIT_C2	0x10
#define	MWAIT_C3	0x20
#define	MWAIT_C4	0x30

/*
 * Idle using MONITOR/MWAIT on the per-CPU state word.  cpu_idle_wakeup()
 * can then wake this CPU simply by storing STATE_RUNNING into *state,
 * without an IPI.
 */
static void
cpu_idle_mwait(sbintime_t sbt)
{
	int *state;

	state = (int *)PCPU_PTR(monitorbuf);
	*state = STATE_MWAIT;

	/* See comments in cpu_idle_hlt(). */
	disable_intr();
	if (sched_runnable()) {
		enable_intr();
		*state = STATE_RUNNING;
		return;
	}
	/* Arm the monitor, then re-check state before actually sleeping. */
	cpu_monitor(state, 0, 0);
	if (*state == STATE_MWAIT)
		__asm __volatile("sti; mwait" : : "a" (MWAIT_C1), "c" (0));
	else
		enable_intr();
	*state = STATE_RUNNING;
}

/* Busy-wait idle: spin a bounded number of times polling the run queue. */
static void
cpu_idle_spin(sbintime_t sbt)
{
	int *state;
	int i;

	state = (int *)PCPU_PTR(monitorbuf);
	*state = STATE_RUNNING;

	/*
	 * The sched_runnable() call is racy but as long as there is
	 * a loop missing it one time will have just a little impact if any
	 * (and it is much better than missing the check at all).
	 */
	for (i = 0; i < 1000; i++) {
		if (sched_runnable())
			return;
		cpu_spinwait();
	}
}

/*
 * C1E renders the local APIC timer dead, so we disable it by
 * reading the Interrupt Pending Message register and clearing
 * both C1eOnCmpHalt (bit 28) and SmiOnCmpHalt (bit 27).
 *
 * Reference:
 *   "BIOS and Kernel Developer's Guide for AMD NPT Family 0Fh Processors"
 *   #32559 revision 3.00+
 */
#define	MSR_AMDK8_IPM		0xc0010055
#define	AMDK8_SMIONCMPHALT	(1ULL << 27)
#define	AMDK8_C1EONCMPHALT	(1ULL << 28)
#define	AMDK8_CMPHALT		(AMDK8_SMIONCMPHALT | AMDK8_C1EONCMPHALT)

static void
cpu_probe_amdc1e(void)
{

	/*
	 * Detect the presence of C1E capability mostly on latest
	 * dual-cores (or future) k8 family.
	 */
	if (cpu_vendor_id == CPU_VENDOR_AMD &&
	    (cpu_id & 0x00000f00) == 0x00000f00 &&
	    (cpu_id & 0x0fff0000) >= 0x00040000) {
		cpu_ident_amdc1e = 1;
	}
}

#ifdef XEN
void (*cpu_idle_fn)(sbintime_t) = cpu_idle_hlt;
#else
void (*cpu_idle_fn)(sbintime_t) = cpu_idle_acpi;
#endif

/*
 * Main idle entry point.  'busy' is a hint that a wakeup is expected
 * soon, in which case the event timers are left alone and a fast idle
 * method (MWAIT) is preferred.
 */
void
cpu_idle(int busy)
{
#ifndef XEN
	uint64_t msr;
#endif
	sbintime_t sbt = -1;

	CTR2(KTR_SPARE2, "cpu_idle(%d) at %d",
	    busy, curcpu);
#if defined(MP_WATCHDOG) && !defined(XEN)
	ap_watchdog(PCPU_GET(cpuid));
#endif
#ifndef XEN
	/* If we are busy - try to use fast methods. */
	if (busy) {
		if ((cpu_feature2 & CPUID2_MON) && idle_mwait) {
			cpu_idle_mwait(busy);
			goto out;
		}
	}
#endif

	/* If we have time - switch timers into idle mode. */
	if (!busy) {
		critical_enter();
		sbt = cpu_idleclock();
	}

#ifndef XEN
	/* Apply AMD APIC timer C1E workaround. */
	if (cpu_ident_amdc1e && cpu_disable_deep_sleep) {
		msr = rdmsr(MSR_AMDK8_IPM);
		if (msr & AMDK8_CMPHALT)
			wrmsr(MSR_AMDK8_IPM, msr & ~AMDK8_CMPHALT);
	}
#endif

	/* Call main idle method. */
	cpu_idle_fn(sbt);

	/* Switch timers back into active mode. */
	if (!busy) {
		cpu_activeclock();
		critical_exit();
	}
#ifndef XEN
out:
#endif
	CTR2(KTR_SPARE2, "cpu_idle(%d) at %d done",
	    busy, curcpu);
}

/*
 * Wake an idle CPU.  Returns 0 when the caller still needs to send an
 * IPI (the CPU is in a real sleep state), 1 when the store to the
 * monitored state word is sufficient (MWAIT) or the CPU was not asleep.
 */
int
cpu_idle_wakeup(int cpu)
{
	struct pcpu *pcpu;
	int *state;

	pcpu = pcpu_find(cpu);
	state = (int *)pcpu->pc_monitorbuf;
	/*
	 * This doesn't need to be atomic since missing the race will
	 * simply result in unnecessary IPIs.
	 */
	if (*state == STATE_SLEEPING)
		return (0);
	if (*state == STATE_MWAIT)
		*state = STATE_RUNNING;
	return (1);
}

/*
 * Ordered by speed/power consumption.
1451 */ 1452struct { 1453 void *id_fn; 1454 char *id_name; 1455} idle_tbl[] = { 1456 { cpu_idle_spin, "spin" }, 1457 { cpu_idle_mwait, "mwait" }, 1458 { cpu_idle_hlt, "hlt" }, 1459 { cpu_idle_acpi, "acpi" }, 1460 { NULL, NULL } 1461}; 1462 1463static int 1464idle_sysctl_available(SYSCTL_HANDLER_ARGS) 1465{ 1466 char *avail, *p; 1467 int error; 1468 int i; 1469 1470 avail = malloc(256, M_TEMP, M_WAITOK); 1471 p = avail; 1472 for (i = 0; idle_tbl[i].id_name != NULL; i++) { 1473 if (strstr(idle_tbl[i].id_name, "mwait") && 1474 (cpu_feature2 & CPUID2_MON) == 0) 1475 continue; 1476 if (strcmp(idle_tbl[i].id_name, "acpi") == 0 && 1477 cpu_idle_hook == NULL) 1478 continue; 1479 p += sprintf(p, "%s%s", p != avail ? ", " : "", 1480 idle_tbl[i].id_name); 1481 } 1482 error = sysctl_handle_string(oidp, avail, 0, req); 1483 free(avail, M_TEMP); 1484 return (error); 1485} 1486 1487SYSCTL_PROC(_machdep, OID_AUTO, idle_available, CTLTYPE_STRING | CTLFLAG_RD, 1488 0, 0, idle_sysctl_available, "A", "list of available idle functions"); 1489 1490static int 1491idle_sysctl(SYSCTL_HANDLER_ARGS) 1492{ 1493 char buf[16]; 1494 int error; 1495 char *p; 1496 int i; 1497 1498 p = "unknown"; 1499 for (i = 0; idle_tbl[i].id_name != NULL; i++) { 1500 if (idle_tbl[i].id_fn == cpu_idle_fn) { 1501 p = idle_tbl[i].id_name; 1502 break; 1503 } 1504 } 1505 strncpy(buf, p, sizeof(buf)); 1506 error = sysctl_handle_string(oidp, buf, sizeof(buf), req); 1507 if (error != 0 || req->newptr == NULL) 1508 return (error); 1509 for (i = 0; idle_tbl[i].id_name != NULL; i++) { 1510 if (strstr(idle_tbl[i].id_name, "mwait") && 1511 (cpu_feature2 & CPUID2_MON) == 0) 1512 continue; 1513 if (strcmp(idle_tbl[i].id_name, "acpi") == 0 && 1514 cpu_idle_hook == NULL) 1515 continue; 1516 if (strcmp(idle_tbl[i].id_name, buf)) 1517 continue; 1518 cpu_idle_fn = idle_tbl[i].id_fn; 1519 return (0); 1520 } 1521 return (EINVAL); 1522} 1523 1524SYSCTL_PROC(_machdep, OID_AUTO, idle, CTLTYPE_STRING | CTLFLAG_RW, 0, 0, 1525 idle_sysctl, 
"A", "currently selected idle function"); 1526 1527/* 1528 * Reset registers to default values on exec. 1529 */ 1530void 1531exec_setregs(struct thread *td, struct image_params *imgp, u_long stack) 1532{ 1533 struct trapframe *regs = td->td_frame; 1534 struct pcb *pcb = td->td_pcb; 1535 1536 /* Reset pc->pcb_gs and %gs before possibly invalidating it. */ 1537 pcb->pcb_gs = _udatasel; 1538 load_gs(_udatasel); 1539 1540 mtx_lock_spin(&dt_lock); 1541 if (td->td_proc->p_md.md_ldt) 1542 user_ldt_free(td); 1543 else 1544 mtx_unlock_spin(&dt_lock); 1545 1546 bzero((char *)regs, sizeof(struct trapframe)); 1547 regs->tf_eip = imgp->entry_addr; 1548 regs->tf_esp = stack; 1549 regs->tf_eflags = PSL_USER | (regs->tf_eflags & PSL_T); 1550 regs->tf_ss = _udatasel; 1551 regs->tf_ds = _udatasel; 1552 regs->tf_es = _udatasel; 1553 regs->tf_fs = _udatasel; 1554 regs->tf_cs = _ucodesel; 1555 1556 /* PS_STRINGS value for BSD/OS binaries. It is 0 for non-BSD/OS. */ 1557 regs->tf_ebx = imgp->ps_strings; 1558 1559 /* 1560 * Reset the hardware debug registers if they were in use. 1561 * They won't have any meaning for the newly exec'd process. 1562 */ 1563 if (pcb->pcb_flags & PCB_DBREGS) { 1564 pcb->pcb_dr0 = 0; 1565 pcb->pcb_dr1 = 0; 1566 pcb->pcb_dr2 = 0; 1567 pcb->pcb_dr3 = 0; 1568 pcb->pcb_dr6 = 0; 1569 pcb->pcb_dr7 = 0; 1570 if (pcb == curpcb) { 1571 /* 1572 * Clear the debug registers on the running 1573 * CPU, otherwise they will end up affecting 1574 * the next process we switch to. 1575 */ 1576 reset_dbregs(); 1577 } 1578 pcb->pcb_flags &= ~PCB_DBREGS; 1579 } 1580 1581 /* 1582 * Initialize the math emulator (if any) for the current process. 1583 * Actually, just clear the bit that says that the emulator has 1584 * been initialized. Initialization is delayed until the process 1585 * traps to the emulator (if it is done at all) mainly because 1586 * emulators don't provide an entry point for initialization. 
1587 */ 1588 td->td_pcb->pcb_flags &= ~FP_SOFTFP; 1589 pcb->pcb_initial_npxcw = __INITIAL_NPXCW__; 1590 1591 /* 1592 * Drop the FP state if we hold it, so that the process gets a 1593 * clean FP state if it uses the FPU again. 1594 */ 1595 fpstate_drop(td); 1596 1597 /* 1598 * XXX - Linux emulator 1599 * Make sure sure edx is 0x0 on entry. Linux binaries depend 1600 * on it. 1601 */ 1602 td->td_retval[1] = 0; 1603} 1604 1605void 1606cpu_setregs(void) 1607{ 1608 unsigned int cr0; 1609 1610 cr0 = rcr0(); 1611 1612 /* 1613 * CR0_MP, CR0_NE and CR0_TS are set for NPX (FPU) support: 1614 * 1615 * Prepare to trap all ESC (i.e., NPX) instructions and all WAIT 1616 * instructions. We must set the CR0_MP bit and use the CR0_TS 1617 * bit to control the trap, because setting the CR0_EM bit does 1618 * not cause WAIT instructions to trap. It's important to trap 1619 * WAIT instructions - otherwise the "wait" variants of no-wait 1620 * control instructions would degenerate to the "no-wait" variants 1621 * after FP context switches but work correctly otherwise. It's 1622 * particularly important to trap WAITs when there is no NPX - 1623 * otherwise the "wait" variants would always degenerate. 1624 * 1625 * Try setting CR0_NE to get correct error reporting on 486DX's. 1626 * Setting it should fail or do nothing on lesser processors. 
 */
	cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
	load_cr0(cr0);
	load_gs(_udatasel);
}

u_long bootdev;		/* not a struct cdev *- encoding is different */
SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev,
	CTLFLAG_RD, &bootdev, 0, "Maybe the Boot device (not in struct cdev *format)");

static char bootmethod[16] = "BIOS";
SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0,
    "System firmware boot method");

/*
 * Initialize 386 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */

int _default_ldt;

#ifdef XEN
/* Under Xen the GDT/LDT are allocated dynamically. */
union descriptor *gdt;
union descriptor *ldt;
#else
union descriptor gdt[NGDT * MAXCPU];	/* global descriptor table */
union descriptor ldt[NLDT];		/* local descriptor table */
#endif
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */
struct region_descriptor r_gdt, r_idt;	/* table descriptors */
struct mtx dt_lock;			/* lock for GDT and LDT */

/* Dedicated TSS and stack used to handle double faults. */
static struct i386tss dblfault_tss;
static char dblfault_stack[PAGE_SIZE];

extern  vm_offset_t	proc0kstack;


/*
 * software prototypes -- in more palatable form.
1671 * 1672 * GCODE_SEL through GUDATA_SEL must be in this order for syscall/sysret 1673 * GUFS_SEL and GUGS_SEL must be in this order (swtch.s knows it) 1674 */ 1675struct soft_segment_descriptor gdt_segs[] = { 1676/* GNULL_SEL 0 Null Descriptor */ 1677{ .ssd_base = 0x0, 1678 .ssd_limit = 0x0, 1679 .ssd_type = 0, 1680 .ssd_dpl = SEL_KPL, 1681 .ssd_p = 0, 1682 .ssd_xx = 0, .ssd_xx1 = 0, 1683 .ssd_def32 = 0, 1684 .ssd_gran = 0 }, 1685/* GPRIV_SEL 1 SMP Per-Processor Private Data Descriptor */ 1686{ .ssd_base = 0x0, 1687 .ssd_limit = 0xfffff, 1688 .ssd_type = SDT_MEMRWA, 1689 .ssd_dpl = SEL_KPL, 1690 .ssd_p = 1, 1691 .ssd_xx = 0, .ssd_xx1 = 0, 1692 .ssd_def32 = 1, 1693 .ssd_gran = 1 }, 1694/* GUFS_SEL 2 %fs Descriptor for user */ 1695{ .ssd_base = 0x0, 1696 .ssd_limit = 0xfffff, 1697 .ssd_type = SDT_MEMRWA, 1698 .ssd_dpl = SEL_UPL, 1699 .ssd_p = 1, 1700 .ssd_xx = 0, .ssd_xx1 = 0, 1701 .ssd_def32 = 1, 1702 .ssd_gran = 1 }, 1703/* GUGS_SEL 3 %gs Descriptor for user */ 1704{ .ssd_base = 0x0, 1705 .ssd_limit = 0xfffff, 1706 .ssd_type = SDT_MEMRWA, 1707 .ssd_dpl = SEL_UPL, 1708 .ssd_p = 1, 1709 .ssd_xx = 0, .ssd_xx1 = 0, 1710 .ssd_def32 = 1, 1711 .ssd_gran = 1 }, 1712/* GCODE_SEL 4 Code Descriptor for kernel */ 1713{ .ssd_base = 0x0, 1714 .ssd_limit = 0xfffff, 1715 .ssd_type = SDT_MEMERA, 1716 .ssd_dpl = SEL_KPL, 1717 .ssd_p = 1, 1718 .ssd_xx = 0, .ssd_xx1 = 0, 1719 .ssd_def32 = 1, 1720 .ssd_gran = 1 }, 1721/* GDATA_SEL 5 Data Descriptor for kernel */ 1722{ .ssd_base = 0x0, 1723 .ssd_limit = 0xfffff, 1724 .ssd_type = SDT_MEMRWA, 1725 .ssd_dpl = SEL_KPL, 1726 .ssd_p = 1, 1727 .ssd_xx = 0, .ssd_xx1 = 0, 1728 .ssd_def32 = 1, 1729 .ssd_gran = 1 }, 1730/* GUCODE_SEL 6 Code Descriptor for user */ 1731{ .ssd_base = 0x0, 1732 .ssd_limit = 0xfffff, 1733 .ssd_type = SDT_MEMERA, 1734 .ssd_dpl = SEL_UPL, 1735 .ssd_p = 1, 1736 .ssd_xx = 0, .ssd_xx1 = 0, 1737 .ssd_def32 = 1, 1738 .ssd_gran = 1 }, 1739/* GUDATA_SEL 7 Data Descriptor for user */ 1740{ .ssd_base = 0x0, 1741 .ssd_limit = 
0xfffff, 1742 .ssd_type = SDT_MEMRWA, 1743 .ssd_dpl = SEL_UPL, 1744 .ssd_p = 1, 1745 .ssd_xx = 0, .ssd_xx1 = 0, 1746 .ssd_def32 = 1, 1747 .ssd_gran = 1 }, 1748/* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */ 1749{ .ssd_base = 0x400, 1750 .ssd_limit = 0xfffff, 1751 .ssd_type = SDT_MEMRWA, 1752 .ssd_dpl = SEL_KPL, 1753 .ssd_p = 1, 1754 .ssd_xx = 0, .ssd_xx1 = 0, 1755 .ssd_def32 = 1, 1756 .ssd_gran = 1 }, 1757#ifndef XEN 1758/* GPROC0_SEL 9 Proc 0 Tss Descriptor */ 1759{ 1760 .ssd_base = 0x0, 1761 .ssd_limit = sizeof(struct i386tss)-1, 1762 .ssd_type = SDT_SYS386TSS, 1763 .ssd_dpl = 0, 1764 .ssd_p = 1, 1765 .ssd_xx = 0, .ssd_xx1 = 0, 1766 .ssd_def32 = 0, 1767 .ssd_gran = 0 }, 1768/* GLDT_SEL 10 LDT Descriptor */ 1769{ .ssd_base = (int) ldt, 1770 .ssd_limit = sizeof(ldt)-1, 1771 .ssd_type = SDT_SYSLDT, 1772 .ssd_dpl = SEL_UPL, 1773 .ssd_p = 1, 1774 .ssd_xx = 0, .ssd_xx1 = 0, 1775 .ssd_def32 = 0, 1776 .ssd_gran = 0 }, 1777/* GUSERLDT_SEL 11 User LDT Descriptor per process */ 1778{ .ssd_base = (int) ldt, 1779 .ssd_limit = (512 * sizeof(union descriptor)-1), 1780 .ssd_type = SDT_SYSLDT, 1781 .ssd_dpl = 0, 1782 .ssd_p = 1, 1783 .ssd_xx = 0, .ssd_xx1 = 0, 1784 .ssd_def32 = 0, 1785 .ssd_gran = 0 }, 1786/* GPANIC_SEL 12 Panic Tss Descriptor */ 1787{ .ssd_base = (int) &dblfault_tss, 1788 .ssd_limit = sizeof(struct i386tss)-1, 1789 .ssd_type = SDT_SYS386TSS, 1790 .ssd_dpl = 0, 1791 .ssd_p = 1, 1792 .ssd_xx = 0, .ssd_xx1 = 0, 1793 .ssd_def32 = 0, 1794 .ssd_gran = 0 }, 1795/* GBIOSCODE32_SEL 13 BIOS 32-bit interface (32bit Code) */ 1796{ .ssd_base = 0, 1797 .ssd_limit = 0xfffff, 1798 .ssd_type = SDT_MEMERA, 1799 .ssd_dpl = 0, 1800 .ssd_p = 1, 1801 .ssd_xx = 0, .ssd_xx1 = 0, 1802 .ssd_def32 = 0, 1803 .ssd_gran = 1 }, 1804/* GBIOSCODE16_SEL 14 BIOS 32-bit interface (16bit Code) */ 1805{ .ssd_base = 0, 1806 .ssd_limit = 0xfffff, 1807 .ssd_type = SDT_MEMERA, 1808 .ssd_dpl = 0, 1809 .ssd_p = 1, 1810 .ssd_xx = 0, .ssd_xx1 = 0, 1811 .ssd_def32 = 0, 1812 
.ssd_gran = 1 }, 1813/* GBIOSDATA_SEL 15 BIOS 32-bit interface (Data) */ 1814{ .ssd_base = 0, 1815 .ssd_limit = 0xfffff, 1816 .ssd_type = SDT_MEMRWA, 1817 .ssd_dpl = 0, 1818 .ssd_p = 1, 1819 .ssd_xx = 0, .ssd_xx1 = 0, 1820 .ssd_def32 = 1, 1821 .ssd_gran = 1 }, 1822/* GBIOSUTIL_SEL 16 BIOS 16-bit interface (Utility) */ 1823{ .ssd_base = 0, 1824 .ssd_limit = 0xfffff, 1825 .ssd_type = SDT_MEMRWA, 1826 .ssd_dpl = 0, 1827 .ssd_p = 1, 1828 .ssd_xx = 0, .ssd_xx1 = 0, 1829 .ssd_def32 = 0, 1830 .ssd_gran = 1 }, 1831/* GBIOSARGS_SEL 17 BIOS 16-bit interface (Arguments) */ 1832{ .ssd_base = 0, 1833 .ssd_limit = 0xfffff, 1834 .ssd_type = SDT_MEMRWA, 1835 .ssd_dpl = 0, 1836 .ssd_p = 1, 1837 .ssd_xx = 0, .ssd_xx1 = 0, 1838 .ssd_def32 = 0, 1839 .ssd_gran = 1 }, 1840/* GNDIS_SEL 18 NDIS Descriptor */ 1841{ .ssd_base = 0x0, 1842 .ssd_limit = 0x0, 1843 .ssd_type = 0, 1844 .ssd_dpl = 0, 1845 .ssd_p = 0, 1846 .ssd_xx = 0, .ssd_xx1 = 0, 1847 .ssd_def32 = 0, 1848 .ssd_gran = 0 }, 1849#endif /* !XEN */ 1850}; 1851 1852static struct soft_segment_descriptor ldt_segs[] = { 1853 /* Null Descriptor - overwritten by call gate */ 1854{ .ssd_base = 0x0, 1855 .ssd_limit = 0x0, 1856 .ssd_type = 0, 1857 .ssd_dpl = 0, 1858 .ssd_p = 0, 1859 .ssd_xx = 0, .ssd_xx1 = 0, 1860 .ssd_def32 = 0, 1861 .ssd_gran = 0 }, 1862 /* Null Descriptor - overwritten by call gate */ 1863{ .ssd_base = 0x0, 1864 .ssd_limit = 0x0, 1865 .ssd_type = 0, 1866 .ssd_dpl = 0, 1867 .ssd_p = 0, 1868 .ssd_xx = 0, .ssd_xx1 = 0, 1869 .ssd_def32 = 0, 1870 .ssd_gran = 0 }, 1871 /* Null Descriptor - overwritten by call gate */ 1872{ .ssd_base = 0x0, 1873 .ssd_limit = 0x0, 1874 .ssd_type = 0, 1875 .ssd_dpl = 0, 1876 .ssd_p = 0, 1877 .ssd_xx = 0, .ssd_xx1 = 0, 1878 .ssd_def32 = 0, 1879 .ssd_gran = 0 }, 1880 /* Code Descriptor for user */ 1881{ .ssd_base = 0x0, 1882 .ssd_limit = 0xfffff, 1883 .ssd_type = SDT_MEMERA, 1884 .ssd_dpl = SEL_UPL, 1885 .ssd_p = 1, 1886 .ssd_xx = 0, .ssd_xx1 = 0, 1887 .ssd_def32 = 1, 1888 .ssd_gran = 1 }, 1889 /* 
 Null Descriptor - overwritten by call gate */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
	/* Data Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
};

/*
 * Install an IDT entry: point vector 'idx' at handler 'func' with gate
 * type 'typ', privilege 'dpl' and code selector 'selec'.  The 32-bit
 * handler offset is split across the low/high offset fields of the gate.
 */
void
setidt(idx, func, typ, dpl, selec)
	int idx;
	inthand_t *func;
	int typ;
	int dpl;
	int selec;
{
	struct gate_descriptor *ip;

	ip = idt + idx;
	ip->gd_looffset = (int)func;
	ip->gd_selector = selec;
	ip->gd_stkcpy = 0;
	ip->gd_xx = 0;
	ip->gd_type = typ;
	ip->gd_dpl = dpl;
	ip->gd_p = 1;
	ip->gd_hioffset = ((int)func)>>16 ;
}

extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm),
#ifdef KDTRACE_HOOKS
	IDTVEC(dtrace_ret),
#endif
#ifdef XENHVM
	IDTVEC(xen_intr_upcall),
#endif
	IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall);

#ifdef DDB
/*
 * Display the index and function name of any IDT entries that don't use
 * the default 'rsvd' entry point.
 */
DB_SHOW_COMMAND(idt, db_show_idt)
{
	struct gate_descriptor *ip;
	int idx;
	uintptr_t func;

	ip = idt;
	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
		func = (ip->gd_hioffset << 16 | ip->gd_looffset);
		if (func != (uintptr_t)&IDTVEC(rsvd)) {
			db_printf("%3d\t", idx);
			db_printsym(func, DB_STGY_PROC);
			db_printf("\n");
		}
		ip++;
	}
}

/* Show privileged registers.
 */
DB_SHOW_COMMAND(sysregs, db_show_sysregs)
{
	uint64_t idtr, gdtr;

	/* ridt()/rgdt() pack base in the high 48 bits, limit in the low 16. */
	idtr = ridt();
	db_printf("idtr\t0x%08x/%04x\n",
	    (u_int)(idtr >> 16), (u_int)idtr & 0xffff);
	gdtr = rgdt();
	db_printf("gdtr\t0x%08x/%04x\n",
	    (u_int)(gdtr >> 16), (u_int)gdtr & 0xffff);
	db_printf("ldtr\t0x%04x\n", rldt());
	db_printf("tr\t0x%04x\n", rtr());
	db_printf("cr0\t0x%08x\n", rcr0());
	db_printf("cr2\t0x%08x\n", rcr2());
	db_printf("cr3\t0x%08x\n", rcr3());
	db_printf("cr4\t0x%08x\n", rcr4());
}
#endif

/*
 * Unpack a hardware segment descriptor into the software
 * soft_segment_descriptor form (base/limit reassembled from their
 * split hi/lo fields).
 */
void
sdtossd(sd, ssd)
	struct segment_descriptor *sd;
	struct soft_segment_descriptor *ssd;
{
	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type  = sd->sd_type;
	ssd->ssd_dpl   = sd->sd_dpl;
	ssd->ssd_p     = sd->sd_p;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran  = sd->sd_gran;
}

#ifndef XEN
/*
 * Merge one BIOS SMAP entry into the physmap base/bound array.
 * Returns 1 to continue processing further entries (including when this
 * entry was skipped or coalesced), 0 when physmap is full and the caller
 * should stop.
 */
static int
add_smap_entry(struct bios_smap *smap, vm_paddr_t *physmap, int *physmap_idxp)
{
	int i, insert_idx, physmap_idx;

	physmap_idx = *physmap_idxp;

	if (boothowto & RB_VERBOSE)
		printf("SMAP type=%02x base=%016llx len=%016llx\n",
		    smap->type, smap->base, smap->length);

	if (smap->type != SMAP_TYPE_MEMORY)
		return (1);

	if (smap->length == 0)
		return (1);

#ifndef PAE
	/* Without PAE, addresses above 4GB are not mappable; drop them. */
	if (smap->base > 0xffffffff) {
		printf("%uK of memory above 4GB ignored\n",
		    (u_int)(smap->length / 1024));
		return (1);
	}
#endif

	/*
	 * Find insertion point while checking for overlap.  Start off by
	 * assuming the new entry will be added to the end.
 */
	insert_idx = physmap_idx + 2;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (smap->base < physmap[i + 1]) {
			if (smap->base + smap->length <= physmap[i]) {
				insert_idx = i;
				break;
			}
			if (boothowto & RB_VERBOSE)
				printf(
		    "Overlapping memory regions, ignoring second region\n");
			return (1);
		}
	}

	/* See if we can prepend to the next entry. */
	if (insert_idx <= physmap_idx &&
	    smap->base + smap->length == physmap[insert_idx]) {
		physmap[insert_idx] = smap->base;
		return (1);
	}

	/* See if we can append to the previous entry. */
	if (insert_idx > 0 && smap->base == physmap[insert_idx - 1]) {
		physmap[insert_idx - 1] += smap->length;
		return (1);
	}

	physmap_idx += 2;
	*physmap_idxp = physmap_idx;
	if (physmap_idx == PHYSMAP_SIZE) {
		printf(
		"Too many segments in the physical address map, giving up\n");
		return (0);
	}

	/*
	 * Move the last 'N' entries down to make room for the new
	 * entry if needed.
	 */
	for (i = physmap_idx; i > insert_idx; i -= 2) {
		physmap[i] = physmap[i - 2];
		physmap[i + 1] = physmap[i - 1];
	}

	/* Insert the new entry. */
	physmap[insert_idx] = smap->base;
	physmap[insert_idx + 1] = smap->base + smap->length;
	return (1);
}

/*
 * Sanity-check the BIOS-reported base memory size (global 'basemem',
 * in KB) and map the region between the end of base memory and the ISA
 * hole both into the kernel map and into the vm86 page table.
 */
static void
basemem_setup(void)
{
	vm_paddr_t pa;
	pt_entry_t *pte;
	int i;

	if (basemem > 640) {
		printf("Preposterous BIOS basemem of %uK, truncating to 640K\n",
			basemem);
		basemem = 640;
	}

	/*
	 * XXX if biosbasemem is now < 640, there is a `hole'
	 * between the end of base memory and the start of
	 * ISA memory.  The hole may be empty or it may
	 * contain BIOS code or data.  Map it read/write so
	 * that the BIOS can write to it.
 (Memory from 0 to
	 * the physical end of the kernel is mapped read-only
	 * to begin with and then parts of it are remapped.
	 * The parts that aren't remapped form holes that
	 * remain read-only and are unused by the kernel.
	 * The base memory area is below the physical end of
	 * the kernel and right now forms a read-only hole.
	 * The part of it from PAGE_SIZE to
	 * (trunc_page(biosbasemem * 1024) - 1) will be
	 * remapped and used by the kernel later.)
	 *
	 * This code is similar to the code used in
	 * pmap_mapdev, but since no memory needs to be
	 * allocated we simply change the mapping.
	 */
	for (pa = trunc_page(basemem * 1024);
	     pa < ISA_HOLE_START; pa += PAGE_SIZE)
		pmap_kenter(KERNBASE + pa, pa);

	/*
	 * Map pages between basemem and ISA_HOLE_START, if any, r/w into
	 * the vm86 page table so that vm86 can scribble on them using
	 * the vm86 map too.  XXX: why 2 ways for this and only 1 way for
	 * page 0, at least as initialized here?
	 */
	pte = (pt_entry_t *)vm86paddr;
	/* basemem is in KB; one 4KB page covers 4 of those KB. */
	for (i = basemem / 4; i < 160; i++)
		pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U;
}
#endif

/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * If we cannot accurately determine the physical memory map, then use
 * value from the 0xE801 call, and failing that, the RTC.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * XXX first should be vm_paddr_t.
2142 */ 2143static void 2144getmemsize(int first) 2145{ /* Discover the machine's physical memory layout at boot: fill physmap[] from the loader-supplied SMAP, BIOS INT 15:E820/E801, or RTC, then build phys_avail[]/dump_avail[] (optionally memory-testing each page). */ 2146 int has_smap, off, physmap_idx, pa_indx, da_indx; 2147 u_long physmem_tunable, memtest; 2148 vm_paddr_t physmap[PHYSMAP_SIZE]; 2149 pt_entry_t *pte; 2150 quad_t dcons_addr, dcons_size; 2151#ifndef XEN 2152 int hasbrokenint12, i, res; 2153 u_int extmem; 2154 struct vm86frame vmf; 2155 struct vm86context vmc; 2156 vm_paddr_t pa; 2157 struct bios_smap *smap, *smapbase, *smapend; 2158 u_int32_t smapsize; 2159 caddr_t kmdp; 2160#endif 2161 2162 has_smap = 0; 2163#if defined(XEN) 2164 Maxmem = xen_start_info->nr_pages - init_first; 2165 physmem = Maxmem; 2166 basemem = 0; 2167 physmap[0] = init_first << PAGE_SHIFT; 2168 physmap[1] = ptoa(Maxmem) - round_page(msgbufsize); 2169 physmap_idx = 0; 2170#else 2171#ifdef XBOX 2172 if (arch_i386_is_xbox) { 2173 /* 2174 * We queried the memory size before, so chop off 4MB for 2175 * the framebuffer and inform the OS of this. 2176 */ 2177 physmap[0] = 0; 2178 physmap[1] = (arch_i386_xbox_memsize * 1024 * 1024) - XBOX_FB_SIZE; 2179 physmap_idx = 0; 2180 goto physmap_done; 2181 } 2182#endif 2183 bzero(&vmf, sizeof(vmf)); 2184 bzero(physmap, sizeof(physmap)); 2185 basemem = 0; 2186 2187 /* 2188 * Check if the loader supplied an SMAP memory map. If so, 2189 * use that and do not make any VM86 calls. 2190 */ 2191 physmap_idx = 0; 2192 smapbase = NULL; 2193 kmdp = preload_search_by_type("elf kernel"); 2194 if (kmdp == NULL) 2195 kmdp = preload_search_by_type("elf32 kernel"); 2196 if (kmdp != NULL) 2197 smapbase = (struct bios_smap *)preload_search_info(kmdp, 2198 MODINFO_METADATA | MODINFOMD_SMAP); 2199 if (smapbase != NULL) { 2200 /* 2201 * subr_module.c says: 2202 * "Consumer may safely assume that size value precedes data." 2203 * ie: an int32_t immediately precedes SMAP.
2204 */ 2205 smapsize = *((u_int32_t *)smapbase - 1); 2206 smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize); 2207 has_smap = 1; 2208 2209 for (smap = smapbase; smap < smapend; smap++) 2210 if (!add_smap_entry(smap, physmap, &physmap_idx)) 2211 break; 2212 goto have_smap; 2213 } 2214 2215 /* 2216 * Some newer BIOSes have a broken INT 12H implementation 2217 * which causes a kernel panic immediately. In this case, we 2218 * need to use the SMAP to determine the base memory size. 2219 */ 2220 hasbrokenint12 = 0; 2221 TUNABLE_INT_FETCH("hw.hasbrokenint12", &hasbrokenint12); 2222 if (hasbrokenint12 == 0) { 2223 /* Use INT12 to determine base memory size. */ 2224 vm86_intcall(0x12, &vmf); 2225 basemem = vmf.vmf_ax; 2226 basemem_setup(); 2227 } 2228 2229 /* 2230 * Fetch the memory map with INT 15:E820. Map page 1 R/W into 2231 * the kernel page table so we can use it as a buffer. The 2232 * kernel will unmap this page later. 2233 */ 2234 pmap_kenter(KERNBASE + (1 << PAGE_SHIFT), 1 << PAGE_SHIFT); 2235 vmc.npages = 0; 2236 smap = (void *)vm86_addpage(&vmc, 1, KERNBASE + (1 << PAGE_SHIFT)); 2237 res = vm86_getptr(&vmc, (vm_offset_t)smap, &vmf.vmf_es, &vmf.vmf_di); 2238 KASSERT(res != 0, ("vm86_getptr() failed: address not found")); 2239 2240 vmf.vmf_ebx = 0; 2241 do { 2242 vmf.vmf_eax = 0xE820; 2243 vmf.vmf_edx = SMAP_SIG; 2244 vmf.vmf_ecx = sizeof(struct bios_smap); 2245 i = vm86_datacall(0x15, &vmf, &vmc); 2246 if (i || vmf.vmf_eax != SMAP_SIG) 2247 break; 2248 has_smap = 1; 2249 if (!add_smap_entry(smap, physmap, &physmap_idx)) 2250 break; 2251 } while (vmf.vmf_ebx != 0); 2252 2253have_smap: 2254 /* 2255 * If we didn't fetch the "base memory" size from INT12, 2256 * figure it out from the SMAP (or just guess). 2257 */ 2258 if (basemem == 0) { 2259 for (i = 0; i <= physmap_idx; i += 2) { 2260 if (physmap[i] == 0x00000000) { 2261 basemem = physmap[i + 1] / 1024; 2262 break; 2263 } 2264 } 2265 2266 /* XXX: If we couldn't find basemem from SMAP, just guess.
*/ 2267 if (basemem == 0) 2268 basemem = 640; 2269 basemem_setup(); 2270 } 2271 2272 if (physmap[1] != 0) 2273 goto physmap_done; 2274 2275 /* 2276 * If we failed to find an SMAP, figure out the extended 2277 * memory size. We will then build a simple memory map with 2278 * two segments, one for "base memory" and the second for 2279 * "extended memory". Note that "extended memory" starts at a 2280 * physical address of 1MB and that both basemem and extmem 2281 * are in units of 1KB. 2282 * 2283 * First, try to fetch the extended memory size via INT 15:E801. 2284 */ 2285 vmf.vmf_ax = 0xE801; 2286 if (vm86_intcall(0x15, &vmf) == 0) { 2287 extmem = vmf.vmf_cx + vmf.vmf_dx * 64; 2288 } else { 2289 /* 2290 * If INT15:E801 fails, this is our last ditch effort 2291 * to determine the extended memory size. Currently 2292 * we prefer the RTC value over INT15:88. 2293 */ 2294#if 0 2295 vmf.vmf_ah = 0x88; 2296 vm86_intcall(0x15, &vmf); 2297 extmem = vmf.vmf_ax; 2298#else 2299 extmem = rtcin(RTC_EXTLO) + (rtcin(RTC_EXTHI) << 8); 2300#endif 2301 } 2302 2303 /* 2304 * Special hack for chipsets that still remap the 384k hole when 2305 * there's 16MB of memory - this really confuses people that 2306 * are trying to use bus mastering ISA controllers with the 2307 * "16MB limit"; they only have 16MB, but the remapping puts 2308 * them beyond the limit. 2309 * 2310 * If extended memory is between 15-16MB (16-17MB phys address range), 2311 * chop it to 15MB. 2312 */ 2313 if ((extmem > 15 * 1024) && (extmem < 16 * 1024)) 2314 extmem = 15 * 1024; 2315 2316 physmap[0] = 0; 2317 physmap[1] = basemem * 1024; 2318 physmap_idx = 2; 2319 physmap[physmap_idx] = 0x100000; 2320 physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024; 2321 2322physmap_done: 2323#endif 2324 /* 2325 * Now, physmap contains a map of physical memory.
2326 */ 2327 2328#ifdef SMP 2329 /* make hole for AP bootstrap code */ 2330 physmap[1] = mp_bootaddress(physmap[1]); 2331#endif 2332 2333 /* 2334 * Maxmem isn't the "maximum memory", it's one larger than the 2335 * highest page of the physical address space. It should be 2336 * called something like "Maxphyspage". We may adjust this 2337 * based on ``hw.physmem'' and the results of the memory test. 2338 */ 2339 Maxmem = atop(physmap[physmap_idx + 1]); 2340 2341#ifdef MAXMEM 2342 Maxmem = MAXMEM / 4; /* MAXMEM kernel option is in KB; /4 converts to 4 KB pages. */ 2343#endif 2344 2345 if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable)) 2346 Maxmem = atop(physmem_tunable); 2347 2348 /* 2349 * If we have an SMAP, don't allow MAXMEM or hw.physmem to extend 2350 * the amount of memory in the system. 2351 */ 2352 if (has_smap && Maxmem > atop(physmap[physmap_idx + 1])) 2353 Maxmem = atop(physmap[physmap_idx + 1]); 2354 2355 /* 2356 * By default enable the memory test on real hardware, and disable 2357 * it if we appear to be running in a VM. This avoids touching all 2358 * pages unnecessarily, which doesn't matter on real hardware but is 2359 * bad for shared VM hosts. Use a general name so that 2360 * one could eventually do more with the code than just disable it. 2361 */ 2362 memtest = (vm_guest > VM_GUEST_NO) ? 0 : 1; 2363 TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest); 2364 2365 if (atop(physmap[physmap_idx + 1]) != Maxmem && 2366 (boothowto & RB_VERBOSE)) 2367 printf("Physical memory use set to %ldK\n", Maxmem * 4); 2368 2369 /* 2370 * If Maxmem has been increased beyond what the system has detected, 2371 * extend the last memory segment to the new limit. 2372 */ 2373 if (atop(physmap[physmap_idx + 1]) < Maxmem) 2374 physmap[physmap_idx + 1] = ptoa((vm_paddr_t)Maxmem); 2375 2376 /* call pmap initialization to make new kernel address space */ 2377 pmap_bootstrap(first); 2378 2379 /* 2380 * Size up each available chunk of physical memory.
2381 */ 2382 physmap[0] = PAGE_SIZE; /* mask off page 0 */ 2383 pa_indx = 0; 2384 da_indx = 1; 2385 phys_avail[pa_indx++] = physmap[0]; 2386 phys_avail[pa_indx] = physmap[0]; 2387 dump_avail[da_indx] = physmap[0]; 2388 pte = CMAP3; 2389 2390 /* 2391 * Get dcons buffer address 2392 */ 2393 if (getenv_quad("dcons.addr", &dcons_addr) == 0 || 2394 getenv_quad("dcons.size", &dcons_size) == 0) 2395 dcons_addr = 0; 2396 2397#ifndef XEN 2398 /* 2399 * physmap is in bytes, so when converting to page boundaries, 2400 * round up the start address and round down the end address. 2401 */ 2402 for (i = 0; i <= physmap_idx; i += 2) { 2403 vm_paddr_t end; 2404 2405 end = ptoa((vm_paddr_t)Maxmem); 2406 if (physmap[i + 1] < end) 2407 end = trunc_page(physmap[i + 1]); 2408 for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) { 2409 int tmp, page_bad, full; 2410 int *ptr = (int *)CADDR3; 2411 2412 full = FALSE; 2413 /* 2414 * block out kernel memory as not available. 2415 */ 2416 if (pa >= KERNLOAD && pa < first) 2417 goto do_dump_avail; 2418 2419 /* 2420 * block out dcons buffer 2421 */ 2422 if (dcons_addr > 0 2423 && pa >= trunc_page(dcons_addr) 2424 && pa < dcons_addr + dcons_size) 2425 goto do_dump_avail; 2426 2427 page_bad = FALSE; 2428 if (memtest == 0) 2429 goto skip_memtest; 2430 2431 /* 2432 * map page into kernel: valid, read/write,non-cacheable 2433 */ 2434 *pte = pa | PG_V | PG_RW | PG_N; 2435 invltlb(); 2436 2437 tmp = *(int *)ptr; 2438 /* 2439 * Test for alternating 1's and 0's 2440 */ 2441 *(volatile int *)ptr = 0xaaaaaaaa; 2442 if (*(volatile int *)ptr != 0xaaaaaaaa) 2443 page_bad = TRUE; 2444 /* 2445 * Test for alternating 0's and 1's 2446 */ 2447 *(volatile int *)ptr = 0x55555555; 2448 if (*(volatile int *)ptr != 0x55555555) 2449 page_bad = TRUE; 2450 /* 2451 * Test for all 1's 2452 */ 2453 *(volatile int *)ptr = 0xffffffff; 2454 if (*(volatile int *)ptr != 0xffffffff) 2455 page_bad = TRUE; 2456 /* 2457 * Test for all 0's 2458 */ 2459 *(volatile int *)ptr =
0x0; 2460 if (*(volatile int *)ptr != 0x0) 2461 page_bad = TRUE; 2462 /* 2463 * Restore original value. 2464 */ 2465 *(int *)ptr = tmp; 2466 2467skip_memtest: 2468 /* 2469 * Adjust array of valid/good pages. 2470 */ 2471 if (page_bad == TRUE) 2472 continue; 2473 /* 2474 * If this good page is a continuation of the 2475 * previous set of good pages, then just increase 2476 * the end pointer. Otherwise start a new chunk. 2477 * Note that "end" points one higher than end, 2478 * making the range >= start and < end. 2479 * If we're also doing a speculative memory 2480 * test and we are at or past the end, bump up Maxmem 2481 * so that we keep going. The first bad page 2482 * will terminate the loop. 2483 */ 2484 if (phys_avail[pa_indx] == pa) { 2485 phys_avail[pa_indx] += PAGE_SIZE; 2486 } else { 2487 pa_indx++; 2488 if (pa_indx == PHYS_AVAIL_ARRAY_END) { 2489 printf( 2490 "Too many holes in the physical address space, giving up\n"); 2491 pa_indx--; 2492 full = TRUE; 2493 goto do_dump_avail; 2494 } 2495 phys_avail[pa_indx++] = pa; /* start */ 2496 phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */ 2497 } 2498 physmem++; 2499do_dump_avail: 2500 if (dump_avail[da_indx] == pa) { 2501 dump_avail[da_indx] += PAGE_SIZE; 2502 } else { 2503 da_indx++; 2504 if (da_indx == DUMP_AVAIL_ARRAY_END) { 2505 da_indx--; 2506 goto do_next; 2507 } 2508 dump_avail[da_indx++] = pa; /* start */ 2509 dump_avail[da_indx] = pa + PAGE_SIZE; /* end */ 2510 } 2511do_next: 2512 if (full) 2513 break; 2514 } 2515 } 2516 *pte = 0; 2517 invltlb(); 2518#else 2519 phys_avail[0] = physfree; 2520 phys_avail[1] = xen_start_info->nr_pages*PAGE_SIZE; 2521 dump_avail[0] = 0; 2522 dump_avail[1] = xen_start_info->nr_pages*PAGE_SIZE; 2523 2524#endif 2525 2526 /* 2527 * XXX 2528 * The last chunk must contain at least one page plus the message 2529 * buffer to avoid complicating other code (message buffer address 2530 * calculation, etc.).
2531 */ 2532 while (phys_avail[pa_indx - 1] + PAGE_SIZE + 2533 round_page(msgbufsize) >= phys_avail[pa_indx]) { 2534 physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]); 2535 phys_avail[pa_indx--] = 0; 2536 phys_avail[pa_indx--] = 0; 2537 } 2538 2539 Maxmem = atop(phys_avail[pa_indx]); 2540 2541 /* Trim off space for the message buffer. */ 2542 phys_avail[pa_indx] -= round_page(msgbufsize); 2543 2544 /* Map the message buffer. */ 2545 for (off = 0; off < round_page(msgbufsize); off += PAGE_SIZE) 2546 pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] + 2547 off); 2548 2549 PT_UPDATES_FLUSH(); 2550} 2551 2552#ifdef XEN /* Xen PV flavor of init386() follows; the native version is under the #else further down. */ 2553#define MTOPSIZE (1<<(14 + PAGE_SHIFT)) 2554 2555void 2556init386(first) 2557 int first; 2558{ 2559 unsigned long gdtmachpfn; 2560 int error, gsel_tss, metadata_missing, x, pa; 2561 size_t kstack0_sz; 2562 struct pcpu *pc; 2563 struct callback_register event = { 2564 .type = CALLBACKTYPE_event, 2565 .address = {GSEL(GCODE_SEL, SEL_KPL), (unsigned long)Xhypervisor_callback }, 2566 }; 2567 struct callback_register failsafe = { 2568 .type = CALLBACKTYPE_failsafe, 2569 .address = {GSEL(GCODE_SEL, SEL_KPL), (unsigned long)failsafe_callback }, 2570 }; 2571 2572 thread0.td_kstack = proc0kstack; 2573 thread0.td_kstack_pages = KSTACK_PAGES; 2574 kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE; 2575 thread0.td_pcb = (struct pcb *)(thread0.td_kstack + kstack0_sz) - 1; 2576 2577 /* 2578 * This may be done better later if it gets more high level 2579 * components in it. If so just link td->td_proc here.
2580 */ 2581 proc_linkup0(&proc0, &thread0); /* link thread0 into proc0 */ 2582 2583 metadata_missing = 0; 2584 if (xen_start_info->mod_start) { 2585 preload_metadata = (caddr_t)xen_start_info->mod_start; 2586 preload_bootstrap_relocate(KERNBASE); 2587 } else { 2588 metadata_missing = 1; 2589 } 2590 if (envmode == 1) 2591 kern_envp = static_env; 2592 else if ((caddr_t)xen_start_info->cmd_line) 2593 kern_envp = xen_setbootenv((caddr_t)xen_start_info->cmd_line); 2594 2595 boothowto |= xen_boothowto(kern_envp); 2596 2597 /* Init basic tunables, hz etc */ 2598 init_param1(); 2599 2600 /* 2601 * XEN occupies a portion of the upper virtual address space 2602 * At its base it manages an array mapping machine page frames 2603 * to physical page frames - hence we need to be able to 2604 * access 4GB - (64MB - 4MB + 64k) 2605 */ 2606 gdt_segs[GPRIV_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); 2607 gdt_segs[GUFS_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); 2608 gdt_segs[GUGS_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); 2609 gdt_segs[GCODE_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); 2610 gdt_segs[GDATA_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); 2611 gdt_segs[GUCODE_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); 2612 gdt_segs[GUDATA_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); 2613 gdt_segs[GBIOSLOWMEM_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); 2614 2615 pc = &__pcpu[0]; 2616 gdt_segs[GPRIV_SEL].ssd_base = (int) pc; 2617 gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss; 2618 2619 PT_SET_MA(gdt, xpmap_ptom(VTOP(gdt)) | PG_V | PG_RW); 2620 bzero(gdt, PAGE_SIZE); 2621 for (x = 0; x < NGDT; x++) 2622 ssdtosd(&gdt_segs[x], &gdt[x].sd); 2623 2624 mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN); 2625 2626 gdtmachpfn = vtomach(gdt) >> PAGE_SHIFT; 2627 PT_SET_MA(gdt, xpmap_ptom(VTOP(gdt)) | PG_V); 2628 PANIC_IF(HYPERVISOR_set_gdt(&gdtmachpfn, 512) != 0); 2629 lgdt(&r_gdt); 2630 gdtset = 1; 2631 2632 if
((error = HYPERVISOR_set_trap_table(trap_table)) != 0) { 2633 panic("set_trap_table failed - error %d\n", error); 2634 } 2635 2636 error = HYPERVISOR_callback_op(CALLBACKOP_register, &event); 2637 if (error == 0) 2638 error = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe); 2639#if CONFIG_XEN_COMPAT <= 0x030002 2640 if (error == -ENOXENSYS) 2641 HYPERVISOR_set_callbacks(GSEL(GCODE_SEL, SEL_KPL), 2642 (unsigned long)Xhypervisor_callback, 2643 GSEL(GCODE_SEL, SEL_KPL), (unsigned long)failsafe_callback); 2644#endif 2645 pcpu_init(pc, 0, sizeof(struct pcpu)); 2646 for (pa = first; pa < first + DPCPU_SIZE; pa += PAGE_SIZE) 2647 pmap_kenter(pa + KERNBASE, pa); 2648 dpcpu_init((void *)(first + KERNBASE), 0); 2649 first += DPCPU_SIZE; 2650 physfree += DPCPU_SIZE; 2651 init_first += DPCPU_SIZE / PAGE_SIZE; 2652 2653 PCPU_SET(prvspace, pc); 2654 PCPU_SET(curthread, &thread0); 2655 PCPU_SET(curpcb, thread0.td_pcb); 2656 2657 /* 2658 * Initialize mutexes. 2659 * 2660 * icu_lock: in order to allow an interrupt to occur in a critical 2661 * section, to set pcpu->ipending (etc...) properly, we 2662 * must be able to get the icu lock, so it can't be 2663 * under witness.
2664 */ 2665 mutex_init(); 2666 mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE); 2667 2668 /* make ldt memory segments */ 2669 PT_SET_MA(ldt, xpmap_ptom(VTOP(ldt)) | PG_V | PG_RW); 2670 bzero(ldt, PAGE_SIZE); 2671 ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1); 2672 ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1); 2673 for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++) 2674 ssdtosd(&ldt_segs[x], &ldt[x].sd); 2675 2676 default_proc_ldt.ldt_base = (caddr_t)ldt; 2677 default_proc_ldt.ldt_len = 6; 2678 _default_ldt = (int)&default_proc_ldt; 2679 PCPU_SET(currentldt, _default_ldt); 2680 PT_SET_MA(ldt, *vtopte((unsigned long)ldt) & ~PG_RW); 2681 xen_set_ldt((unsigned long) ldt, (sizeof ldt_segs / sizeof ldt_segs[0])); 2682 2683#if defined(XEN_PRIVILEGED) 2684 /* 2685 * Initialize the i8254 before the console so that console 2686 * initialization can use DELAY(). 2687 */ 2688 i8254_init(); 2689#endif 2690 2691 /* 2692 * Initialize the console before we print anything out. 2693 */ 2694 cninit(); 2695 2696 if (metadata_missing) 2697 printf("WARNING: loader(8) metadata is missing!\n"); 2698 2699#ifdef DEV_ISA 2700#ifdef DEV_ATPIC 2701 elcr_probe(); 2702 atpic_startup(); 2703#else 2704 /* Reset and mask the atpics and leave them shut down. */ 2705 atpic_reset(); 2706 2707 /* 2708 * Point the ICU spurious interrupt vectors at the APIC spurious 2709 * interrupt handler.
2710 */ 2711 setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL, 2712 GSEL(GCODE_SEL, SEL_KPL)); 2713 setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL, 2714 GSEL(GCODE_SEL, SEL_KPL)); 2715#endif 2716#endif 2717 2718#ifdef DDB 2719 ksym_start = bootinfo.bi_symtab; 2720 ksym_end = bootinfo.bi_esymtab; 2721#endif 2722 2723 kdb_init(); 2724 2725#ifdef KDB 2726 if (boothowto & RB_KDB) 2727 kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger"); 2728#endif 2729 2730 finishidentcpu(); /* Final stage of CPU initialization */ 2731 setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, 2732 GSEL(GCODE_SEL, SEL_KPL)); 2733 setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, 2734 GSEL(GCODE_SEL, SEL_KPL)); 2735 initializecpu(); /* Initialize CPU registers */ 2736 2737 /* make an initial tss so cpu can get interrupt stack on syscall! */ 2738 /* Note: -16 is so we can grow the trapframe if we came from vm86 */ 2739 PCPU_SET(common_tss.tss_esp0, thread0.td_kstack + 2740 kstack0_sz - sizeof(struct pcb) - 16); 2741 PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL)); 2742 gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); 2743 HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL), 2744 PCPU_GET(common_tss.tss_esp0)); 2745 2746 /* pointer to selector slot for %fs/%gs */ 2747 PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd); 2748 2749 dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 = 2750 dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)]; 2751 dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 = 2752 dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL); 2753#ifdef PAE 2754 dblfault_tss.tss_cr3 = (int)IdlePDPT; 2755#else 2756 dblfault_tss.tss_cr3 = (int)IdlePTD; 2757#endif 2758 dblfault_tss.tss_eip = (int)dblfault_handler; 2759 dblfault_tss.tss_eflags = PSL_KERNEL; 2760 dblfault_tss.tss_ds = dblfault_tss.tss_es = 2761 dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL); 2762 dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL);
2763 dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL); 2764 dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL); 2765 2766 vm86_initialize(); 2767 getmemsize(first); 2768 init_param2(physmem); 2769 2770 /* now running on new page tables, configured, and u/iom is accessible */ 2771 2772 msgbufinit(msgbufp, msgbufsize); 2773 /* transfer to user mode */ 2774 2775 _ucodesel = GSEL(GUCODE_SEL, SEL_UPL); 2776 _udatasel = GSEL(GUDATA_SEL, SEL_UPL); 2777 2778 /* setup proc 0's pcb */ 2779 thread0.td_pcb->pcb_flags = 0; 2780#ifdef PAE 2781 thread0.td_pcb->pcb_cr3 = (int)IdlePDPT; 2782#else 2783 thread0.td_pcb->pcb_cr3 = (int)IdlePTD; 2784#endif 2785 thread0.td_pcb->pcb_ext = 0; 2786 thread0.td_frame = &proc0_tf; 2787 thread0.td_pcb->pcb_fsd = PCPU_GET(fsgs_gdt)[0]; 2788 thread0.td_pcb->pcb_gsd = PCPU_GET(fsgs_gdt)[1]; 2789 2790 cpu_probe_amdc1e(); 2791} 2792 2793#else /* Native (non-Xen) init386(): early machine-dependent setup for bare-metal i386. */ 2794void 2795init386(first) 2796 int first; 2797{ 2798 struct gate_descriptor *gdp; 2799 int gsel_tss, metadata_missing, x, pa; 2800 size_t kstack0_sz; 2801 struct pcpu *pc; 2802 2803 thread0.td_kstack = proc0kstack; 2804 thread0.td_kstack_pages = KSTACK_PAGES; 2805 kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE; 2806 thread0.td_pcb = (struct pcb *)(thread0.td_kstack + kstack0_sz) - 1; 2807 2808 /* 2809 * This may be done better later if it gets more high level 2810 * components in it. If so just link td->td_proc here. 2811 */ 2812 proc_linkup0(&proc0, &thread0); 2813 2814 metadata_missing = 0; 2815 if (bootinfo.bi_modulep) { 2816 preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE; 2817 preload_bootstrap_relocate(KERNBASE); 2818 } else { 2819 metadata_missing = 1; 2820 } 2821 if (envmode == 1) 2822 kern_envp = static_env; 2823 else if (bootinfo.bi_envp) 2824 kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE; 2825 2826 /* Init basic tunables, hz etc */ 2827 init_param1(); 2828 2829 /* 2830 * Make gdt memory segments.
All segments cover the full 4GB 2831 * of address space and permissions are enforced at page level. 2832 */ 2833 gdt_segs[GCODE_SEL].ssd_limit = atop(0 - 1); 2834 gdt_segs[GDATA_SEL].ssd_limit = atop(0 - 1); 2835 gdt_segs[GUCODE_SEL].ssd_limit = atop(0 - 1); 2836 gdt_segs[GUDATA_SEL].ssd_limit = atop(0 - 1); 2837 gdt_segs[GUFS_SEL].ssd_limit = atop(0 - 1); 2838 gdt_segs[GUGS_SEL].ssd_limit = atop(0 - 1); 2839 2840 pc = &__pcpu[0]; 2841 gdt_segs[GPRIV_SEL].ssd_limit = atop(0 - 1); 2842 gdt_segs[GPRIV_SEL].ssd_base = (int) pc; 2843 gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss; 2844 2845 for (x = 0; x < NGDT; x++) 2846 ssdtosd(&gdt_segs[x], &gdt[x].sd); 2847 2848 r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; 2849 r_gdt.rd_base = (int) gdt; 2850 mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN); 2851 lgdt(&r_gdt); 2852 2853 pcpu_init(pc, 0, sizeof(struct pcpu)); 2854 for (pa = first; pa < first + DPCPU_SIZE; pa += PAGE_SIZE) 2855 pmap_kenter(pa + KERNBASE, pa); 2856 dpcpu_init((void *)(first + KERNBASE), 0); 2857 first += DPCPU_SIZE; 2858 PCPU_SET(prvspace, pc); 2859 PCPU_SET(curthread, &thread0); 2860 PCPU_SET(curpcb, thread0.td_pcb); 2861 2862 /* 2863 * Initialize mutexes. 2864 * 2865 * icu_lock: in order to allow an interrupt to occur in a critical 2866 * section, to set pcpu->ipending (etc...) properly, we 2867 * must be able to get the icu lock, so it can't be 2868 * under witness.
2869 */ 2870 mutex_init(); 2871 mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE); 2872 2873 /* make ldt memory segments */ 2874 ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1); 2875 ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1); 2876 for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++) 2877 ssdtosd(&ldt_segs[x], &ldt[x].sd); 2878 2879 _default_ldt = GSEL(GLDT_SEL, SEL_KPL); 2880 lldt(_default_ldt); 2881 PCPU_SET(currentldt, _default_ldt); 2882 2883 /* exceptions */ 2884 for (x = 0; x < NIDT; x++) 2885 setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL, 2886 GSEL(GCODE_SEL, SEL_KPL)); 2887 setidt(IDT_DE, &IDTVEC(div), SDT_SYS386TGT, SEL_KPL, 2888 GSEL(GCODE_SEL, SEL_KPL)); 2889 setidt(IDT_DB, &IDTVEC(dbg), SDT_SYS386IGT, SEL_KPL, 2890 GSEL(GCODE_SEL, SEL_KPL)); 2891 setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYS386IGT, SEL_KPL, 2892 GSEL(GCODE_SEL, SEL_KPL)); 2893 setidt(IDT_BP, &IDTVEC(bpt), SDT_SYS386IGT, SEL_UPL, 2894 GSEL(GCODE_SEL, SEL_KPL)); 2895 setidt(IDT_OF, &IDTVEC(ofl), SDT_SYS386TGT, SEL_UPL, 2896 GSEL(GCODE_SEL, SEL_KPL)); 2897 setidt(IDT_BR, &IDTVEC(bnd), SDT_SYS386TGT, SEL_KPL, 2898 GSEL(GCODE_SEL, SEL_KPL)); 2899 setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, 2900 GSEL(GCODE_SEL, SEL_KPL)); 2901 setidt(IDT_NM, &IDTVEC(dna), SDT_SYS386TGT, SEL_KPL 2902 , GSEL(GCODE_SEL, SEL_KPL)); 2903 setidt(IDT_DF, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL)); 2904 setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYS386TGT, SEL_KPL, 2905 GSEL(GCODE_SEL, SEL_KPL)); 2906 setidt(IDT_TS, &IDTVEC(tss), SDT_SYS386TGT, SEL_KPL, 2907 GSEL(GCODE_SEL, SEL_KPL)); 2908 setidt(IDT_NP, &IDTVEC(missing), SDT_SYS386TGT, SEL_KPL, 2909 GSEL(GCODE_SEL, SEL_KPL)); 2910 setidt(IDT_SS, &IDTVEC(stk), SDT_SYS386TGT, SEL_KPL, 2911 GSEL(GCODE_SEL, SEL_KPL)); 2912 setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, 2913 GSEL(GCODE_SEL, SEL_KPL)); 2914 setidt(IDT_PF, &IDTVEC(page), SDT_SYS386IGT, SEL_KPL, 2915 GSEL(GCODE_SEL, SEL_KPL)); 2916 setidt(IDT_MF, &IDTVEC(fpu),
SDT_SYS386TGT, SEL_KPL, 2917 GSEL(GCODE_SEL, SEL_KPL)); 2918 setidt(IDT_AC, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL, 2919 GSEL(GCODE_SEL, SEL_KPL)); 2920 setidt(IDT_MC, &IDTVEC(mchk), SDT_SYS386TGT, SEL_KPL, 2921 GSEL(GCODE_SEL, SEL_KPL)); 2922 setidt(IDT_XF, &IDTVEC(xmm), SDT_SYS386TGT, SEL_KPL, 2923 GSEL(GCODE_SEL, SEL_KPL)); 2924 setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), SDT_SYS386TGT, SEL_UPL, 2925 GSEL(GCODE_SEL, SEL_KPL)); 2926#ifdef KDTRACE_HOOKS 2927 setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret), SDT_SYS386TGT, SEL_UPL, 2928 GSEL(GCODE_SEL, SEL_KPL)); 2929#endif 2930#ifdef XENHVM 2931 setidt(IDT_EVTCHN, &IDTVEC(xen_intr_upcall), SDT_SYS386IGT, SEL_UPL, 2932 GSEL(GCODE_SEL, SEL_KPL)); 2933#endif 2934 2935 r_idt.rd_limit = sizeof(idt0) - 1; 2936 r_idt.rd_base = (int) idt; 2937 lidt(&r_idt); 2938 2939#ifdef XBOX 2940 /* 2941 * The following code queries the PCI ID of 0:0:0. For the XBOX, 2942 * This should be 0x10de / 0x02a5. 2943 * 2944 * This is exactly what Linux does. 2945 */ 2946 outl(0xcf8, 0x80000000); 2947 if (inl(0xcfc) == 0x02a510de) { 2948 arch_i386_is_xbox = 1; 2949 pic16l_setled(XBOX_LED_GREEN); 2950 2951 /* 2952 * We are an XBOX, but we may have either 64MB or 128MB of 2953 * memory. The PCI host bridge should be programmed for this, 2954 * so we just query it. 2955 */ 2956 outl(0xcf8, 0x80000084); 2957 arch_i386_xbox_memsize = (inl(0xcfc) == 0x7FFFFFF) ? 128 : 64; 2958 } 2959#endif /* XBOX */ 2960 2961 /* 2962 * Initialize the i8254 before the console so that console 2963 * initialization can use DELAY(). 2964 */ 2965 i8254_init(); 2966 2967 /* 2968 * Initialize the console before we print anything out. 2969 */ 2970 cninit(); 2971 2972 if (metadata_missing) 2973 printf("WARNING: loader(8) metadata is missing!\n"); 2974 2975#ifdef DEV_ISA 2976#ifdef DEV_ATPIC 2977 elcr_probe(); 2978 atpic_startup(); 2979#else 2980 /* Reset and mask the atpics and leave them shut down.
*/ 2981 atpic_reset(); 2982 2983 /* 2984 * Point the ICU spurious interrupt vectors at the APIC spurious 2985 * interrupt handler. 2986 */ 2987 setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL, 2988 GSEL(GCODE_SEL, SEL_KPL)); 2989 setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL, 2990 GSEL(GCODE_SEL, SEL_KPL)); 2991#endif 2992#endif 2993 2994#ifdef DDB 2995 ksym_start = bootinfo.bi_symtab; 2996 ksym_end = bootinfo.bi_esymtab; 2997#endif 2998 2999 kdb_init(); 3000 3001#ifdef KDB 3002 if (boothowto & RB_KDB) 3003 kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger"); 3004#endif 3005 3006 finishidentcpu(); /* Final stage of CPU initialization */ 3007 setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, 3008 GSEL(GCODE_SEL, SEL_KPL)); 3009 setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, 3010 GSEL(GCODE_SEL, SEL_KPL)); 3011 initializecpu(); /* Initialize CPU registers */ 3012 3013 /* make an initial tss so cpu can get interrupt stack on syscall!
*/ 3014 /* Note: -16 is so we can grow the trapframe if we came from vm86 */ 3015 PCPU_SET(common_tss.tss_esp0, thread0.td_kstack + 3016 kstack0_sz - sizeof(struct pcb) - 16); 3017 PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL)); 3018 gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); 3019 PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd); 3020 PCPU_SET(common_tssd, *PCPU_GET(tss_gdt)); 3021 PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16); 3022 ltr(gsel_tss); 3023 3024 /* pointer to selector slot for %fs/%gs */ 3025 PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd); 3026 3027 dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 = 3028 dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)]; 3029 dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 = 3030 dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL); 3031#ifdef PAE 3032 dblfault_tss.tss_cr3 = (int)IdlePDPT; 3033#else 3034 dblfault_tss.tss_cr3 = (int)IdlePTD; 3035#endif 3036 dblfault_tss.tss_eip = (int)dblfault_handler; 3037 dblfault_tss.tss_eflags = PSL_KERNEL; 3038 dblfault_tss.tss_ds = dblfault_tss.tss_es = 3039 dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL); 3040 dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL); 3041 dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL); 3042 dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL); 3043 3044 vm86_initialize(); 3045 getmemsize(first); 3046 init_param2(physmem); 3047 3048 /* now running on new page tables, configured, and u/iom is accessible */ 3049 3050 msgbufinit(msgbufp, msgbufsize); 3051 3052 /* make a call gate to reenter kernel with */ 3053 gdp = &ldt[LSYS5CALLS_SEL].gd; 3054 3055 x = (int) &IDTVEC(lcall_syscall); 3056 gdp->gd_looffset = x; 3057 gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL); 3058 gdp->gd_stkcpy = 1; 3059 gdp->gd_type = SDT_SYS386CGT; 3060 gdp->gd_dpl = SEL_UPL; 3061 gdp->gd_p = 1; 3062 gdp->gd_hioffset = x >> 16; 3063 3064 /* XXX does this work? */ 3065 /* XXX yes!
*/ 3066 ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL]; 3067 ldt[LSOL26CALLS_SEL] = ldt[LSYS5CALLS_SEL]; 3068 3069 /* transfer to user mode */ 3070 3071 _ucodesel = GSEL(GUCODE_SEL, SEL_UPL); 3072 _udatasel = GSEL(GUDATA_SEL, SEL_UPL); 3073 3074 /* setup proc 0's pcb */ 3075 thread0.td_pcb->pcb_flags = 0; 3076#ifdef PAE 3077 thread0.td_pcb->pcb_cr3 = (int)IdlePDPT; 3078#else 3079 thread0.td_pcb->pcb_cr3 = (int)IdlePTD; 3080#endif 3081 thread0.td_pcb->pcb_ext = 0; 3082 thread0.td_frame = &proc0_tf; 3083 3084 cpu_probe_amdc1e(); 3085 3086#ifdef FDT 3087 x86_init_fdt(); 3088#endif 3089} 3090#endif 3091 3092void 3093cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) 3094{ 3095 3096 pcpu->pc_acpi_id = 0xffffffff; 3097} /* Export the raw BIOS SMAP (with optional extended attributes) via the machdep.smap sysctl below. */ 3098 3099static int 3100smap_sysctl_handler(SYSCTL_HANDLER_ARGS) 3101{ 3102 struct bios_smap *smapbase; 3103 struct bios_smap_xattr smap; 3104 caddr_t kmdp; 3105 uint32_t *smapattr; 3106 int count, error, i; 3107 3108 /* Retrieve the system memory map from the loader. */ 3109 kmdp = preload_search_by_type("elf kernel"); 3110 if (kmdp == NULL) 3111 kmdp = preload_search_by_type("elf32 kernel"); 3112 if (kmdp == NULL) 3113 return (0); 3114 smapbase = (struct bios_smap *)preload_search_info(kmdp, 3115 MODINFO_METADATA | MODINFOMD_SMAP); 3116 if (smapbase == NULL) 3117 return (0); 3118 smapattr = (uint32_t *)preload_search_info(kmdp, 3119 MODINFO_METADATA | MODINFOMD_SMAP_XATTR); 3120 count = *((u_int32_t *)smapbase - 1) / sizeof(*smapbase); 3121 error = 0; 3122 for (i = 0; i < count; i++) { 3123 smap.base = smapbase[i].base; 3124 smap.length = smapbase[i].length; 3125 smap.type = smapbase[i].type; 3126 if (smapattr != NULL) 3127 smap.xattr = smapattr[i]; 3128 else 3129 smap.xattr = 0; 3130 error = SYSCTL_OUT(req, &smap, sizeof(smap)); 3131 } 3132 return (error); 3133} 3134SYSCTL_PROC(_machdep, OID_AUTO, smap, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0, 3135 smap_sysctl_handler, "S,bios_smap_xattr", "Raw BIOS SMAP data"); 3136 3137void 3138spinlock_enter(void) 3139{
3140 struct thread *td; 3141 register_t flags; 3142 3143 td = curthread; 3144 if (td->td_md.md_spinlock_count == 0) { 3145 flags = intr_disable(); 3146 td->td_md.md_spinlock_count = 1; 3147 td->td_md.md_saved_flags = flags; 3148 } else 3149 td->td_md.md_spinlock_count++; 3150 critical_enter(); 3151} 3152 3153void 3154spinlock_exit(void) 3155{ 3156 struct thread *td; 3157 register_t flags; 3158 3159 td = curthread; 3160 critical_exit(); 3161 flags = td->td_md.md_saved_flags; 3162 td->td_md.md_spinlock_count--; 3163 if (td->td_md.md_spinlock_count == 0) 3164 intr_restore(flags); 3165} /* Interrupt state saved on first acquire is restored only when the outermost spinlock is released. */ 3166 3167#if defined(I586_CPU) && !defined(NO_F00F_HACK) 3168static void f00f_hack(void *unused); 3169SYSINIT(f00f_hack, SI_SUB_INTRINSIC, SI_ORDER_FIRST, f00f_hack, NULL); 3170 3171static void 3172f00f_hack(void *unused) 3173{ 3174 struct gate_descriptor *new_idt; 3175 vm_offset_t tmp; 3176 3177 if (!has_f00f_bug) 3178 return; 3179 3180 GIANT_REQUIRED; 3181 3182 printf("Intel Pentium detected, installing workaround for F00F bug\n"); 3183 3184 tmp = kmem_malloc(kernel_arena, PAGE_SIZE * 2, M_WAITOK | M_ZERO); 3185 if (tmp == 0) 3186 panic("kmem_malloc returned 0"); 3187 3188 /* Put the problematic entry (#6) at the end of the lower page. */ 3189 new_idt = (struct gate_descriptor*) 3190 (tmp + PAGE_SIZE - 7 * sizeof(struct gate_descriptor)); 3191 bcopy(idt, new_idt, sizeof(idt0)); 3192 r_idt.rd_base = (u_int)new_idt; 3193 lidt(&r_idt); 3194 idt = new_idt; 3195 pmap_protect(kernel_pmap, tmp, tmp + PAGE_SIZE, VM_PROT_READ); 3196} 3197#endif /* defined(I586_CPU) && !NO_F00F_HACK */ 3198 3199/* 3200 * Construct a PCB from a trapframe. This is called from kdb_trap() where 3201 * we want to start a backtrace from the function that caused us to enter 3202 * the debugger. We have the context in the trapframe, but base the trace 3203 * on the PCB. The PCB doesn't have to be perfect, as long as it contains 3204 * enough for a backtrace.
3205 */ 3206void 3207makectx(struct trapframe *tf, struct pcb *pcb) 3208{ 3209 3210 pcb->pcb_edi = tf->tf_edi; 3211 pcb->pcb_esi = tf->tf_esi; 3212 pcb->pcb_ebp = tf->tf_ebp; 3213 pcb->pcb_ebx = tf->tf_ebx; 3214 pcb->pcb_eip = tf->tf_eip; 3215 pcb->pcb_esp = (ISPL(tf->tf_cs)) ? tf->tf_esp : (int)(tf + 1) - 8; 3216} 3217 3218int 3219ptrace_set_pc(struct thread *td, u_long addr) 3220{ 3221 3222 td->td_frame->tf_eip = addr; 3223 return (0); 3224} 3225 3226int 3227ptrace_single_step(struct thread *td) 3228{ 3229 td->td_frame->tf_eflags |= PSL_T; 3230 return (0); 3231} 3232 3233int 3234ptrace_clear_single_step(struct thread *td) 3235{ 3236 td->td_frame->tf_eflags &= ~PSL_T; 3237 return (0); 3238} 3239 3240int 3241fill_regs(struct thread *td, struct reg *regs) 3242{ 3243 struct pcb *pcb; 3244 struct trapframe *tp; 3245 3246 tp = td->td_frame; 3247 pcb = td->td_pcb; 3248 regs->r_gs = pcb->pcb_gs; 3249 return (fill_frame_regs(tp, regs)); 3250} 3251 3252int 3253fill_frame_regs(struct trapframe *tp, struct reg *regs) 3254{ 3255 regs->r_fs = tp->tf_fs; 3256 regs->r_es = tp->tf_es; 3257 regs->r_ds = tp->tf_ds; 3258 regs->r_edi = tp->tf_edi; 3259 regs->r_esi = tp->tf_esi; 3260 regs->r_ebp = tp->tf_ebp; 3261 regs->r_ebx = tp->tf_ebx; 3262 regs->r_edx = tp->tf_edx; 3263 regs->r_ecx = tp->tf_ecx; 3264 regs->r_eax = tp->tf_eax; 3265 regs->r_eip = tp->tf_eip; 3266 regs->r_cs = tp->tf_cs; 3267 regs->r_eflags = tp->tf_eflags; 3268 regs->r_esp = tp->tf_esp; 3269 regs->r_ss = tp->tf_ss; 3270 return (0); 3271} 3272 3273int 3274set_regs(struct thread *td, struct reg *regs) 3275{ 3276 struct pcb *pcb; 3277 struct trapframe *tp; 3278 3279 tp = td->td_frame; 3280 if (!EFL_SECURE(regs->r_eflags, tp->tf_eflags) || 3281 !CS_SECURE(regs->r_cs)) 3282 return (EINVAL); 3283 pcb = td->td_pcb; 3284 tp->tf_fs = regs->r_fs; 3285 tp->tf_es = regs->r_es; 3286 tp->tf_ds = regs->r_ds; 3287 tp->tf_edi = regs->r_edi; 3288 tp->tf_esi = regs->r_esi; 3289 tp->tf_ebp = regs->r_ebp; 3290 tp->tf_ebx = regs->r_ebx;
3291 tp->tf_edx = regs->r_edx; 3292 tp->tf_ecx = regs->r_ecx; 3293 tp->tf_eax = regs->r_eax; 3294 tp->tf_eip = regs->r_eip; 3295 tp->tf_cs = regs->r_cs; 3296 tp->tf_eflags = regs->r_eflags; 3297 tp->tf_esp = regs->r_esp; 3298 tp->tf_ss = regs->r_ss; 3299 pcb->pcb_gs = regs->r_gs; 3300 return (0); 3301} 3302 3303#ifdef CPU_ENABLE_SSE /* Translate between the FXSAVE (savexmm) and legacy FSAVE (save87) FPU state layouts. */ 3304static void 3305fill_fpregs_xmm(sv_xmm, sv_87) 3306 struct savexmm *sv_xmm; 3307 struct save87 *sv_87; 3308{ 3309 register struct env87 *penv_87 = &sv_87->sv_env; 3310 register struct envxmm *penv_xmm = &sv_xmm->sv_env; 3311 int i; 3312 3313 bzero(sv_87, sizeof(*sv_87)); 3314 3315 /* FPU control/status */ 3316 penv_87->en_cw = penv_xmm->en_cw; 3317 penv_87->en_sw = penv_xmm->en_sw; 3318 penv_87->en_tw = penv_xmm->en_tw; 3319 penv_87->en_fip = penv_xmm->en_fip; 3320 penv_87->en_fcs = penv_xmm->en_fcs; 3321 penv_87->en_opcode = penv_xmm->en_opcode; 3322 penv_87->en_foo = penv_xmm->en_foo; 3323 penv_87->en_fos = penv_xmm->en_fos; 3324 3325 /* FPU registers */ 3326 for (i = 0; i < 8; ++i) 3327 sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc; 3328} 3329 3330static void 3331set_fpregs_xmm(sv_87, sv_xmm) 3332 struct save87 *sv_87; 3333 struct savexmm *sv_xmm; 3334{ 3335 register struct env87 *penv_87 = &sv_87->sv_env; 3336 register struct envxmm *penv_xmm = &sv_xmm->sv_env; 3337 int i; 3338 3339 /* FPU control/status */ 3340 penv_xmm->en_cw = penv_87->en_cw; 3341 penv_xmm->en_sw = penv_87->en_sw; 3342 penv_xmm->en_tw = penv_87->en_tw; 3343 penv_xmm->en_fip = penv_87->en_fip; 3344 penv_xmm->en_fcs = penv_87->en_fcs; 3345 penv_xmm->en_opcode = penv_87->en_opcode; 3346 penv_xmm->en_foo = penv_87->en_foo; 3347 penv_xmm->en_fos = penv_87->en_fos; 3348 3349 /* FPU registers */ 3350 for (i = 0; i < 8; ++i) 3351 sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i]; 3352} 3353#endif /* CPU_ENABLE_SSE */ 3354 3355int 3356fill_fpregs(struct thread *td, struct fpreg *fpregs) 3357{ 3358 3359 KASSERT(td == curthread || TD_IS_SUSPENDED(td) || 3360
P_SHOULDSTOP(td->td_proc), 3361 ("not suspended thread %p", td)); 3362#ifdef DEV_NPX 3363 npxgetregs(td); 3364#else 3365 bzero(fpregs, sizeof(*fpregs)); 3366#endif 3367#ifdef CPU_ENABLE_SSE 3368 if (cpu_fxsr) 3369 fill_fpregs_xmm(&td->td_pcb->pcb_user_save.sv_xmm, 3370 (struct save87 *)fpregs); 3371 else 3372#endif /* CPU_ENABLE_SSE */ 3373 bcopy(&td->td_pcb->pcb_user_save.sv_87, fpregs, 3374 sizeof(*fpregs)); 3375 return (0); 3376} 3377 3378int 3379set_fpregs(struct thread *td, struct fpreg *fpregs) 3380{ 3381 3382#ifdef CPU_ENABLE_SSE 3383 if (cpu_fxsr) 3384 set_fpregs_xmm((struct save87 *)fpregs, 3385 &td->td_pcb->pcb_user_save.sv_xmm); 3386 else 3387#endif /* CPU_ENABLE_SSE */ 3388 bcopy(fpregs, &td->td_pcb->pcb_user_save.sv_87, 3389 sizeof(*fpregs)); 3390#ifdef DEV_NPX 3391 npxuserinited(td); 3392#endif 3393 return (0); 3394} 3395 3396/* 3397 * Get machine context. 3398 */ 3399int 3400get_mcontext(struct thread *td, mcontext_t *mcp, int flags) 3401{ 3402 struct trapframe *tp; 3403 struct segment_descriptor *sdp; 3404 3405 tp = td->td_frame; 3406 3407 PROC_LOCK(curthread->td_proc); 3408 mcp->mc_onstack = sigonstack(tp->tf_esp); 3409 PROC_UNLOCK(curthread->td_proc); 3410 mcp->mc_gs = td->td_pcb->pcb_gs; 3411 mcp->mc_fs = tp->tf_fs; 3412 mcp->mc_es = tp->tf_es; 3413 mcp->mc_ds = tp->tf_ds; 3414 mcp->mc_edi = tp->tf_edi; 3415 mcp->mc_esi = tp->tf_esi; 3416 mcp->mc_ebp = tp->tf_ebp; 3417 mcp->mc_isp = tp->tf_isp; 3418 mcp->mc_eflags = tp->tf_eflags; 3419 if (flags & GET_MC_CLEAR_RET) { 3420 mcp->mc_eax = 0; 3421 mcp->mc_edx = 0; 3422 mcp->mc_eflags &= ~PSL_C; 3423 } else { 3424 mcp->mc_eax = tp->tf_eax; 3425 mcp->mc_edx = tp->tf_edx; 3426 } 3427 mcp->mc_ebx = tp->tf_ebx; 3428 mcp->mc_ecx = tp->tf_ecx; 3429 mcp->mc_eip = tp->tf_eip; 3430 mcp->mc_cs = tp->tf_cs; 3431 mcp->mc_esp = tp->tf_esp; 3432 mcp->mc_ss = tp->tf_ss; 3433 mcp->mc_len = sizeof(*mcp); 3434 get_fpcontext(td, mcp); 3435 sdp = &td->td_pcb->pcb_fsd; 3436 mcp->mc_fsbase = sdp->sd_hibase << 24 |
sdp->sd_lobase; 3437 sdp = &td->td_pcb->pcb_gsd; 3438 mcp->mc_gsbase = sdp->sd_hibase << 24 | sdp->sd_lobase; 3439 mcp->mc_flags = 0; 3440 bzero(mcp->mc_spare2, sizeof(mcp->mc_spare2)); 3441 return (0); 3442} 3443 3444/* 3445 * Set machine context. 3446 * 3447 * However, we don't set any but the user modifiable flags, and we won't 3448 * touch the cs selector. 3449 */ 3450int 3451set_mcontext(struct thread *td, const mcontext_t *mcp) 3452{ 3453 struct trapframe *tp; 3454 int eflags, ret; 3455 3456 tp = td->td_frame; 3457 if (mcp->mc_len != sizeof(*mcp)) 3458 return (EINVAL); 3459 eflags = (mcp->mc_eflags & PSL_USERCHANGE) | 3460 (tp->tf_eflags & ~PSL_USERCHANGE); 3461 if ((ret = set_fpcontext(td, mcp)) == 0) { 3462 tp->tf_fs = mcp->mc_fs; 3463 tp->tf_es = mcp->mc_es; 3464 tp->tf_ds = mcp->mc_ds; 3465 tp->tf_edi = mcp->mc_edi; 3466 tp->tf_esi = mcp->mc_esi; 3467 tp->tf_ebp = mcp->mc_ebp; 3468 tp->tf_ebx = mcp->mc_ebx; 3469 tp->tf_edx = mcp->mc_edx; 3470 tp->tf_ecx = mcp->mc_ecx; 3471 tp->tf_eax = mcp->mc_eax; 3472 tp->tf_eip = mcp->mc_eip; 3473 tp->tf_eflags = eflags; 3474 tp->tf_esp = mcp->mc_esp; 3475 tp->tf_ss = mcp->mc_ss; 3476 td->td_pcb->pcb_gs = mcp->mc_gs; 3477 ret = 0; 3478 } 3479 return (ret); 3480} 3481 3482static void 3483get_fpcontext(struct thread *td, mcontext_t *mcp) 3484{ 3485 3486#ifndef DEV_NPX 3487 mcp->mc_fpformat = _MC_FPFMT_NODEV; 3488 mcp->mc_ownedfp = _MC_FPOWNED_NONE; 3489 bzero(mcp->mc_fpstate, sizeof(mcp->mc_fpstate)); 3490#else 3491 mcp->mc_ownedfp = npxgetregs(td); 3492 bcopy(&td->td_pcb->pcb_user_save, &mcp->mc_fpstate[0], 3493 sizeof(mcp->mc_fpstate)); 3494 mcp->mc_fpformat = npxformat(); 3495#endif 3496} 3497 3498static int 3499set_fpcontext(struct thread *td, const mcontext_t *mcp) 3500{ 3501 3502 if (mcp->mc_fpformat == _MC_FPFMT_NODEV) 3503 return (0); 3504 else if (mcp->mc_fpformat != _MC_FPFMT_387 && 3505 mcp->mc_fpformat != _MC_FPFMT_XMM) 3506 return (EINVAL); 3507 else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) 3508 /* We don't
care what state is left in the FPU or PCB. */ 3509 fpstate_drop(td); 3510 else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU || 3511 mcp->mc_ownedfp == _MC_FPOWNED_PCB) { 3512#ifdef DEV_NPX 3513#ifdef CPU_ENABLE_SSE 3514 if (cpu_fxsr) 3515 ((union savefpu *)&mcp->mc_fpstate)->sv_xmm.sv_env. 3516 en_mxcsr &= cpu_mxcsr_mask; 3517#endif 3518 npxsetregs(td, (union savefpu *)&mcp->mc_fpstate); 3519#endif 3520 } else 3521 return (EINVAL); 3522 return (0); 3523} 3524 3525static void 3526fpstate_drop(struct thread *td) 3527{ 3528 3529 KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu")); 3530 critical_enter(); 3531#ifdef DEV_NPX 3532 if (PCPU_GET(fpcurthread) == td) 3533 npxdrop(); 3534#endif 3535 /* 3536 * XXX force a full drop of the npx. The above only drops it if we 3537 * owned it. npxgetregs() has the same bug in the !cpu_fxsr case. 3538 * 3539 * XXX I don't much like npxgetregs()'s semantics of doing a full 3540 * drop. Dropping only to the pcb matches fnsave's behaviour. 3541 * We only need to drop to !PCB_INITDONE in sendsig(). But 3542 * sendsig() is the only caller of npxgetregs()... perhaps we just 3543 * have too many layers. 
3544 */ 3545 curthread->td_pcb->pcb_flags &= ~(PCB_NPXINITDONE | 3546 PCB_NPXUSERINITDONE); 3547 critical_exit(); 3548} 3549 3550int 3551fill_dbregs(struct thread *td, struct dbreg *dbregs) 3552{ 3553 struct pcb *pcb; 3554 3555 if (td == NULL) { 3556 dbregs->dr[0] = rdr0(); 3557 dbregs->dr[1] = rdr1(); 3558 dbregs->dr[2] = rdr2(); 3559 dbregs->dr[3] = rdr3(); 3560 dbregs->dr[4] = rdr4(); 3561 dbregs->dr[5] = rdr5(); 3562 dbregs->dr[6] = rdr6(); 3563 dbregs->dr[7] = rdr7(); 3564 } else { 3565 pcb = td->td_pcb; 3566 dbregs->dr[0] = pcb->pcb_dr0; 3567 dbregs->dr[1] = pcb->pcb_dr1; 3568 dbregs->dr[2] = pcb->pcb_dr2; 3569 dbregs->dr[3] = pcb->pcb_dr3; 3570 dbregs->dr[4] = 0; 3571 dbregs->dr[5] = 0; 3572 dbregs->dr[6] = pcb->pcb_dr6; 3573 dbregs->dr[7] = pcb->pcb_dr7; 3574 } 3575 return (0); 3576} 3577 3578int 3579set_dbregs(struct thread *td, struct dbreg *dbregs) 3580{ 3581 struct pcb *pcb; 3582 int i; 3583 3584 if (td == NULL) { 3585 load_dr0(dbregs->dr[0]); 3586 load_dr1(dbregs->dr[1]); 3587 load_dr2(dbregs->dr[2]); 3588 load_dr3(dbregs->dr[3]); 3589 load_dr4(dbregs->dr[4]); 3590 load_dr5(dbregs->dr[5]); 3591 load_dr6(dbregs->dr[6]); 3592 load_dr7(dbregs->dr[7]); 3593 } else { 3594 /* 3595 * Don't let an illegal value for dr7 get set. Specifically, 3596 * check for undefined settings. Setting these bit patterns 3597 * result in undefined behaviour and can lead to an unexpected 3598 * TRCTRAP. 3599 */ 3600 for (i = 0; i < 4; i++) { 3601 if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02) 3602 return (EINVAL); 3603 if (DBREG_DR7_LEN(dbregs->dr[7], i) == 0x02) 3604 return (EINVAL); 3605 } 3606 3607 pcb = td->td_pcb; 3608 3609 /* 3610 * Don't let a process set a breakpoint that is not within the 3611 * process's address space. If a process could do this, it 3612 * could halt the system by setting a breakpoint in the kernel 3613 * (if ddb was enabled). 
Thus, we need to check to make sure 3614 * that no breakpoints are being enabled for addresses outside 3615 * process's address space. 3616 * 3617 * XXX - what about when the watched area of the user's 3618 * address space is written into from within the kernel 3619 * ... wouldn't that still cause a breakpoint to be generated 3620 * from within kernel mode? 3621 */ 3622 3623 if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) { 3624 /* dr0 is enabled */ 3625 if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS) 3626 return (EINVAL); 3627 } 3628 3629 if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) { 3630 /* dr1 is enabled */ 3631 if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS) 3632 return (EINVAL); 3633 } 3634 3635 if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) { 3636 /* dr2 is enabled */ 3637 if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS) 3638 return (EINVAL); 3639 } 3640 3641 if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) { 3642 /* dr3 is enabled */ 3643 if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS) 3644 return (EINVAL); 3645 } 3646 3647 pcb->pcb_dr0 = dbregs->dr[0]; 3648 pcb->pcb_dr1 = dbregs->dr[1]; 3649 pcb->pcb_dr2 = dbregs->dr[2]; 3650 pcb->pcb_dr3 = dbregs->dr[3]; 3651 pcb->pcb_dr6 = dbregs->dr[6]; 3652 pcb->pcb_dr7 = dbregs->dr[7]; 3653 3654 pcb->pcb_flags |= PCB_DBREGS; 3655 } 3656 3657 return (0); 3658} 3659 3660/* 3661 * Return > 0 if a hardware breakpoint has been hit, and the 3662 * breakpoint was in user space. Return 0, otherwise. 
3663 */ 3664int 3665user_dbreg_trap(void) 3666{ 3667 u_int32_t dr7, dr6; /* debug registers dr6 and dr7 */ 3668 u_int32_t bp; /* breakpoint bits extracted from dr6 */ 3669 int nbp; /* number of breakpoints that triggered */ 3670 caddr_t addr[4]; /* breakpoint addresses */ 3671 int i; 3672 3673 dr7 = rdr7(); 3674 if ((dr7 & 0x000000ff) == 0) { 3675 /* 3676 * all GE and LE bits in the dr7 register are zero, 3677 * thus the trap couldn't have been caused by the 3678 * hardware debug registers 3679 */ 3680 return 0; 3681 } 3682 3683 nbp = 0; 3684 dr6 = rdr6(); 3685 bp = dr6 & 0x0000000f; 3686 3687 if (!bp) { 3688 /* 3689 * None of the breakpoint bits are set meaning this 3690 * trap was not caused by any of the debug registers 3691 */ 3692 return 0; 3693 } 3694 3695 /* 3696 * at least one of the breakpoints were hit, check to see 3697 * which ones and if any of them are user space addresses 3698 */ 3699 3700 if (bp & 0x01) { 3701 addr[nbp++] = (caddr_t)rdr0(); 3702 } 3703 if (bp & 0x02) { 3704 addr[nbp++] = (caddr_t)rdr1(); 3705 } 3706 if (bp & 0x04) { 3707 addr[nbp++] = (caddr_t)rdr2(); 3708 } 3709 if (bp & 0x08) { 3710 addr[nbp++] = (caddr_t)rdr3(); 3711 } 3712 3713 for (i = 0; i < nbp; i++) { 3714 if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) { 3715 /* 3716 * addr[i] is in user space 3717 */ 3718 return nbp; 3719 } 3720 } 3721 3722 /* 3723 * None of the breakpoints are in user space. 3724 */ 3725 return 0; 3726} 3727 3728#ifdef KDB 3729 3730/* 3731 * Provide inb() and outb() as functions. They are normally only available as 3732 * inline functions, thus cannot be called from the debugger. 3733 */ 3734 3735/* silence compiler warnings */ 3736u_char inb_(u_short); 3737void outb_(u_short, u_char); 3738 3739u_char 3740inb_(u_short port) 3741{ 3742 return inb(port); 3743} 3744 3745void 3746outb_(u_short port, u_char data) 3747{ 3748 outb(port, data); 3749} 3750 3751#endif /* KDB */ 3752