/* machdep.c — FreeBSD stable/10, revision 258559 (provenance note; not part of the compiled source) */
1/*- 2 * Copyright (c) 1992 Terrence R. Lambert. 3 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. 4 * All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * William Jolitz. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 3. All advertising materials mentioning features or use of this software 18 * must display the following acknowledgement: 19 * This product includes software developed by the University of 20 * California, Berkeley and its contributors. 21 * 4. Neither the name of the University nor the names of its contributors 22 * may be used to endorse or promote products derived from this software 23 * without specific prior written permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 * SUCH DAMAGE. 36 * 37 * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 38 */ 39 40#include <sys/cdefs.h> 41__FBSDID("$FreeBSD: stable/10/sys/i386/i386/machdep.c 258559 2013-11-25 15:58:48Z emaste $"); 42 43#include "opt_apic.h" 44#include "opt_atalk.h" 45#include "opt_atpic.h" 46#include "opt_compat.h" 47#include "opt_cpu.h" 48#include "opt_ddb.h" 49#include "opt_inet.h" 50#include "opt_ipx.h" 51#include "opt_isa.h" 52#include "opt_kstack_pages.h" 53#include "opt_maxmem.h" 54#include "opt_mp_watchdog.h" 55#include "opt_npx.h" 56#include "opt_perfmon.h" 57#include "opt_platform.h" 58#include "opt_xbox.h" 59#include "opt_kdtrace.h" 60 61#include <sys/param.h> 62#include <sys/proc.h> 63#include <sys/systm.h> 64#include <sys/bio.h> 65#include <sys/buf.h> 66#include <sys/bus.h> 67#include <sys/callout.h> 68#include <sys/cons.h> 69#include <sys/cpu.h> 70#include <sys/eventhandler.h> 71#include <sys/exec.h> 72#include <sys/imgact.h> 73#include <sys/kdb.h> 74#include <sys/kernel.h> 75#include <sys/ktr.h> 76#include <sys/linker.h> 77#include <sys/lock.h> 78#include <sys/malloc.h> 79#include <sys/memrange.h> 80#include <sys/msgbuf.h> 81#include <sys/mutex.h> 82#include <sys/pcpu.h> 83#include <sys/ptrace.h> 84#include <sys/reboot.h> 85#include <sys/rwlock.h> 86#include <sys/sched.h> 87#include <sys/signalvar.h> 88#ifdef SMP 89#include <sys/smp.h> 90#endif 91#include <sys/syscallsubr.h> 92#include <sys/sysctl.h> 93#include <sys/sysent.h> 94#include 
<sys/sysproto.h> 95#include <sys/ucontext.h> 96#include <sys/vmmeter.h> 97 98#include <vm/vm.h> 99#include <vm/vm_extern.h> 100#include <vm/vm_kern.h> 101#include <vm/vm_page.h> 102#include <vm/vm_map.h> 103#include <vm/vm_object.h> 104#include <vm/vm_pager.h> 105#include <vm/vm_param.h> 106 107#ifdef DDB 108#ifndef KDB 109#error KDB must be enabled in order for DDB to work! 110#endif 111#include <ddb/ddb.h> 112#include <ddb/db_sym.h> 113#endif 114 115#include <isa/rtc.h> 116 117#include <net/netisr.h> 118 119#include <machine/bootinfo.h> 120#include <machine/clock.h> 121#include <machine/cpu.h> 122#include <machine/cputypes.h> 123#include <machine/intr_machdep.h> 124#include <x86/mca.h> 125#include <machine/md_var.h> 126#include <machine/metadata.h> 127#include <machine/mp_watchdog.h> 128#include <machine/pc/bios.h> 129#include <machine/pcb.h> 130#include <machine/pcb_ext.h> 131#include <machine/proc.h> 132#include <machine/reg.h> 133#include <machine/sigframe.h> 134#include <machine/specialreg.h> 135#include <machine/vm86.h> 136#ifdef PERFMON 137#include <machine/perfmon.h> 138#endif 139#ifdef SMP 140#include <machine/smp.h> 141#endif 142#ifdef FDT 143#include <x86/fdt.h> 144#endif 145 146#ifdef DEV_APIC 147#include <machine/apicvar.h> 148#endif 149 150#ifdef DEV_ISA 151#include <x86/isa/icu.h> 152#endif 153 154#ifdef XBOX 155#include <machine/xbox.h> 156 157int arch_i386_is_xbox = 0; 158uint32_t arch_i386_xbox_memsize = 0; 159#endif 160 161#ifdef XEN 162/* XEN includes */ 163#include <xen/xen-os.h> 164#include <xen/hypervisor.h> 165#include <machine/xen/xenvar.h> 166#include <machine/xen/xenfunc.h> 167#include <xen/xen_intr.h> 168 169void Xhypervisor_callback(void); 170void failsafe_callback(void); 171 172extern trap_info_t trap_table[]; 173struct proc_ldt default_proc_ldt; 174extern int init_first; 175int running_xen = 1; 176extern unsigned long physfree; 177#endif /* XEN */ 178 179/* Sanity check for __curthread() */ 180CTASSERT(offsetof(struct pcpu, 
pc_curthread) == 0); 181 182extern void init386(int first); 183extern void dblfault_handler(void); 184 185extern void printcpuinfo(void); /* XXX header file */ 186extern void finishidentcpu(void); 187extern void panicifcpuunsupported(void); 188 189#define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) 190#define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) 191 192#if !defined(CPU_DISABLE_SSE) && defined(I686_CPU) 193#define CPU_ENABLE_SSE 194#endif 195 196static void cpu_startup(void *); 197static void fpstate_drop(struct thread *td); 198static void get_fpcontext(struct thread *td, mcontext_t *mcp); 199static int set_fpcontext(struct thread *td, const mcontext_t *mcp); 200#ifdef CPU_ENABLE_SSE 201static void set_fpregs_xmm(struct save87 *, struct savexmm *); 202static void fill_fpregs_xmm(struct savexmm *, struct save87 *); 203#endif /* CPU_ENABLE_SSE */ 204SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL); 205 206#ifdef DDB 207extern vm_offset_t ksym_start, ksym_end; 208#endif 209 210/* Intel ICH registers */ 211#define ICH_PMBASE 0x400 212#define ICH_SMI_EN ICH_PMBASE + 0x30 213 214int _udatasel, _ucodesel; 215u_int basemem; 216 217int cold = 1; 218 219#ifdef COMPAT_43 220static void osendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask); 221#endif 222#ifdef COMPAT_FREEBSD4 223static void freebsd4_sendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask); 224#endif 225 226long Maxmem = 0; 227long realmem = 0; 228 229#ifdef PAE 230FEATURE(pae, "Physical Address Extensions"); 231#endif 232 233/* 234 * The number of PHYSMAP entries must be one less than the number of 235 * PHYSSEG entries because the PHYSMAP entry that spans the largest 236 * physical address that is accessible by ISA DMA is split into two 237 * PHYSSEG entries. 
238 */ 239#define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1)) 240 241vm_paddr_t phys_avail[PHYSMAP_SIZE + 2]; 242vm_paddr_t dump_avail[PHYSMAP_SIZE + 2]; 243 244/* must be 2 less so 0 0 can signal end of chunks */ 245#define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(phys_avail[0])) - 2) 246#define DUMP_AVAIL_ARRAY_END ((sizeof(dump_avail) / sizeof(dump_avail[0])) - 2) 247 248struct kva_md_info kmi; 249 250static struct trapframe proc0_tf; 251struct pcpu __pcpu[MAXCPU]; 252 253struct mtx icu_lock; 254 255struct mem_range_softc mem_range_softc; 256 257static void 258cpu_startup(dummy) 259 void *dummy; 260{ 261 uintmax_t memsize; 262 char *sysenv; 263 264 /* 265 * On MacBooks, we need to disallow the legacy USB circuit to 266 * generate an SMI# because this can cause several problems, 267 * namely: incorrect CPU frequency detection and failure to 268 * start the APs. 269 * We do this by disabling a bit in the SMI_EN (SMI Control and 270 * Enable register) of the Intel ICH LPC Interface Bridge. 271 */ 272 sysenv = getenv("smbios.system.product"); 273 if (sysenv != NULL) { 274 if (strncmp(sysenv, "MacBook1,1", 10) == 0 || 275 strncmp(sysenv, "MacBook3,1", 10) == 0 || 276 strncmp(sysenv, "MacBookPro1,1", 13) == 0 || 277 strncmp(sysenv, "MacBookPro1,2", 13) == 0 || 278 strncmp(sysenv, "MacBookPro3,1", 13) == 0 || 279 strncmp(sysenv, "Macmini1,1", 10) == 0) { 280 if (bootverbose) 281 printf("Disabling LEGACY_USB_EN bit on " 282 "Intel ICH.\n"); 283 outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8); 284 } 285 freeenv(sysenv); 286 } 287 288 /* 289 * Good {morning,afternoon,evening,night}. 290 */ 291 startrtclock(); 292 printcpuinfo(); 293 panicifcpuunsupported(); 294#ifdef PERFMON 295 perfmon_init(); 296#endif 297 realmem = Maxmem; 298 299 /* 300 * Display physical memory if SMBIOS reports reasonable amount. 
301 */ 302 memsize = 0; 303 sysenv = getenv("smbios.memory.enabled"); 304 if (sysenv != NULL) { 305 memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10; 306 freeenv(sysenv); 307 } 308 if (memsize < ptoa((uintmax_t)cnt.v_free_count)) 309 memsize = ptoa((uintmax_t)Maxmem); 310 printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20); 311 312 /* 313 * Display any holes after the first chunk of extended memory. 314 */ 315 if (bootverbose) { 316 int indx; 317 318 printf("Physical memory chunk(s):\n"); 319 for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { 320 vm_paddr_t size; 321 322 size = phys_avail[indx + 1] - phys_avail[indx]; 323 printf( 324 "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n", 325 (uintmax_t)phys_avail[indx], 326 (uintmax_t)phys_avail[indx + 1] - 1, 327 (uintmax_t)size, (uintmax_t)size / PAGE_SIZE); 328 } 329 } 330 331 vm_ksubmap_init(&kmi); 332 333 printf("avail memory = %ju (%ju MB)\n", 334 ptoa((uintmax_t)cnt.v_free_count), 335 ptoa((uintmax_t)cnt.v_free_count) / 1048576); 336 337 /* 338 * Set up buffers, so they can be used to read disk labels. 339 */ 340 bufinit(); 341 vm_pager_bufferinit(); 342#ifndef XEN 343 cpu_setregs(); 344#endif 345} 346 347/* 348 * Send an interrupt to process. 349 * 350 * Stack is set up to allow sigcode stored 351 * at top to call routine, followed by kcall 352 * to sigreturn routine below. After sigreturn 353 * resets the signal mask, the stack, and the 354 * frame pointer, it returns to the user 355 * specified pc, psl. 
356 */ 357#ifdef COMPAT_43 358static void 359osendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) 360{ 361 struct osigframe sf, *fp; 362 struct proc *p; 363 struct thread *td; 364 struct sigacts *psp; 365 struct trapframe *regs; 366 int sig; 367 int oonstack; 368 369 td = curthread; 370 p = td->td_proc; 371 PROC_LOCK_ASSERT(p, MA_OWNED); 372 sig = ksi->ksi_signo; 373 psp = p->p_sigacts; 374 mtx_assert(&psp->ps_mtx, MA_OWNED); 375 regs = td->td_frame; 376 oonstack = sigonstack(regs->tf_esp); 377 378 /* Allocate space for the signal handler context. */ 379 if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && 380 SIGISMEMBER(psp->ps_sigonstack, sig)) { 381 fp = (struct osigframe *)(td->td_sigstk.ss_sp + 382 td->td_sigstk.ss_size - sizeof(struct osigframe)); 383#if defined(COMPAT_43) 384 td->td_sigstk.ss_flags |= SS_ONSTACK; 385#endif 386 } else 387 fp = (struct osigframe *)regs->tf_esp - 1; 388 389 /* Translate the signal if appropriate. */ 390 if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) 391 sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; 392 393 /* Build the argument list for the signal handler. */ 394 sf.sf_signum = sig; 395 sf.sf_scp = (register_t)&fp->sf_siginfo.si_sc; 396 bzero(&sf.sf_siginfo, sizeof(sf.sf_siginfo)); 397 if (SIGISMEMBER(psp->ps_siginfo, sig)) { 398 /* Signal handler installed with SA_SIGINFO. */ 399 sf.sf_arg2 = (register_t)&fp->sf_siginfo; 400 sf.sf_siginfo.si_signo = sig; 401 sf.sf_siginfo.si_code = ksi->ksi_code; 402 sf.sf_ahu.sf_action = (__osiginfohandler_t *)catcher; 403 sf.sf_addr = 0; 404 } else { 405 /* Old FreeBSD-style arguments. */ 406 sf.sf_arg2 = ksi->ksi_code; 407 sf.sf_addr = (register_t)ksi->ksi_addr; 408 sf.sf_ahu.sf_handler = catcher; 409 } 410 mtx_unlock(&psp->ps_mtx); 411 PROC_UNLOCK(p); 412 413 /* Save most if not all of trap frame. 
*/ 414 sf.sf_siginfo.si_sc.sc_eax = regs->tf_eax; 415 sf.sf_siginfo.si_sc.sc_ebx = regs->tf_ebx; 416 sf.sf_siginfo.si_sc.sc_ecx = regs->tf_ecx; 417 sf.sf_siginfo.si_sc.sc_edx = regs->tf_edx; 418 sf.sf_siginfo.si_sc.sc_esi = regs->tf_esi; 419 sf.sf_siginfo.si_sc.sc_edi = regs->tf_edi; 420 sf.sf_siginfo.si_sc.sc_cs = regs->tf_cs; 421 sf.sf_siginfo.si_sc.sc_ds = regs->tf_ds; 422 sf.sf_siginfo.si_sc.sc_ss = regs->tf_ss; 423 sf.sf_siginfo.si_sc.sc_es = regs->tf_es; 424 sf.sf_siginfo.si_sc.sc_fs = regs->tf_fs; 425 sf.sf_siginfo.si_sc.sc_gs = rgs(); 426 sf.sf_siginfo.si_sc.sc_isp = regs->tf_isp; 427 428 /* Build the signal context to be used by osigreturn(). */ 429 sf.sf_siginfo.si_sc.sc_onstack = (oonstack) ? 1 : 0; 430 SIG2OSIG(*mask, sf.sf_siginfo.si_sc.sc_mask); 431 sf.sf_siginfo.si_sc.sc_sp = regs->tf_esp; 432 sf.sf_siginfo.si_sc.sc_fp = regs->tf_ebp; 433 sf.sf_siginfo.si_sc.sc_pc = regs->tf_eip; 434 sf.sf_siginfo.si_sc.sc_ps = regs->tf_eflags; 435 sf.sf_siginfo.si_sc.sc_trapno = regs->tf_trapno; 436 sf.sf_siginfo.si_sc.sc_err = regs->tf_err; 437 438 /* 439 * If we're a vm86 process, we want to save the segment registers. 440 * We also change eflags to be our emulated eflags, not the actual 441 * eflags. 442 */ 443 if (regs->tf_eflags & PSL_VM) { 444 /* XXX confusing names: `tf' isn't a trapframe; `regs' is. */ 445 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 446 struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; 447 448 sf.sf_siginfo.si_sc.sc_gs = tf->tf_vm86_gs; 449 sf.sf_siginfo.si_sc.sc_fs = tf->tf_vm86_fs; 450 sf.sf_siginfo.si_sc.sc_es = tf->tf_vm86_es; 451 sf.sf_siginfo.si_sc.sc_ds = tf->tf_vm86_ds; 452 453 if (vm86->vm86_has_vme == 0) 454 sf.sf_siginfo.si_sc.sc_ps = 455 (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | 456 (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); 457 458 /* See sendsig() for comments. */ 459 tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); 460 } 461 462 /* 463 * Copy the sigframe out to the user's stack. 
464 */ 465 if (copyout(&sf, fp, sizeof(*fp)) != 0) { 466#ifdef DEBUG 467 printf("process %ld has trashed its stack\n", (long)p->p_pid); 468#endif 469 PROC_LOCK(p); 470 sigexit(td, SIGILL); 471 } 472 473 regs->tf_esp = (int)fp; 474 if (p->p_sysent->sv_sigcode_base != 0) { 475 regs->tf_eip = p->p_sysent->sv_sigcode_base + szsigcode - 476 szosigcode; 477 } else { 478 /* a.out sysentvec does not use shared page */ 479 regs->tf_eip = p->p_sysent->sv_psstrings - szosigcode; 480 } 481 regs->tf_eflags &= ~(PSL_T | PSL_D); 482 regs->tf_cs = _ucodesel; 483 regs->tf_ds = _udatasel; 484 regs->tf_es = _udatasel; 485 regs->tf_fs = _udatasel; 486 load_gs(_udatasel); 487 regs->tf_ss = _udatasel; 488 PROC_LOCK(p); 489 mtx_lock(&psp->ps_mtx); 490} 491#endif /* COMPAT_43 */ 492 493#ifdef COMPAT_FREEBSD4 494static void 495freebsd4_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) 496{ 497 struct sigframe4 sf, *sfp; 498 struct proc *p; 499 struct thread *td; 500 struct sigacts *psp; 501 struct trapframe *regs; 502 int sig; 503 int oonstack; 504 505 td = curthread; 506 p = td->td_proc; 507 PROC_LOCK_ASSERT(p, MA_OWNED); 508 sig = ksi->ksi_signo; 509 psp = p->p_sigacts; 510 mtx_assert(&psp->ps_mtx, MA_OWNED); 511 regs = td->td_frame; 512 oonstack = sigonstack(regs->tf_esp); 513 514 /* Save user context. */ 515 bzero(&sf, sizeof(sf)); 516 sf.sf_uc.uc_sigmask = *mask; 517 sf.sf_uc.uc_stack = td->td_sigstk; 518 sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) 519 ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; 520 sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; 521 sf.sf_uc.uc_mcontext.mc_gs = rgs(); 522 bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs)); 523 bzero(sf.sf_uc.uc_mcontext.mc_fpregs, 524 sizeof(sf.sf_uc.uc_mcontext.mc_fpregs)); 525 bzero(sf.sf_uc.uc_mcontext.__spare__, 526 sizeof(sf.sf_uc.uc_mcontext.__spare__)); 527 bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__)); 528 529 /* Allocate space for the signal handler context. 
*/ 530 if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && 531 SIGISMEMBER(psp->ps_sigonstack, sig)) { 532 sfp = (struct sigframe4 *)(td->td_sigstk.ss_sp + 533 td->td_sigstk.ss_size - sizeof(struct sigframe4)); 534#if defined(COMPAT_43) 535 td->td_sigstk.ss_flags |= SS_ONSTACK; 536#endif 537 } else 538 sfp = (struct sigframe4 *)regs->tf_esp - 1; 539 540 /* Translate the signal if appropriate. */ 541 if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) 542 sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; 543 544 /* Build the argument list for the signal handler. */ 545 sf.sf_signum = sig; 546 sf.sf_ucontext = (register_t)&sfp->sf_uc; 547 bzero(&sf.sf_si, sizeof(sf.sf_si)); 548 if (SIGISMEMBER(psp->ps_siginfo, sig)) { 549 /* Signal handler installed with SA_SIGINFO. */ 550 sf.sf_siginfo = (register_t)&sfp->sf_si; 551 sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; 552 553 /* Fill in POSIX parts */ 554 sf.sf_si.si_signo = sig; 555 sf.sf_si.si_code = ksi->ksi_code; 556 sf.sf_si.si_addr = ksi->ksi_addr; 557 } else { 558 /* Old FreeBSD-style arguments. */ 559 sf.sf_siginfo = ksi->ksi_code; 560 sf.sf_addr = (register_t)ksi->ksi_addr; 561 sf.sf_ahu.sf_handler = catcher; 562 } 563 mtx_unlock(&psp->ps_mtx); 564 PROC_UNLOCK(p); 565 566 /* 567 * If we're a vm86 process, we want to save the segment registers. 568 * We also change eflags to be our emulated eflags, not the actual 569 * eflags. 
570 */ 571 if (regs->tf_eflags & PSL_VM) { 572 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 573 struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; 574 575 sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs; 576 sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs; 577 sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es; 578 sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds; 579 580 if (vm86->vm86_has_vme == 0) 581 sf.sf_uc.uc_mcontext.mc_eflags = 582 (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | 583 (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); 584 585 /* 586 * Clear PSL_NT to inhibit T_TSSFLT faults on return from 587 * syscalls made by the signal handler. This just avoids 588 * wasting time for our lazy fixup of such faults. PSL_NT 589 * does nothing in vm86 mode, but vm86 programs can set it 590 * almost legitimately in probes for old cpu types. 591 */ 592 tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); 593 } 594 595 /* 596 * Copy the sigframe out to the user's stack. 597 */ 598 if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { 599#ifdef DEBUG 600 printf("process %ld has trashed its stack\n", (long)p->p_pid); 601#endif 602 PROC_LOCK(p); 603 sigexit(td, SIGILL); 604 } 605 606 regs->tf_esp = (int)sfp; 607 regs->tf_eip = p->p_sysent->sv_sigcode_base + szsigcode - 608 szfreebsd4_sigcode; 609 regs->tf_eflags &= ~(PSL_T | PSL_D); 610 regs->tf_cs = _ucodesel; 611 regs->tf_ds = _udatasel; 612 regs->tf_es = _udatasel; 613 regs->tf_fs = _udatasel; 614 regs->tf_ss = _udatasel; 615 PROC_LOCK(p); 616 mtx_lock(&psp->ps_mtx); 617} 618#endif /* COMPAT_FREEBSD4 */ 619 620void 621sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) 622{ 623 struct sigframe sf, *sfp; 624 struct proc *p; 625 struct thread *td; 626 struct sigacts *psp; 627 char *sp; 628 struct trapframe *regs; 629 struct segment_descriptor *sdp; 630 int sig; 631 int oonstack; 632 633 td = curthread; 634 p = td->td_proc; 635 PROC_LOCK_ASSERT(p, MA_OWNED); 636 sig = ksi->ksi_signo; 637 psp = p->p_sigacts; 638 
mtx_assert(&psp->ps_mtx, MA_OWNED); 639#ifdef COMPAT_FREEBSD4 640 if (SIGISMEMBER(psp->ps_freebsd4, sig)) { 641 freebsd4_sendsig(catcher, ksi, mask); 642 return; 643 } 644#endif 645#ifdef COMPAT_43 646 if (SIGISMEMBER(psp->ps_osigset, sig)) { 647 osendsig(catcher, ksi, mask); 648 return; 649 } 650#endif 651 regs = td->td_frame; 652 oonstack = sigonstack(regs->tf_esp); 653 654 /* Save user context. */ 655 bzero(&sf, sizeof(sf)); 656 sf.sf_uc.uc_sigmask = *mask; 657 sf.sf_uc.uc_stack = td->td_sigstk; 658 sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) 659 ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; 660 sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; 661 sf.sf_uc.uc_mcontext.mc_gs = rgs(); 662 bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs)); 663 sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */ 664 get_fpcontext(td, &sf.sf_uc.uc_mcontext); 665 fpstate_drop(td); 666 /* 667 * Unconditionally fill the fsbase and gsbase into the mcontext. 668 */ 669 sdp = &td->td_pcb->pcb_fsd; 670 sf.sf_uc.uc_mcontext.mc_fsbase = sdp->sd_hibase << 24 | 671 sdp->sd_lobase; 672 sdp = &td->td_pcb->pcb_gsd; 673 sf.sf_uc.uc_mcontext.mc_gsbase = sdp->sd_hibase << 24 | 674 sdp->sd_lobase; 675 sf.sf_uc.uc_mcontext.mc_flags = 0; 676 bzero(sf.sf_uc.uc_mcontext.mc_spare2, 677 sizeof(sf.sf_uc.uc_mcontext.mc_spare2)); 678 bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__)); 679 680 /* Allocate space for the signal handler context. */ 681 if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && 682 SIGISMEMBER(psp->ps_sigonstack, sig)) { 683 sp = td->td_sigstk.ss_sp + 684 td->td_sigstk.ss_size - sizeof(struct sigframe); 685#if defined(COMPAT_43) 686 td->td_sigstk.ss_flags |= SS_ONSTACK; 687#endif 688 } else 689 sp = (char *)regs->tf_esp - sizeof(struct sigframe); 690 /* Align to 16 bytes. */ 691 sfp = (struct sigframe *)((unsigned int)sp & ~0xF); 692 693 /* Translate the signal if appropriate. 
*/ 694 if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) 695 sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; 696 697 /* Build the argument list for the signal handler. */ 698 sf.sf_signum = sig; 699 sf.sf_ucontext = (register_t)&sfp->sf_uc; 700 bzero(&sf.sf_si, sizeof(sf.sf_si)); 701 if (SIGISMEMBER(psp->ps_siginfo, sig)) { 702 /* Signal handler installed with SA_SIGINFO. */ 703 sf.sf_siginfo = (register_t)&sfp->sf_si; 704 sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; 705 706 /* Fill in POSIX parts */ 707 sf.sf_si = ksi->ksi_info; 708 sf.sf_si.si_signo = sig; /* maybe a translated signal */ 709 } else { 710 /* Old FreeBSD-style arguments. */ 711 sf.sf_siginfo = ksi->ksi_code; 712 sf.sf_addr = (register_t)ksi->ksi_addr; 713 sf.sf_ahu.sf_handler = catcher; 714 } 715 mtx_unlock(&psp->ps_mtx); 716 PROC_UNLOCK(p); 717 718 /* 719 * If we're a vm86 process, we want to save the segment registers. 720 * We also change eflags to be our emulated eflags, not the actual 721 * eflags. 722 */ 723 if (regs->tf_eflags & PSL_VM) { 724 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 725 struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; 726 727 sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs; 728 sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs; 729 sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es; 730 sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds; 731 732 if (vm86->vm86_has_vme == 0) 733 sf.sf_uc.uc_mcontext.mc_eflags = 734 (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | 735 (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); 736 737 /* 738 * Clear PSL_NT to inhibit T_TSSFLT faults on return from 739 * syscalls made by the signal handler. This just avoids 740 * wasting time for our lazy fixup of such faults. PSL_NT 741 * does nothing in vm86 mode, but vm86 programs can set it 742 * almost legitimately in probes for old cpu types. 743 */ 744 tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); 745 } 746 747 /* 748 * Copy the sigframe out to the user's stack. 
749 */ 750 if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { 751#ifdef DEBUG 752 printf("process %ld has trashed its stack\n", (long)p->p_pid); 753#endif 754 PROC_LOCK(p); 755 sigexit(td, SIGILL); 756 } 757 758 regs->tf_esp = (int)sfp; 759 regs->tf_eip = p->p_sysent->sv_sigcode_base; 760 if (regs->tf_eip == 0) 761 regs->tf_eip = p->p_sysent->sv_psstrings - szsigcode; 762 regs->tf_eflags &= ~(PSL_T | PSL_D); 763 regs->tf_cs = _ucodesel; 764 regs->tf_ds = _udatasel; 765 regs->tf_es = _udatasel; 766 regs->tf_fs = _udatasel; 767 regs->tf_ss = _udatasel; 768 PROC_LOCK(p); 769 mtx_lock(&psp->ps_mtx); 770} 771 772/* 773 * System call to cleanup state after a signal 774 * has been taken. Reset signal mask and 775 * stack state from context left by sendsig (above). 776 * Return to previous pc and psl as specified by 777 * context left by sendsig. Check carefully to 778 * make sure that the user has not modified the 779 * state to gain improper privileges. 780 * 781 * MPSAFE 782 */ 783#ifdef COMPAT_43 784int 785osigreturn(td, uap) 786 struct thread *td; 787 struct osigreturn_args /* { 788 struct osigcontext *sigcntxp; 789 } */ *uap; 790{ 791 struct osigcontext sc; 792 struct trapframe *regs; 793 struct osigcontext *scp; 794 int eflags, error; 795 ksiginfo_t ksi; 796 797 regs = td->td_frame; 798 error = copyin(uap->sigcntxp, &sc, sizeof(sc)); 799 if (error != 0) 800 return (error); 801 scp = ≻ 802 eflags = scp->sc_ps; 803 if (eflags & PSL_VM) { 804 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 805 struct vm86_kernel *vm86; 806 807 /* 808 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't 809 * set up the vm86 area, and we can't enter vm86 mode. 810 */ 811 if (td->td_pcb->pcb_ext == 0) 812 return (EINVAL); 813 vm86 = &td->td_pcb->pcb_ext->ext_vm86; 814 if (vm86->vm86_inited == 0) 815 return (EINVAL); 816 817 /* Go back to user mode if both flags are set. 
*/ 818 if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { 819 ksiginfo_init_trap(&ksi); 820 ksi.ksi_signo = SIGBUS; 821 ksi.ksi_code = BUS_OBJERR; 822 ksi.ksi_addr = (void *)regs->tf_eip; 823 trapsignal(td, &ksi); 824 } 825 826 if (vm86->vm86_has_vme) { 827 eflags = (tf->tf_eflags & ~VME_USERCHANGE) | 828 (eflags & VME_USERCHANGE) | PSL_VM; 829 } else { 830 vm86->vm86_eflags = eflags; /* save VIF, VIP */ 831 eflags = (tf->tf_eflags & ~VM_USERCHANGE) | 832 (eflags & VM_USERCHANGE) | PSL_VM; 833 } 834 tf->tf_vm86_ds = scp->sc_ds; 835 tf->tf_vm86_es = scp->sc_es; 836 tf->tf_vm86_fs = scp->sc_fs; 837 tf->tf_vm86_gs = scp->sc_gs; 838 tf->tf_ds = _udatasel; 839 tf->tf_es = _udatasel; 840 tf->tf_fs = _udatasel; 841 } else { 842 /* 843 * Don't allow users to change privileged or reserved flags. 844 */ 845 if (!EFL_SECURE(eflags, regs->tf_eflags)) { 846 return (EINVAL); 847 } 848 849 /* 850 * Don't allow users to load a valid privileged %cs. Let the 851 * hardware check for invalid selectors, excess privilege in 852 * other selectors, invalid %eip's and invalid %esp's. 853 */ 854 if (!CS_SECURE(scp->sc_cs)) { 855 ksiginfo_init_trap(&ksi); 856 ksi.ksi_signo = SIGBUS; 857 ksi.ksi_code = BUS_OBJERR; 858 ksi.ksi_trapno = T_PROTFLT; 859 ksi.ksi_addr = (void *)regs->tf_eip; 860 trapsignal(td, &ksi); 861 return (EINVAL); 862 } 863 regs->tf_ds = scp->sc_ds; 864 regs->tf_es = scp->sc_es; 865 regs->tf_fs = scp->sc_fs; 866 } 867 868 /* Restore remaining registers. 
*/ 869 regs->tf_eax = scp->sc_eax; 870 regs->tf_ebx = scp->sc_ebx; 871 regs->tf_ecx = scp->sc_ecx; 872 regs->tf_edx = scp->sc_edx; 873 regs->tf_esi = scp->sc_esi; 874 regs->tf_edi = scp->sc_edi; 875 regs->tf_cs = scp->sc_cs; 876 regs->tf_ss = scp->sc_ss; 877 regs->tf_isp = scp->sc_isp; 878 regs->tf_ebp = scp->sc_fp; 879 regs->tf_esp = scp->sc_sp; 880 regs->tf_eip = scp->sc_pc; 881 regs->tf_eflags = eflags; 882 883#if defined(COMPAT_43) 884 if (scp->sc_onstack & 1) 885 td->td_sigstk.ss_flags |= SS_ONSTACK; 886 else 887 td->td_sigstk.ss_flags &= ~SS_ONSTACK; 888#endif 889 kern_sigprocmask(td, SIG_SETMASK, (sigset_t *)&scp->sc_mask, NULL, 890 SIGPROCMASK_OLD); 891 return (EJUSTRETURN); 892} 893#endif /* COMPAT_43 */ 894 895#ifdef COMPAT_FREEBSD4 896/* 897 * MPSAFE 898 */ 899int 900freebsd4_sigreturn(td, uap) 901 struct thread *td; 902 struct freebsd4_sigreturn_args /* { 903 const ucontext4 *sigcntxp; 904 } */ *uap; 905{ 906 struct ucontext4 uc; 907 struct trapframe *regs; 908 struct ucontext4 *ucp; 909 int cs, eflags, error; 910 ksiginfo_t ksi; 911 912 error = copyin(uap->sigcntxp, &uc, sizeof(uc)); 913 if (error != 0) 914 return (error); 915 ucp = &uc; 916 regs = td->td_frame; 917 eflags = ucp->uc_mcontext.mc_eflags; 918 if (eflags & PSL_VM) { 919 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 920 struct vm86_kernel *vm86; 921 922 /* 923 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't 924 * set up the vm86 area, and we can't enter vm86 mode. 925 */ 926 if (td->td_pcb->pcb_ext == 0) 927 return (EINVAL); 928 vm86 = &td->td_pcb->pcb_ext->ext_vm86; 929 if (vm86->vm86_inited == 0) 930 return (EINVAL); 931 932 /* Go back to user mode if both flags are set. 
*/ 933 if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { 934 ksiginfo_init_trap(&ksi); 935 ksi.ksi_signo = SIGBUS; 936 ksi.ksi_code = BUS_OBJERR; 937 ksi.ksi_addr = (void *)regs->tf_eip; 938 trapsignal(td, &ksi); 939 } 940 if (vm86->vm86_has_vme) { 941 eflags = (tf->tf_eflags & ~VME_USERCHANGE) | 942 (eflags & VME_USERCHANGE) | PSL_VM; 943 } else { 944 vm86->vm86_eflags = eflags; /* save VIF, VIP */ 945 eflags = (tf->tf_eflags & ~VM_USERCHANGE) | 946 (eflags & VM_USERCHANGE) | PSL_VM; 947 } 948 bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe)); 949 tf->tf_eflags = eflags; 950 tf->tf_vm86_ds = tf->tf_ds; 951 tf->tf_vm86_es = tf->tf_es; 952 tf->tf_vm86_fs = tf->tf_fs; 953 tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs; 954 tf->tf_ds = _udatasel; 955 tf->tf_es = _udatasel; 956 tf->tf_fs = _udatasel; 957 } else { 958 /* 959 * Don't allow users to change privileged or reserved flags. 960 */ 961 if (!EFL_SECURE(eflags, regs->tf_eflags)) { 962 uprintf("pid %d (%s): freebsd4_sigreturn eflags = 0x%x\n", 963 td->td_proc->p_pid, td->td_name, eflags); 964 return (EINVAL); 965 } 966 967 /* 968 * Don't allow users to load a valid privileged %cs. Let the 969 * hardware check for invalid selectors, excess privilege in 970 * other selectors, invalid %eip's and invalid %esp's. 
971 */ 972 cs = ucp->uc_mcontext.mc_cs; 973 if (!CS_SECURE(cs)) { 974 uprintf("pid %d (%s): freebsd4_sigreturn cs = 0x%x\n", 975 td->td_proc->p_pid, td->td_name, cs); 976 ksiginfo_init_trap(&ksi); 977 ksi.ksi_signo = SIGBUS; 978 ksi.ksi_code = BUS_OBJERR; 979 ksi.ksi_trapno = T_PROTFLT; 980 ksi.ksi_addr = (void *)regs->tf_eip; 981 trapsignal(td, &ksi); 982 return (EINVAL); 983 } 984 985 bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs)); 986 } 987 988#if defined(COMPAT_43) 989 if (ucp->uc_mcontext.mc_onstack & 1) 990 td->td_sigstk.ss_flags |= SS_ONSTACK; 991 else 992 td->td_sigstk.ss_flags &= ~SS_ONSTACK; 993#endif 994 kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0); 995 return (EJUSTRETURN); 996} 997#endif /* COMPAT_FREEBSD4 */ 998 999/* 1000 * MPSAFE 1001 */ 1002int 1003sys_sigreturn(td, uap) 1004 struct thread *td; 1005 struct sigreturn_args /* { 1006 const struct __ucontext *sigcntxp; 1007 } */ *uap; 1008{ 1009 ucontext_t uc; 1010 struct trapframe *regs; 1011 ucontext_t *ucp; 1012 int cs, eflags, error, ret; 1013 ksiginfo_t ksi; 1014 1015 error = copyin(uap->sigcntxp, &uc, sizeof(uc)); 1016 if (error != 0) 1017 return (error); 1018 ucp = &uc; 1019 regs = td->td_frame; 1020 eflags = ucp->uc_mcontext.mc_eflags; 1021 if (eflags & PSL_VM) { 1022 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 1023 struct vm86_kernel *vm86; 1024 1025 /* 1026 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't 1027 * set up the vm86 area, and we can't enter vm86 mode. 1028 */ 1029 if (td->td_pcb->pcb_ext == 0) 1030 return (EINVAL); 1031 vm86 = &td->td_pcb->pcb_ext->ext_vm86; 1032 if (vm86->vm86_inited == 0) 1033 return (EINVAL); 1034 1035 /* Go back to user mode if both flags are set. 
*/ 1036 if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { 1037 ksiginfo_init_trap(&ksi); 1038 ksi.ksi_signo = SIGBUS; 1039 ksi.ksi_code = BUS_OBJERR; 1040 ksi.ksi_addr = (void *)regs->tf_eip; 1041 trapsignal(td, &ksi); 1042 } 1043 1044 if (vm86->vm86_has_vme) { 1045 eflags = (tf->tf_eflags & ~VME_USERCHANGE) | 1046 (eflags & VME_USERCHANGE) | PSL_VM; 1047 } else { 1048 vm86->vm86_eflags = eflags; /* save VIF, VIP */ 1049 eflags = (tf->tf_eflags & ~VM_USERCHANGE) | 1050 (eflags & VM_USERCHANGE) | PSL_VM; 1051 } 1052 bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe)); 1053 tf->tf_eflags = eflags; 1054 tf->tf_vm86_ds = tf->tf_ds; 1055 tf->tf_vm86_es = tf->tf_es; 1056 tf->tf_vm86_fs = tf->tf_fs; 1057 tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs; 1058 tf->tf_ds = _udatasel; 1059 tf->tf_es = _udatasel; 1060 tf->tf_fs = _udatasel; 1061 } else { 1062 /* 1063 * Don't allow users to change privileged or reserved flags. 1064 */ 1065 if (!EFL_SECURE(eflags, regs->tf_eflags)) { 1066 uprintf("pid %d (%s): sigreturn eflags = 0x%x\n", 1067 td->td_proc->p_pid, td->td_name, eflags); 1068 return (EINVAL); 1069 } 1070 1071 /* 1072 * Don't allow users to load a valid privileged %cs. Let the 1073 * hardware check for invalid selectors, excess privilege in 1074 * other selectors, invalid %eip's and invalid %esp's. 
1075 */ 1076 cs = ucp->uc_mcontext.mc_cs; 1077 if (!CS_SECURE(cs)) { 1078 uprintf("pid %d (%s): sigreturn cs = 0x%x\n", 1079 td->td_proc->p_pid, td->td_name, cs); 1080 ksiginfo_init_trap(&ksi); 1081 ksi.ksi_signo = SIGBUS; 1082 ksi.ksi_code = BUS_OBJERR; 1083 ksi.ksi_trapno = T_PROTFLT; 1084 ksi.ksi_addr = (void *)regs->tf_eip; 1085 trapsignal(td, &ksi); 1086 return (EINVAL); 1087 } 1088 1089 ret = set_fpcontext(td, &ucp->uc_mcontext); 1090 if (ret != 0) 1091 return (ret); 1092 bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs)); 1093 } 1094 1095#if defined(COMPAT_43) 1096 if (ucp->uc_mcontext.mc_onstack & 1) 1097 td->td_sigstk.ss_flags |= SS_ONSTACK; 1098 else 1099 td->td_sigstk.ss_flags &= ~SS_ONSTACK; 1100#endif 1101 1102 kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0); 1103 return (EJUSTRETURN); 1104} 1105 1106/* 1107 * Machine dependent boot() routine 1108 * 1109 * I haven't seen anything to put here yet 1110 * Possibly some stuff might be grafted back here from boot() 1111 */ 1112void 1113cpu_boot(int howto) 1114{ 1115} 1116 1117/* 1118 * Flush the D-cache for non-DMA I/O so that the I-cache can 1119 * be made coherent later. 1120 */ 1121void 1122cpu_flush_dcache(void *ptr, size_t len) 1123{ 1124 /* Not applicable */ 1125} 1126 1127/* Get current clock frequency for the given cpu id. */ 1128int 1129cpu_est_clockrate(int cpu_id, uint64_t *rate) 1130{ 1131 uint64_t tsc1, tsc2; 1132 uint64_t acnt, mcnt, perf; 1133 register_t reg; 1134 1135 if (pcpu_find(cpu_id) == NULL || rate == NULL) 1136 return (EINVAL); 1137 if ((cpu_feature & CPUID_TSC) == 0) 1138 return (EOPNOTSUPP); 1139 1140 /* 1141 * If TSC is P-state invariant and APERF/MPERF MSRs do not exist, 1142 * DELAY(9) based logic fails. 1143 */ 1144 if (tsc_is_invariant && !tsc_perf_stat) 1145 return (EOPNOTSUPP); 1146 1147#ifdef SMP 1148 if (smp_cpus > 1) { 1149 /* Schedule ourselves on the indicated cpu. 
*/ 1150 thread_lock(curthread); 1151 sched_bind(curthread, cpu_id); 1152 thread_unlock(curthread); 1153 } 1154#endif 1155 1156 /* Calibrate by measuring a short delay. */ 1157 reg = intr_disable(); 1158 if (tsc_is_invariant) { 1159 wrmsr(MSR_MPERF, 0); 1160 wrmsr(MSR_APERF, 0); 1161 tsc1 = rdtsc(); 1162 DELAY(1000); 1163 mcnt = rdmsr(MSR_MPERF); 1164 acnt = rdmsr(MSR_APERF); 1165 tsc2 = rdtsc(); 1166 intr_restore(reg); 1167 perf = 1000 * acnt / mcnt; 1168 *rate = (tsc2 - tsc1) * perf; 1169 } else { 1170 tsc1 = rdtsc(); 1171 DELAY(1000); 1172 tsc2 = rdtsc(); 1173 intr_restore(reg); 1174 *rate = (tsc2 - tsc1) * 1000; 1175 } 1176 1177#ifdef SMP 1178 if (smp_cpus > 1) { 1179 thread_lock(curthread); 1180 sched_unbind(curthread); 1181 thread_unlock(curthread); 1182 } 1183#endif 1184 1185 return (0); 1186} 1187 1188#ifdef XEN 1189 1190static void 1191idle_block(void) 1192{ 1193 1194 HYPERVISOR_sched_op(SCHEDOP_block, 0); 1195} 1196 1197void 1198cpu_halt(void) 1199{ 1200 HYPERVISOR_shutdown(SHUTDOWN_poweroff); 1201} 1202 1203int scheduler_running; 1204 1205static void 1206cpu_idle_hlt(sbintime_t sbt) 1207{ 1208 1209 scheduler_running = 1; 1210 enable_intr(); 1211 idle_block(); 1212} 1213 1214#else 1215/* 1216 * Shutdown the CPU as much as possible 1217 */ 1218void 1219cpu_halt(void) 1220{ 1221 for (;;) 1222 halt(); 1223} 1224 1225#endif 1226 1227void (*cpu_idle_hook)(sbintime_t) = NULL; /* ACPI idle hook. */ 1228static int cpu_ident_amdc1e = 0; /* AMD C1E supported. */ 1229static int idle_mwait = 1; /* Use MONITOR/MWAIT for short idle. 
*/ 1230TUNABLE_INT("machdep.idle_mwait", &idle_mwait); 1231SYSCTL_INT(_machdep, OID_AUTO, idle_mwait, CTLFLAG_RW, &idle_mwait, 1232 0, "Use MONITOR/MWAIT for short idle"); 1233 1234#define STATE_RUNNING 0x0 1235#define STATE_MWAIT 0x1 1236#define STATE_SLEEPING 0x2 1237 1238static void 1239cpu_idle_acpi(sbintime_t sbt) 1240{ 1241 int *state; 1242 1243 state = (int *)PCPU_PTR(monitorbuf); 1244 *state = STATE_SLEEPING; 1245 1246 /* See comments in cpu_idle_hlt(). */ 1247 disable_intr(); 1248 if (sched_runnable()) 1249 enable_intr(); 1250 else if (cpu_idle_hook) 1251 cpu_idle_hook(sbt); 1252 else 1253 __asm __volatile("sti; hlt"); 1254 *state = STATE_RUNNING; 1255} 1256 1257#ifndef XEN 1258static void 1259cpu_idle_hlt(sbintime_t sbt) 1260{ 1261 int *state; 1262 1263 state = (int *)PCPU_PTR(monitorbuf); 1264 *state = STATE_SLEEPING; 1265 1266 /* 1267 * Since we may be in a critical section from cpu_idle(), if 1268 * an interrupt fires during that critical section we may have 1269 * a pending preemption. If the CPU halts, then that thread 1270 * may not execute until a later interrupt awakens the CPU. 1271 * To handle this race, check for a runnable thread after 1272 * disabling interrupts and immediately return if one is 1273 * found. Also, we must absolutely guarentee that hlt is 1274 * the next instruction after sti. This ensures that any 1275 * interrupt that fires after the call to disable_intr() will 1276 * immediately awaken the CPU from hlt. Finally, please note 1277 * that on x86 this works fine because of interrupts enabled only 1278 * after the instruction following sti takes place, while IF is set 1279 * to 1 immediately, allowing hlt instruction to acknowledge the 1280 * interrupt. 1281 */ 1282 disable_intr(); 1283 if (sched_runnable()) 1284 enable_intr(); 1285 else 1286 __asm __volatile("sti; hlt"); 1287 *state = STATE_RUNNING; 1288} 1289#endif 1290 1291/* 1292 * MWAIT cpu power states. Lower 4 bits are sub-states. 
1293 */ 1294#define MWAIT_C0 0xf0 1295#define MWAIT_C1 0x00 1296#define MWAIT_C2 0x10 1297#define MWAIT_C3 0x20 1298#define MWAIT_C4 0x30 1299 1300static void 1301cpu_idle_mwait(sbintime_t sbt) 1302{ 1303 int *state; 1304 1305 state = (int *)PCPU_PTR(monitorbuf); 1306 *state = STATE_MWAIT; 1307 1308 /* See comments in cpu_idle_hlt(). */ 1309 disable_intr(); 1310 if (sched_runnable()) { 1311 enable_intr(); 1312 *state = STATE_RUNNING; 1313 return; 1314 } 1315 cpu_monitor(state, 0, 0); 1316 if (*state == STATE_MWAIT) 1317 __asm __volatile("sti; mwait" : : "a" (MWAIT_C1), "c" (0)); 1318 else 1319 enable_intr(); 1320 *state = STATE_RUNNING; 1321} 1322 1323static void 1324cpu_idle_spin(sbintime_t sbt) 1325{ 1326 int *state; 1327 int i; 1328 1329 state = (int *)PCPU_PTR(monitorbuf); 1330 *state = STATE_RUNNING; 1331 1332 /* 1333 * The sched_runnable() call is racy but as long as there is 1334 * a loop missing it one time will have just a little impact if any 1335 * (and it is much better than missing the check at all). 1336 */ 1337 for (i = 0; i < 1000; i++) { 1338 if (sched_runnable()) 1339 return; 1340 cpu_spinwait(); 1341 } 1342} 1343 1344/* 1345 * C1E renders the local APIC timer dead, so we disable it by 1346 * reading the Interrupt Pending Message register and clearing 1347 * both C1eOnCmpHalt (bit 28) and SmiOnCmpHalt (bit 27). 1348 * 1349 * Reference: 1350 * "BIOS and Kernel Developer's Guide for AMD NPT Family 0Fh Processors" 1351 * #32559 revision 3.00+ 1352 */ 1353#define MSR_AMDK8_IPM 0xc0010055 1354#define AMDK8_SMIONCMPHALT (1ULL << 27) 1355#define AMDK8_C1EONCMPHALT (1ULL << 28) 1356#define AMDK8_CMPHALT (AMDK8_SMIONCMPHALT | AMDK8_C1EONCMPHALT) 1357 1358static void 1359cpu_probe_amdc1e(void) 1360{ 1361 1362 /* 1363 * Detect the presence of C1E capability mostly on latest 1364 * dual-cores (or future) k8 family. 
1365 */ 1366 if (cpu_vendor_id == CPU_VENDOR_AMD && 1367 (cpu_id & 0x00000f00) == 0x00000f00 && 1368 (cpu_id & 0x0fff0000) >= 0x00040000) { 1369 cpu_ident_amdc1e = 1; 1370 } 1371} 1372 1373#ifdef XEN 1374void (*cpu_idle_fn)(sbintime_t) = cpu_idle_hlt; 1375#else 1376void (*cpu_idle_fn)(sbintime_t) = cpu_idle_acpi; 1377#endif 1378 1379void 1380cpu_idle(int busy) 1381{ 1382#ifndef XEN 1383 uint64_t msr; 1384#endif 1385 sbintime_t sbt = -1; 1386 1387 CTR2(KTR_SPARE2, "cpu_idle(%d) at %d", 1388 busy, curcpu); 1389#if defined(MP_WATCHDOG) && !defined(XEN) 1390 ap_watchdog(PCPU_GET(cpuid)); 1391#endif 1392#ifndef XEN 1393 /* If we are busy - try to use fast methods. */ 1394 if (busy) { 1395 if ((cpu_feature2 & CPUID2_MON) && idle_mwait) { 1396 cpu_idle_mwait(busy); 1397 goto out; 1398 } 1399 } 1400#endif 1401 1402 /* If we have time - switch timers into idle mode. */ 1403 if (!busy) { 1404 critical_enter(); 1405 sbt = cpu_idleclock(); 1406 } 1407 1408#ifndef XEN 1409 /* Apply AMD APIC timer C1E workaround. */ 1410 if (cpu_ident_amdc1e && cpu_disable_deep_sleep) { 1411 msr = rdmsr(MSR_AMDK8_IPM); 1412 if (msr & AMDK8_CMPHALT) 1413 wrmsr(MSR_AMDK8_IPM, msr & ~AMDK8_CMPHALT); 1414 } 1415#endif 1416 1417 /* Call main idle method. */ 1418 cpu_idle_fn(sbt); 1419 1420 /* Switch timers mack into active mode. */ 1421 if (!busy) { 1422 cpu_activeclock(); 1423 critical_exit(); 1424 } 1425#ifndef XEN 1426out: 1427#endif 1428 CTR2(KTR_SPARE2, "cpu_idle(%d) at %d done", 1429 busy, curcpu); 1430} 1431 1432int 1433cpu_idle_wakeup(int cpu) 1434{ 1435 struct pcpu *pcpu; 1436 int *state; 1437 1438 pcpu = pcpu_find(cpu); 1439 state = (int *)pcpu->pc_monitorbuf; 1440 /* 1441 * This doesn't need to be atomic since missing the race will 1442 * simply result in unnecessary IPIs. 1443 */ 1444 if (*state == STATE_SLEEPING) 1445 return (0); 1446 if (*state == STATE_MWAIT) 1447 *state = STATE_RUNNING; 1448 return (1); 1449} 1450 1451/* 1452 * Ordered by speed/power consumption. 
1453 */ 1454struct { 1455 void *id_fn; 1456 char *id_name; 1457} idle_tbl[] = { 1458 { cpu_idle_spin, "spin" }, 1459 { cpu_idle_mwait, "mwait" }, 1460 { cpu_idle_hlt, "hlt" }, 1461 { cpu_idle_acpi, "acpi" }, 1462 { NULL, NULL } 1463}; 1464 1465static int 1466idle_sysctl_available(SYSCTL_HANDLER_ARGS) 1467{ 1468 char *avail, *p; 1469 int error; 1470 int i; 1471 1472 avail = malloc(256, M_TEMP, M_WAITOK); 1473 p = avail; 1474 for (i = 0; idle_tbl[i].id_name != NULL; i++) { 1475 if (strstr(idle_tbl[i].id_name, "mwait") && 1476 (cpu_feature2 & CPUID2_MON) == 0) 1477 continue; 1478 if (strcmp(idle_tbl[i].id_name, "acpi") == 0 && 1479 cpu_idle_hook == NULL) 1480 continue; 1481 p += sprintf(p, "%s%s", p != avail ? ", " : "", 1482 idle_tbl[i].id_name); 1483 } 1484 error = sysctl_handle_string(oidp, avail, 0, req); 1485 free(avail, M_TEMP); 1486 return (error); 1487} 1488 1489SYSCTL_PROC(_machdep, OID_AUTO, idle_available, CTLTYPE_STRING | CTLFLAG_RD, 1490 0, 0, idle_sysctl_available, "A", "list of available idle functions"); 1491 1492static int 1493idle_sysctl(SYSCTL_HANDLER_ARGS) 1494{ 1495 char buf[16]; 1496 int error; 1497 char *p; 1498 int i; 1499 1500 p = "unknown"; 1501 for (i = 0; idle_tbl[i].id_name != NULL; i++) { 1502 if (idle_tbl[i].id_fn == cpu_idle_fn) { 1503 p = idle_tbl[i].id_name; 1504 break; 1505 } 1506 } 1507 strncpy(buf, p, sizeof(buf)); 1508 error = sysctl_handle_string(oidp, buf, sizeof(buf), req); 1509 if (error != 0 || req->newptr == NULL) 1510 return (error); 1511 for (i = 0; idle_tbl[i].id_name != NULL; i++) { 1512 if (strstr(idle_tbl[i].id_name, "mwait") && 1513 (cpu_feature2 & CPUID2_MON) == 0) 1514 continue; 1515 if (strcmp(idle_tbl[i].id_name, "acpi") == 0 && 1516 cpu_idle_hook == NULL) 1517 continue; 1518 if (strcmp(idle_tbl[i].id_name, buf)) 1519 continue; 1520 cpu_idle_fn = idle_tbl[i].id_fn; 1521 return (0); 1522 } 1523 return (EINVAL); 1524} 1525 1526SYSCTL_PROC(_machdep, OID_AUTO, idle, CTLTYPE_STRING | CTLFLAG_RW, 0, 0, 1527 idle_sysctl, 
"A", "currently selected idle function"); 1528 1529/* 1530 * Reset registers to default values on exec. 1531 */ 1532void 1533exec_setregs(struct thread *td, struct image_params *imgp, u_long stack) 1534{ 1535 struct trapframe *regs = td->td_frame; 1536 struct pcb *pcb = td->td_pcb; 1537 1538 /* Reset pc->pcb_gs and %gs before possibly invalidating it. */ 1539 pcb->pcb_gs = _udatasel; 1540 load_gs(_udatasel); 1541 1542 mtx_lock_spin(&dt_lock); 1543 if (td->td_proc->p_md.md_ldt) 1544 user_ldt_free(td); 1545 else 1546 mtx_unlock_spin(&dt_lock); 1547 1548 bzero((char *)regs, sizeof(struct trapframe)); 1549 regs->tf_eip = imgp->entry_addr; 1550 regs->tf_esp = stack; 1551 regs->tf_eflags = PSL_USER | (regs->tf_eflags & PSL_T); 1552 regs->tf_ss = _udatasel; 1553 regs->tf_ds = _udatasel; 1554 regs->tf_es = _udatasel; 1555 regs->tf_fs = _udatasel; 1556 regs->tf_cs = _ucodesel; 1557 1558 /* PS_STRINGS value for BSD/OS binaries. It is 0 for non-BSD/OS. */ 1559 regs->tf_ebx = imgp->ps_strings; 1560 1561 /* 1562 * Reset the hardware debug registers if they were in use. 1563 * They won't have any meaning for the newly exec'd process. 1564 */ 1565 if (pcb->pcb_flags & PCB_DBREGS) { 1566 pcb->pcb_dr0 = 0; 1567 pcb->pcb_dr1 = 0; 1568 pcb->pcb_dr2 = 0; 1569 pcb->pcb_dr3 = 0; 1570 pcb->pcb_dr6 = 0; 1571 pcb->pcb_dr7 = 0; 1572 if (pcb == curpcb) { 1573 /* 1574 * Clear the debug registers on the running 1575 * CPU, otherwise they will end up affecting 1576 * the next process we switch to. 1577 */ 1578 reset_dbregs(); 1579 } 1580 pcb->pcb_flags &= ~PCB_DBREGS; 1581 } 1582 1583 /* 1584 * Initialize the math emulator (if any) for the current process. 1585 * Actually, just clear the bit that says that the emulator has 1586 * been initialized. Initialization is delayed until the process 1587 * traps to the emulator (if it is done at all) mainly because 1588 * emulators don't provide an entry point for initialization. 
1589 */ 1590 td->td_pcb->pcb_flags &= ~FP_SOFTFP; 1591 pcb->pcb_initial_npxcw = __INITIAL_NPXCW__; 1592 1593 /* 1594 * Drop the FP state if we hold it, so that the process gets a 1595 * clean FP state if it uses the FPU again. 1596 */ 1597 fpstate_drop(td); 1598 1599 /* 1600 * XXX - Linux emulator 1601 * Make sure sure edx is 0x0 on entry. Linux binaries depend 1602 * on it. 1603 */ 1604 td->td_retval[1] = 0; 1605} 1606 1607void 1608cpu_setregs(void) 1609{ 1610 unsigned int cr0; 1611 1612 cr0 = rcr0(); 1613 1614 /* 1615 * CR0_MP, CR0_NE and CR0_TS are set for NPX (FPU) support: 1616 * 1617 * Prepare to trap all ESC (i.e., NPX) instructions and all WAIT 1618 * instructions. We must set the CR0_MP bit and use the CR0_TS 1619 * bit to control the trap, because setting the CR0_EM bit does 1620 * not cause WAIT instructions to trap. It's important to trap 1621 * WAIT instructions - otherwise the "wait" variants of no-wait 1622 * control instructions would degenerate to the "no-wait" variants 1623 * after FP context switches but work correctly otherwise. It's 1624 * particularly important to trap WAITs when there is no NPX - 1625 * otherwise the "wait" variants would always degenerate. 1626 * 1627 * Try setting CR0_NE to get correct error reporting on 486DX's. 1628 * Setting it should fail or do nothing on lesser processors. 
1629 */ 1630 cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM; 1631 load_cr0(cr0); 1632 load_gs(_udatasel); 1633} 1634 1635u_long bootdev; /* not a struct cdev *- encoding is different */ 1636SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev, 1637 CTLFLAG_RD, &bootdev, 0, "Maybe the Boot device (not in struct cdev *format)"); 1638 1639/* 1640 * Initialize 386 and configure to run kernel 1641 */ 1642 1643/* 1644 * Initialize segments & interrupt table 1645 */ 1646 1647int _default_ldt; 1648 1649#ifdef XEN 1650union descriptor *gdt; 1651union descriptor *ldt; 1652#else 1653union descriptor gdt[NGDT * MAXCPU]; /* global descriptor table */ 1654union descriptor ldt[NLDT]; /* local descriptor table */ 1655#endif 1656static struct gate_descriptor idt0[NIDT]; 1657struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ 1658struct region_descriptor r_gdt, r_idt; /* table descriptors */ 1659struct mtx dt_lock; /* lock for GDT and LDT */ 1660 1661#if defined(I586_CPU) && !defined(NO_F00F_HACK) 1662extern int has_f00f_bug; 1663#endif 1664 1665static struct i386tss dblfault_tss; 1666static char dblfault_stack[PAGE_SIZE]; 1667 1668extern vm_offset_t proc0kstack; 1669 1670 1671/* 1672 * software prototypes -- in more palatable form. 
1673 * 1674 * GCODE_SEL through GUDATA_SEL must be in this order for syscall/sysret 1675 * GUFS_SEL and GUGS_SEL must be in this order (swtch.s knows it) 1676 */ 1677struct soft_segment_descriptor gdt_segs[] = { 1678/* GNULL_SEL 0 Null Descriptor */ 1679{ .ssd_base = 0x0, 1680 .ssd_limit = 0x0, 1681 .ssd_type = 0, 1682 .ssd_dpl = SEL_KPL, 1683 .ssd_p = 0, 1684 .ssd_xx = 0, .ssd_xx1 = 0, 1685 .ssd_def32 = 0, 1686 .ssd_gran = 0 }, 1687/* GPRIV_SEL 1 SMP Per-Processor Private Data Descriptor */ 1688{ .ssd_base = 0x0, 1689 .ssd_limit = 0xfffff, 1690 .ssd_type = SDT_MEMRWA, 1691 .ssd_dpl = SEL_KPL, 1692 .ssd_p = 1, 1693 .ssd_xx = 0, .ssd_xx1 = 0, 1694 .ssd_def32 = 1, 1695 .ssd_gran = 1 }, 1696/* GUFS_SEL 2 %fs Descriptor for user */ 1697{ .ssd_base = 0x0, 1698 .ssd_limit = 0xfffff, 1699 .ssd_type = SDT_MEMRWA, 1700 .ssd_dpl = SEL_UPL, 1701 .ssd_p = 1, 1702 .ssd_xx = 0, .ssd_xx1 = 0, 1703 .ssd_def32 = 1, 1704 .ssd_gran = 1 }, 1705/* GUGS_SEL 3 %gs Descriptor for user */ 1706{ .ssd_base = 0x0, 1707 .ssd_limit = 0xfffff, 1708 .ssd_type = SDT_MEMRWA, 1709 .ssd_dpl = SEL_UPL, 1710 .ssd_p = 1, 1711 .ssd_xx = 0, .ssd_xx1 = 0, 1712 .ssd_def32 = 1, 1713 .ssd_gran = 1 }, 1714/* GCODE_SEL 4 Code Descriptor for kernel */ 1715{ .ssd_base = 0x0, 1716 .ssd_limit = 0xfffff, 1717 .ssd_type = SDT_MEMERA, 1718 .ssd_dpl = SEL_KPL, 1719 .ssd_p = 1, 1720 .ssd_xx = 0, .ssd_xx1 = 0, 1721 .ssd_def32 = 1, 1722 .ssd_gran = 1 }, 1723/* GDATA_SEL 5 Data Descriptor for kernel */ 1724{ .ssd_base = 0x0, 1725 .ssd_limit = 0xfffff, 1726 .ssd_type = SDT_MEMRWA, 1727 .ssd_dpl = SEL_KPL, 1728 .ssd_p = 1, 1729 .ssd_xx = 0, .ssd_xx1 = 0, 1730 .ssd_def32 = 1, 1731 .ssd_gran = 1 }, 1732/* GUCODE_SEL 6 Code Descriptor for user */ 1733{ .ssd_base = 0x0, 1734 .ssd_limit = 0xfffff, 1735 .ssd_type = SDT_MEMERA, 1736 .ssd_dpl = SEL_UPL, 1737 .ssd_p = 1, 1738 .ssd_xx = 0, .ssd_xx1 = 0, 1739 .ssd_def32 = 1, 1740 .ssd_gran = 1 }, 1741/* GUDATA_SEL 7 Data Descriptor for user */ 1742{ .ssd_base = 0x0, 1743 .ssd_limit = 
0xfffff, 1744 .ssd_type = SDT_MEMRWA, 1745 .ssd_dpl = SEL_UPL, 1746 .ssd_p = 1, 1747 .ssd_xx = 0, .ssd_xx1 = 0, 1748 .ssd_def32 = 1, 1749 .ssd_gran = 1 }, 1750/* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */ 1751{ .ssd_base = 0x400, 1752 .ssd_limit = 0xfffff, 1753 .ssd_type = SDT_MEMRWA, 1754 .ssd_dpl = SEL_KPL, 1755 .ssd_p = 1, 1756 .ssd_xx = 0, .ssd_xx1 = 0, 1757 .ssd_def32 = 1, 1758 .ssd_gran = 1 }, 1759#ifndef XEN 1760/* GPROC0_SEL 9 Proc 0 Tss Descriptor */ 1761{ 1762 .ssd_base = 0x0, 1763 .ssd_limit = sizeof(struct i386tss)-1, 1764 .ssd_type = SDT_SYS386TSS, 1765 .ssd_dpl = 0, 1766 .ssd_p = 1, 1767 .ssd_xx = 0, .ssd_xx1 = 0, 1768 .ssd_def32 = 0, 1769 .ssd_gran = 0 }, 1770/* GLDT_SEL 10 LDT Descriptor */ 1771{ .ssd_base = (int) ldt, 1772 .ssd_limit = sizeof(ldt)-1, 1773 .ssd_type = SDT_SYSLDT, 1774 .ssd_dpl = SEL_UPL, 1775 .ssd_p = 1, 1776 .ssd_xx = 0, .ssd_xx1 = 0, 1777 .ssd_def32 = 0, 1778 .ssd_gran = 0 }, 1779/* GUSERLDT_SEL 11 User LDT Descriptor per process */ 1780{ .ssd_base = (int) ldt, 1781 .ssd_limit = (512 * sizeof(union descriptor)-1), 1782 .ssd_type = SDT_SYSLDT, 1783 .ssd_dpl = 0, 1784 .ssd_p = 1, 1785 .ssd_xx = 0, .ssd_xx1 = 0, 1786 .ssd_def32 = 0, 1787 .ssd_gran = 0 }, 1788/* GPANIC_SEL 12 Panic Tss Descriptor */ 1789{ .ssd_base = (int) &dblfault_tss, 1790 .ssd_limit = sizeof(struct i386tss)-1, 1791 .ssd_type = SDT_SYS386TSS, 1792 .ssd_dpl = 0, 1793 .ssd_p = 1, 1794 .ssd_xx = 0, .ssd_xx1 = 0, 1795 .ssd_def32 = 0, 1796 .ssd_gran = 0 }, 1797/* GBIOSCODE32_SEL 13 BIOS 32-bit interface (32bit Code) */ 1798{ .ssd_base = 0, 1799 .ssd_limit = 0xfffff, 1800 .ssd_type = SDT_MEMERA, 1801 .ssd_dpl = 0, 1802 .ssd_p = 1, 1803 .ssd_xx = 0, .ssd_xx1 = 0, 1804 .ssd_def32 = 0, 1805 .ssd_gran = 1 }, 1806/* GBIOSCODE16_SEL 14 BIOS 32-bit interface (16bit Code) */ 1807{ .ssd_base = 0, 1808 .ssd_limit = 0xfffff, 1809 .ssd_type = SDT_MEMERA, 1810 .ssd_dpl = 0, 1811 .ssd_p = 1, 1812 .ssd_xx = 0, .ssd_xx1 = 0, 1813 .ssd_def32 = 0, 1814 
.ssd_gran = 1 }, 1815/* GBIOSDATA_SEL 15 BIOS 32-bit interface (Data) */ 1816{ .ssd_base = 0, 1817 .ssd_limit = 0xfffff, 1818 .ssd_type = SDT_MEMRWA, 1819 .ssd_dpl = 0, 1820 .ssd_p = 1, 1821 .ssd_xx = 0, .ssd_xx1 = 0, 1822 .ssd_def32 = 1, 1823 .ssd_gran = 1 }, 1824/* GBIOSUTIL_SEL 16 BIOS 16-bit interface (Utility) */ 1825{ .ssd_base = 0, 1826 .ssd_limit = 0xfffff, 1827 .ssd_type = SDT_MEMRWA, 1828 .ssd_dpl = 0, 1829 .ssd_p = 1, 1830 .ssd_xx = 0, .ssd_xx1 = 0, 1831 .ssd_def32 = 0, 1832 .ssd_gran = 1 }, 1833/* GBIOSARGS_SEL 17 BIOS 16-bit interface (Arguments) */ 1834{ .ssd_base = 0, 1835 .ssd_limit = 0xfffff, 1836 .ssd_type = SDT_MEMRWA, 1837 .ssd_dpl = 0, 1838 .ssd_p = 1, 1839 .ssd_xx = 0, .ssd_xx1 = 0, 1840 .ssd_def32 = 0, 1841 .ssd_gran = 1 }, 1842/* GNDIS_SEL 18 NDIS Descriptor */ 1843{ .ssd_base = 0x0, 1844 .ssd_limit = 0x0, 1845 .ssd_type = 0, 1846 .ssd_dpl = 0, 1847 .ssd_p = 0, 1848 .ssd_xx = 0, .ssd_xx1 = 0, 1849 .ssd_def32 = 0, 1850 .ssd_gran = 0 }, 1851#endif /* !XEN */ 1852}; 1853 1854static struct soft_segment_descriptor ldt_segs[] = { 1855 /* Null Descriptor - overwritten by call gate */ 1856{ .ssd_base = 0x0, 1857 .ssd_limit = 0x0, 1858 .ssd_type = 0, 1859 .ssd_dpl = 0, 1860 .ssd_p = 0, 1861 .ssd_xx = 0, .ssd_xx1 = 0, 1862 .ssd_def32 = 0, 1863 .ssd_gran = 0 }, 1864 /* Null Descriptor - overwritten by call gate */ 1865{ .ssd_base = 0x0, 1866 .ssd_limit = 0x0, 1867 .ssd_type = 0, 1868 .ssd_dpl = 0, 1869 .ssd_p = 0, 1870 .ssd_xx = 0, .ssd_xx1 = 0, 1871 .ssd_def32 = 0, 1872 .ssd_gran = 0 }, 1873 /* Null Descriptor - overwritten by call gate */ 1874{ .ssd_base = 0x0, 1875 .ssd_limit = 0x0, 1876 .ssd_type = 0, 1877 .ssd_dpl = 0, 1878 .ssd_p = 0, 1879 .ssd_xx = 0, .ssd_xx1 = 0, 1880 .ssd_def32 = 0, 1881 .ssd_gran = 0 }, 1882 /* Code Descriptor for user */ 1883{ .ssd_base = 0x0, 1884 .ssd_limit = 0xfffff, 1885 .ssd_type = SDT_MEMERA, 1886 .ssd_dpl = SEL_UPL, 1887 .ssd_p = 1, 1888 .ssd_xx = 0, .ssd_xx1 = 0, 1889 .ssd_def32 = 1, 1890 .ssd_gran = 1 }, 1891 /* 
Null Descriptor - overwritten by call gate */ 1892{ .ssd_base = 0x0, 1893 .ssd_limit = 0x0, 1894 .ssd_type = 0, 1895 .ssd_dpl = 0, 1896 .ssd_p = 0, 1897 .ssd_xx = 0, .ssd_xx1 = 0, 1898 .ssd_def32 = 0, 1899 .ssd_gran = 0 }, 1900 /* Data Descriptor for user */ 1901{ .ssd_base = 0x0, 1902 .ssd_limit = 0xfffff, 1903 .ssd_type = SDT_MEMRWA, 1904 .ssd_dpl = SEL_UPL, 1905 .ssd_p = 1, 1906 .ssd_xx = 0, .ssd_xx1 = 0, 1907 .ssd_def32 = 1, 1908 .ssd_gran = 1 }, 1909}; 1910 1911void 1912setidt(idx, func, typ, dpl, selec) 1913 int idx; 1914 inthand_t *func; 1915 int typ; 1916 int dpl; 1917 int selec; 1918{ 1919 struct gate_descriptor *ip; 1920 1921 ip = idt + idx; 1922 ip->gd_looffset = (int)func; 1923 ip->gd_selector = selec; 1924 ip->gd_stkcpy = 0; 1925 ip->gd_xx = 0; 1926 ip->gd_type = typ; 1927 ip->gd_dpl = dpl; 1928 ip->gd_p = 1; 1929 ip->gd_hioffset = ((int)func)>>16 ; 1930} 1931 1932extern inthand_t 1933 IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), 1934 IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), 1935 IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), 1936 IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), 1937 IDTVEC(xmm), 1938#ifdef KDTRACE_HOOKS 1939 IDTVEC(dtrace_ret), 1940#endif 1941#ifdef XENHVM 1942 IDTVEC(xen_intr_upcall), 1943#endif 1944 IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall); 1945 1946#ifdef DDB 1947/* 1948 * Display the index and function name of any IDT entries that don't use 1949 * the default 'rsvd' entry point. 1950 */ 1951DB_SHOW_COMMAND(idt, db_show_idt) 1952{ 1953 struct gate_descriptor *ip; 1954 int idx; 1955 uintptr_t func; 1956 1957 ip = idt; 1958 for (idx = 0; idx < NIDT && !db_pager_quit; idx++) { 1959 func = (ip->gd_hioffset << 16 | ip->gd_looffset); 1960 if (func != (uintptr_t)&IDTVEC(rsvd)) { 1961 db_printf("%3d\t", idx); 1962 db_printsym(func, DB_STGY_PROC); 1963 db_printf("\n"); 1964 } 1965 ip++; 1966 } 1967} 1968 1969/* Show privileged registers. 
*/ 1970DB_SHOW_COMMAND(sysregs, db_show_sysregs) 1971{ 1972 uint64_t idtr, gdtr; 1973 1974 idtr = ridt(); 1975 db_printf("idtr\t0x%08x/%04x\n", 1976 (u_int)(idtr >> 16), (u_int)idtr & 0xffff); 1977 gdtr = rgdt(); 1978 db_printf("gdtr\t0x%08x/%04x\n", 1979 (u_int)(gdtr >> 16), (u_int)gdtr & 0xffff); 1980 db_printf("ldtr\t0x%04x\n", rldt()); 1981 db_printf("tr\t0x%04x\n", rtr()); 1982 db_printf("cr0\t0x%08x\n", rcr0()); 1983 db_printf("cr2\t0x%08x\n", rcr2()); 1984 db_printf("cr3\t0x%08x\n", rcr3()); 1985 db_printf("cr4\t0x%08x\n", rcr4()); 1986} 1987#endif 1988 1989void 1990sdtossd(sd, ssd) 1991 struct segment_descriptor *sd; 1992 struct soft_segment_descriptor *ssd; 1993{ 1994 ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; 1995 ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; 1996 ssd->ssd_type = sd->sd_type; 1997 ssd->ssd_dpl = sd->sd_dpl; 1998 ssd->ssd_p = sd->sd_p; 1999 ssd->ssd_def32 = sd->sd_def32; 2000 ssd->ssd_gran = sd->sd_gran; 2001} 2002 2003#ifndef XEN 2004static int 2005add_smap_entry(struct bios_smap *smap, vm_paddr_t *physmap, int *physmap_idxp) 2006{ 2007 int i, insert_idx, physmap_idx; 2008 2009 physmap_idx = *physmap_idxp; 2010 2011 if (boothowto & RB_VERBOSE) 2012 printf("SMAP type=%02x base=%016llx len=%016llx\n", 2013 smap->type, smap->base, smap->length); 2014 2015 if (smap->type != SMAP_TYPE_MEMORY) 2016 return (1); 2017 2018 if (smap->length == 0) 2019 return (1); 2020 2021#ifndef PAE 2022 if (smap->base > 0xffffffff) { 2023 printf("%uK of memory above 4GB ignored\n", 2024 (u_int)(smap->length / 1024)); 2025 return (1); 2026 } 2027#endif 2028 2029 /* 2030 * Find insertion point while checking for overlap. Start off by 2031 * assuming the new entry will be added to the end. 
2032 */ 2033 insert_idx = physmap_idx + 2; 2034 for (i = 0; i <= physmap_idx; i += 2) { 2035 if (smap->base < physmap[i + 1]) { 2036 if (smap->base + smap->length <= physmap[i]) { 2037 insert_idx = i; 2038 break; 2039 } 2040 if (boothowto & RB_VERBOSE) 2041 printf( 2042 "Overlapping memory regions, ignoring second region\n"); 2043 return (1); 2044 } 2045 } 2046 2047 /* See if we can prepend to the next entry. */ 2048 if (insert_idx <= physmap_idx && 2049 smap->base + smap->length == physmap[insert_idx]) { 2050 physmap[insert_idx] = smap->base; 2051 return (1); 2052 } 2053 2054 /* See if we can append to the previous entry. */ 2055 if (insert_idx > 0 && smap->base == physmap[insert_idx - 1]) { 2056 physmap[insert_idx - 1] += smap->length; 2057 return (1); 2058 } 2059 2060 physmap_idx += 2; 2061 *physmap_idxp = physmap_idx; 2062 if (physmap_idx == PHYSMAP_SIZE) { 2063 printf( 2064 "Too many segments in the physical address map, giving up\n"); 2065 return (0); 2066 } 2067 2068 /* 2069 * Move the last 'N' entries down to make room for the new 2070 * entry if needed. 2071 */ 2072 for (i = physmap_idx; i > insert_idx; i -= 2) { 2073 physmap[i] = physmap[i - 2]; 2074 physmap[i + 1] = physmap[i - 1]; 2075 } 2076 2077 /* Insert the new entry. */ 2078 physmap[insert_idx] = smap->base; 2079 physmap[insert_idx + 1] = smap->base + smap->length; 2080 return (1); 2081} 2082 2083static void 2084basemem_setup(void) 2085{ 2086 vm_paddr_t pa; 2087 pt_entry_t *pte; 2088 int i; 2089 2090 if (basemem > 640) { 2091 printf("Preposterous BIOS basemem of %uK, truncating to 640K\n", 2092 basemem); 2093 basemem = 640; 2094 } 2095 2096 /* 2097 * XXX if biosbasemem is now < 640, there is a `hole' 2098 * between the end of base memory and the start of 2099 * ISA memory. The hole may be empty or it may 2100 * contain BIOS code or data. Map it read/write so 2101 * that the BIOS can write to it. 
(Memory from 0 to 2102 * the physical end of the kernel is mapped read-only 2103 * to begin with and then parts of it are remapped. 2104 * The parts that aren't remapped form holes that 2105 * remain read-only and are unused by the kernel. 2106 * The base memory area is below the physical end of 2107 * the kernel and right now forms a read-only hole. 2108 * The part of it from PAGE_SIZE to 2109 * (trunc_page(biosbasemem * 1024) - 1) will be 2110 * remapped and used by the kernel later.) 2111 * 2112 * This code is similar to the code used in 2113 * pmap_mapdev, but since no memory needs to be 2114 * allocated we simply change the mapping. 2115 */ 2116 for (pa = trunc_page(basemem * 1024); 2117 pa < ISA_HOLE_START; pa += PAGE_SIZE) 2118 pmap_kenter(KERNBASE + pa, pa); 2119 2120 /* 2121 * Map pages between basemem and ISA_HOLE_START, if any, r/w into 2122 * the vm86 page table so that vm86 can scribble on them using 2123 * the vm86 map too. XXX: why 2 ways for this and only 1 way for 2124 * page 0, at least as initialized here? 2125 */ 2126 pte = (pt_entry_t *)vm86paddr; 2127 for (i = basemem / 4; i < 160; i++) 2128 pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U; 2129} 2130#endif 2131 2132/* 2133 * Populate the (physmap) array with base/bound pairs describing the 2134 * available physical memory in the system, then test this memory and 2135 * build the phys_avail array describing the actually-available memory. 2136 * 2137 * If we cannot accurately determine the physical memory map, then use 2138 * value from the 0xE801 call, and failing that, the RTC. 2139 * 2140 * Total memory size may be set by the kernel environment variable 2141 * hw.physmem or the compile-time define MAXMEM. 2142 * 2143 * XXX first should be vm_paddr_t. 
 */
static void
getmemsize(int first)
{
	int has_smap, off, physmap_idx, pa_indx, da_indx;
	u_long physmem_tunable, memtest;
	/*
	 * physmap[] holds (start, end) byte-address pairs describing usable
	 * physical memory; physmap_idx indexes the start of the last pair.
	 */
	vm_paddr_t physmap[PHYSMAP_SIZE];
	pt_entry_t *pte;
	quad_t dcons_addr, dcons_size;
#ifndef XEN
	int hasbrokenint12, i, res;
	u_int extmem;
	struct vm86frame vmf;
	struct vm86context vmc;
	vm_paddr_t pa;
	struct bios_smap *smap, *smapbase, *smapend;
	u_int32_t smapsize;
	caddr_t kmdp;
#endif

	/* Set once any BIOS SMAP (INT 15:E820) entry has been accepted. */
	has_smap = 0;
#if defined(XEN)
	/* Under Xen the hypervisor tells us the page count directly. */
	Maxmem = xen_start_info->nr_pages - init_first;
	physmem = Maxmem;
	basemem = 0;
	physmap[0] = init_first << PAGE_SHIFT;
	physmap[1] = ptoa(Maxmem) - round_page(msgbufsize);
	physmap_idx = 0;
#else
#ifdef XBOX
	if (arch_i386_is_xbox) {
		/*
		 * We queried the memory size before, so chop off 4MB for
		 * the framebuffer and inform the OS of this.
		 */
		physmap[0] = 0;
		physmap[1] = (arch_i386_xbox_memsize * 1024 * 1024) - XBOX_FB_SIZE;
		physmap_idx = 0;
		goto physmap_done;
	}
#endif
	bzero(&vmf, sizeof(vmf));
	bzero(physmap, sizeof(physmap));
	basemem = 0;

	/*
	 * Check if the loader supplied an SMAP memory map.  If so,
	 * use that and do not make any VM86 calls.
	 */
	physmap_idx = 0;
	smapbase = NULL;
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf32 kernel");
	if (kmdp != NULL)
		smapbase = (struct bios_smap *)preload_search_info(kmdp,
		    MODINFO_METADATA | MODINFOMD_SMAP);
	if (smapbase != NULL) {
		/*
		 * subr_module.c says:
		 * "Consumer may safely assume that size value precedes data."
		 * ie: an int32_t immediately precedes SMAP.
		 */
		smapsize = *((u_int32_t *)smapbase - 1);
		smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
		has_smap = 1;

		for (smap = smapbase; smap < smapend; smap++)
			if (!add_smap_entry(smap, physmap, &physmap_idx))
				break;
		goto have_smap;
	}

	/*
	 * Some newer BIOSes have a broken INT 12H implementation
	 * which causes a kernel panic immediately.  In this case, we
	 * need use the SMAP to determine the base memory size.
	 */
	hasbrokenint12 = 0;
	TUNABLE_INT_FETCH("hw.hasbrokenint12", &hasbrokenint12);
	if (hasbrokenint12 == 0) {
		/* Use INT12 to determine base memory size. */
		vm86_intcall(0x12, &vmf);
		basemem = vmf.vmf_ax;
		basemem_setup();
	}

	/*
	 * Fetch the memory map with INT 15:E820.  Map page 1 R/W into
	 * the kernel page table so we can use it as a buffer.  The
	 * kernel will unmap this page later.
	 */
	pmap_kenter(KERNBASE + (1 << PAGE_SHIFT), 1 << PAGE_SHIFT);
	vmc.npages = 0;
	smap = (void *)vm86_addpage(&vmc, 1, KERNBASE + (1 << PAGE_SHIFT));
	res = vm86_getptr(&vmc, (vm_offset_t)smap, &vmf.vmf_es, &vmf.vmf_di);
	KASSERT(res != 0, ("vm86_getptr() failed: address not found"));

	/* %ebx is the E820 continuation value; zero starts/ends the scan. */
	vmf.vmf_ebx = 0;
	do {
		vmf.vmf_eax = 0xE820;
		vmf.vmf_edx = SMAP_SIG;
		vmf.vmf_ecx = sizeof(struct bios_smap);
		i = vm86_datacall(0x15, &vmf, &vmc);
		if (i || vmf.vmf_eax != SMAP_SIG)
			break;
		has_smap = 1;
		if (!add_smap_entry(smap, physmap, &physmap_idx))
			break;
	} while (vmf.vmf_ebx != 0);

have_smap:
	/*
	 * If we didn't fetch the "base memory" size from INT12,
	 * figure it out from the SMAP (or just guess).
	 */
	if (basemem == 0) {
		for (i = 0; i <= physmap_idx; i += 2) {
			if (physmap[i] == 0x00000000) {
				basemem = physmap[i + 1] / 1024;
				break;
			}
		}

		/* XXX: If we couldn't find basemem from SMAP, just guess. */
		if (basemem == 0)
			basemem = 640;
		basemem_setup();
	}

	if (physmap[1] != 0)
		goto physmap_done;

	/*
	 * If we failed to find an SMAP, figure out the extended
	 * memory size.  We will then build a simple memory map with
	 * two segments, one for "base memory" and the second for
	 * "extended memory".  Note that "extended memory" starts at a
	 * physical address of 1MB and that both basemem and extmem
	 * are in units of 1KB.
	 *
	 * First, try to fetch the extended memory size via INT 15:E801.
	 */
	vmf.vmf_ax = 0xE801;
	if (vm86_intcall(0x15, &vmf) == 0) {
		extmem = vmf.vmf_cx + vmf.vmf_dx * 64;
	} else {
		/*
		 * If INT15:E801 fails, this is our last ditch effort
		 * to determine the extended memory size.  Currently
		 * we prefer the RTC value over INT15:88.
		 */
#if 0
		vmf.vmf_ah = 0x88;
		vm86_intcall(0x15, &vmf);
		extmem = vmf.vmf_ax;
#else
		extmem = rtcin(RTC_EXTLO) + (rtcin(RTC_EXTHI) << 8);
#endif
	}

	/*
	 * Special hack for chipsets that still remap the 384k hole when
	 * there's 16MB of memory - this really confuses people that
	 * are trying to use bus mastering ISA controllers with the
	 * "16MB limit"; they only have 16MB, but the remapping puts
	 * them beyond the limit.
	 *
	 * If extended memory is between 15-16MB (16-17MB phys address range),
	 * chop it to 15MB.
	 */
	if ((extmem > 15 * 1024) && (extmem < 16 * 1024))
		extmem = 15 * 1024;

	physmap[0] = 0;
	physmap[1] = basemem * 1024;
	physmap_idx = 2;
	physmap[physmap_idx] = 0x100000;
	physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024;

physmap_done:
#endif
	/*
	 * Now, physmap contains a map of physical memory.
	 */

#ifdef SMP
	/* make hole for AP bootstrap code */
	physmap[1] = mp_bootaddress(physmap[1]);
#endif

	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".  We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
	Maxmem = MAXMEM / 4;
#endif

	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
		Maxmem = atop(physmem_tunable);

	/*
	 * If we have an SMAP, don't allow MAXMEM or hw.physmem to extend
	 * the amount of memory in the system.
	 */
	if (has_smap && Maxmem > atop(physmap[physmap_idx + 1]))
		Maxmem = atop(physmap[physmap_idx + 1]);

	/*
	 * By default enable the memory test on real hardware, and disable
	 * it if we appear to be running in a VM.  This avoids touching all
	 * pages unnecessarily, which doesn't matter on real hardware but is
	 * bad for shared VM hosts.  Use a general name so that
	 * one could eventually do more with the code than just disable it.
	 */
	memtest = (vm_guest > VM_GUEST_NO) ? 0 : 1;
	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE))
		printf("Physical memory use set to %ldK\n", Maxmem * 4);

	/*
	 * If Maxmem has been increased beyond what the system has detected,
	 * extend the last memory segment to the new limit.
	 */
	if (atop(physmap[physmap_idx + 1]) < Maxmem)
		physmap[physmap_idx + 1] = ptoa((vm_paddr_t)Maxmem);

	/* call pmap initialization to make new kernel address space */
	pmap_bootstrap(first);

	/*
	 * Size up each available chunk of physical memory.
	 */
	physmap[0] = PAGE_SIZE;		/* mask off page 0 */
	pa_indx = 0;
	da_indx = 1;
	phys_avail[pa_indx++] = physmap[0];
	phys_avail[pa_indx] = physmap[0];
	dump_avail[da_indx] = physmap[0];
	/* CMAP1/CADDR1: scratch PTE and VA used below to probe each page. */
	pte = CMAP1;

	/*
	 * Get dcons buffer address
	 */
	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    getenv_quad("dcons.size", &dcons_size) == 0)
		dcons_addr = 0;

#ifndef XEN
	/*
	 * physmap is in bytes, so when converting to page boundaries,
	 * round up the start address and round down the end address.
	 */
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t end;

		end = ptoa((vm_paddr_t)Maxmem);
		if (physmap[i + 1] < end)
			end = trunc_page(physmap[i + 1]);
		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
			int tmp, page_bad, full;
			int *ptr = (int *)CADDR1;

			full = FALSE;
			/*
			 * block out kernel memory as not available.
			 */
			if (pa >= KERNLOAD && pa < first)
				goto do_dump_avail;

			/*
			 * block out dcons buffer
			 */
			if (dcons_addr > 0
			    && pa >= trunc_page(dcons_addr)
			    && pa < dcons_addr + dcons_size)
				goto do_dump_avail;

			page_bad = FALSE;
			if (memtest == 0)
				goto skip_memtest;

			/*
			 * map page into kernel: valid, read/write,non-cacheable
			 */
			*pte = pa | PG_V | PG_RW | PG_N;
			invltlb();

			tmp = *(int *)ptr;
			/*
			 * Test for alternating 1's and 0's
			 */
			*(volatile int *)ptr = 0xaaaaaaaa;
			if (*(volatile int *)ptr != 0xaaaaaaaa)
				page_bad = TRUE;
			/*
			 * Test for alternating 0's and 1's
			 */
			*(volatile int *)ptr = 0x55555555;
			if (*(volatile int *)ptr != 0x55555555)
				page_bad = TRUE;
			/*
			 * Test for all 1's
			 */
			*(volatile int *)ptr = 0xffffffff;
			if (*(volatile int *)ptr != 0xffffffff)
				page_bad = TRUE;
			/*
			 * Test for all 0's
			 */
			*(volatile int *)ptr = 0x0;
			if (*(volatile int *)ptr != 0x0)
				page_bad = TRUE;
			/*
			 * Restore original value.
			 */
			*(int *)ptr = tmp;

skip_memtest:
			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == TRUE)
				continue;
			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer. Otherwise start a new chunk.
			 * Note that "end" points one higher than end,
			 * making the range >= start and < end.
			 * If we're also doing a speculative memory
			 * test and we at or past the end, bump up Maxmem
			 * so that we keep going. The first bad page
			 * will terminate the loop.
			 */
			if (phys_avail[pa_indx] == pa) {
				phys_avail[pa_indx] += PAGE_SIZE;
			} else {
				pa_indx++;
				if (pa_indx == PHYS_AVAIL_ARRAY_END) {
					printf(
		"Too many holes in the physical address space, giving up\n");
					pa_indx--;
					full = TRUE;
					goto do_dump_avail;
				}
				phys_avail[pa_indx++] = pa;	/* start */
				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
			}
			physmem++;
do_dump_avail:
			if (dump_avail[da_indx] == pa) {
				dump_avail[da_indx] += PAGE_SIZE;
			} else {
				da_indx++;
				if (da_indx == DUMP_AVAIL_ARRAY_END) {
					da_indx--;
					goto do_next;
				}
				dump_avail[da_indx++] = pa;	/* start */
				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
			}
do_next:
			if (full)
				break;
		}
	}
	*pte = 0;
	invltlb();
#else
	phys_avail[0] = physfree;
	phys_avail[1] = xen_start_info->nr_pages*PAGE_SIZE;
	dump_avail[0] = 0;
	dump_avail[1] = xen_start_info->nr_pages*PAGE_SIZE;

#endif

	/*
	 * XXX
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}

	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= round_page(msgbufsize);

	/* Map the message buffer. */
	for (off = 0; off < round_page(msgbufsize); off += PAGE_SIZE)
		pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] +
		    off);

	PT_UPDATES_FLUSH();
}

#ifdef XEN
#define MTOPSIZE (1<<(14 + PAGE_SHIFT))

/*
 * Machine-dependent startup for the Xen i386 port.  'first' is the first
 * free physical address after the loaded kernel (as used below to carve
 * out the dynamic per-CPU area and passed on to getmemsize()).
 */
void
init386(first)
	int first;
{
	unsigned long gdtmachpfn;
	int error, gsel_tss, metadata_missing, x, pa;
	size_t kstack0_sz;
	struct pcpu *pc;
	struct callback_register event = {
		.type = CALLBACKTYPE_event,
		.address = {GSEL(GCODE_SEL, SEL_KPL), (unsigned long)Xhypervisor_callback },
	};
	struct callback_register failsafe = {
		.type = CALLBACKTYPE_failsafe,
		.address = {GSEL(GCODE_SEL, SEL_KPL), (unsigned long)failsafe_callback },
	};

	thread0.td_kstack = proc0kstack;
	thread0.td_kstack_pages = KSTACK_PAGES;
	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
	thread0.td_pcb = (struct pcb *)(thread0.td_kstack + kstack0_sz) - 1;

	/*
	 * This may be done better later if it gets more high level
	 * components in it. If so just link td->td_proc here.
 */
	proc_linkup0(&proc0, &thread0);

	metadata_missing = 0;
	if (xen_start_info->mod_start) {
		preload_metadata = (caddr_t)xen_start_info->mod_start;
		preload_bootstrap_relocate(KERNBASE);
	} else {
		metadata_missing = 1;
	}
	/* Pick the kernel environment: compiled-in or from the Xen cmdline. */
	if (envmode == 1)
		kern_envp = static_env;
	else if ((caddr_t)xen_start_info->cmd_line)
		kern_envp = xen_setbootenv((caddr_t)xen_start_info->cmd_line);

	boothowto |= xen_boothowto(kern_envp);

	/* Init basic tunables, hz etc */
	init_param1();

	/*
	 * XEN occupies a portion of the upper virtual address space
	 * At its base it manages an array mapping machine page frames
	 * to physical page frames - hence we need to be able to
	 * access 4GB - (64MB  - 4MB + 64k)
	 */
	gdt_segs[GPRIV_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
	gdt_segs[GUFS_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
	gdt_segs[GUGS_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
	gdt_segs[GCODE_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
	gdt_segs[GDATA_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
	gdt_segs[GUCODE_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
	gdt_segs[GUDATA_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
	gdt_segs[GBIOSLOWMEM_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);

	pc = &__pcpu[0];
	gdt_segs[GPRIV_SEL].ssd_base = (int) pc;
	gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss;

	/*
	 * Build the GDT in a writable page, then hand it to the hypervisor
	 * read-only (Xen requires the GDT frame to be non-writable).
	 */
	PT_SET_MA(gdt, xpmap_ptom(VTOP(gdt)) | PG_V | PG_RW);
	bzero(gdt, PAGE_SIZE);
	for (x = 0; x < NGDT; x++)
		ssdtosd(&gdt_segs[x], &gdt[x].sd);

	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN);

	gdtmachpfn = vtomach(gdt) >> PAGE_SHIFT;
	PT_SET_MA(gdt, xpmap_ptom(VTOP(gdt)) | PG_V);
	PANIC_IF(HYPERVISOR_set_gdt(&gdtmachpfn, 512) != 0);
	lgdt(&r_gdt);
	gdtset = 1;

	if ((error = HYPERVISOR_set_trap_table(trap_table)) != 0) {
		panic("set_trap_table failed - error %d\n", error);
	}

	/* Register the event and failsafe upcalls with the hypervisor. */
	error = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
	if (error == 0)
		error = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
#if	CONFIG_XEN_COMPAT <= 0x030002
	/* Fall back to the pre-3.0.2 callback registration interface. */
	if (error == -ENOXENSYS)
		HYPERVISOR_set_callbacks(GSEL(GCODE_SEL, SEL_KPL),
		    (unsigned long)Xhypervisor_callback,
		    GSEL(GCODE_SEL, SEL_KPL), (unsigned long)failsafe_callback);
#endif
	/*
	 * Wire the BSP's per-CPU area: map the pages backing the dynamic
	 * per-CPU (DPCPU) region and advance 'first' past it.
	 */
	pcpu_init(pc, 0, sizeof(struct pcpu));
	for (pa = first; pa < first + DPCPU_SIZE; pa += PAGE_SIZE)
		pmap_kenter(pa + KERNBASE, pa);
	dpcpu_init((void *)(first + KERNBASE), 0);
	first += DPCPU_SIZE;
	physfree += DPCPU_SIZE;
	init_first += DPCPU_SIZE / PAGE_SIZE;

	PCPU_SET(prvspace, pc);
	PCPU_SET(curthread, &thread0);
	PCPU_SET(curpcb, thread0.td_pcb);

	/*
	 * Initialize mutexes.
	 *
	 * icu_lock: in order to allow an interrupt to occur in a critical
	 *	     section, to set pcpu->ipending (etc...) properly, we
	 *	     must be able to get the icu lock, so it can't be
	 *	     under witness.
	 */
	mutex_init();
	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE);

	/* make ldt memory segments */
	PT_SET_MA(ldt, xpmap_ptom(VTOP(ldt)) | PG_V | PG_RW);
	bzero(ldt, PAGE_SIZE);
	ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1);
	ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1);
	for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++)
		ssdtosd(&ldt_segs[x], &ldt[x].sd);

	default_proc_ldt.ldt_base = (caddr_t)ldt;
	default_proc_ldt.ldt_len = 6;
	_default_ldt = (int)&default_proc_ldt;
	PCPU_SET(currentldt, _default_ldt);
	/* Like the GDT, the LDT page must be read-only before Xen uses it. */
	PT_SET_MA(ldt, *vtopte((unsigned long)ldt) & ~PG_RW);
	xen_set_ldt((unsigned long) ldt, (sizeof ldt_segs / sizeof ldt_segs[0]));

#if defined(XEN_PRIVILEGED)
	/*
	 * Initialize the i8254 before the console so that console
	 * initialization can use DELAY().
	 */
	i8254_init();
#endif

	/*
	 * Initialize the console before we print anything out.
	 */
	cninit();

	if (metadata_missing)
		printf("WARNING: loader(8) metadata is missing!\n");

#ifdef DEV_ISA
#ifdef DEV_ATPIC
	elcr_probe();
	atpic_startup();
#else
	/* Reset and mask the atpics and leave them shut down. */
	atpic_reset();

	/*
	 * Point the ICU spurious interrupt vectors at the APIC spurious
	 * interrupt handler.
	 */
	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
#endif
#endif

#ifdef DDB
	ksym_start = bootinfo.bi_symtab;
	ksym_end = bootinfo.bi_esymtab;
#endif

	kdb_init();

#ifdef KDB
	if (boothowto & RB_KDB)
		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif

	finishidentcpu();	/* Final stage of CPU initialization */
	setidt(IDT_UD, &IDTVEC(ill),  SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_GP, &IDTVEC(prot),  SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	initializecpu();	/* Initialize CPU registers */

	/* make an initial tss so cpu can get interrupt stack on syscall! */
	/* Note: -16 is so we can grow the trapframe if we came from vm86 */
	PCPU_SET(common_tss.tss_esp0, thread0.td_kstack +
	    kstack0_sz - sizeof(struct pcb) - 16);
	PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	/* Under Xen the kernel stack is registered via a hypercall. */
	HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL),
	    PCPU_GET(common_tss.tss_esp0));

	/* pointer to selector slot for %fs/%gs */
	PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd);

	/* Pre-built execution context for the double fault handler. */
	dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 =
	    dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)];
	dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 =
	    dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL);
#ifdef PAE
	dblfault_tss.tss_cr3 = (int)IdlePDPT;
#else
	dblfault_tss.tss_cr3 = (int)IdlePTD;
#endif
	dblfault_tss.tss_eip = (int)dblfault_handler;
	dblfault_tss.tss_eflags = PSL_KERNEL;
	dblfault_tss.tss_ds = dblfault_tss.tss_es =
	    dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL);
	dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL);
	dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL);
	dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL);

	vm86_initialize();
	getmemsize(first);
	init_param2(physmem);

	/* now running on new page tables, configured,and u/iom is accessible */

	msgbufinit(msgbufp, msgbufsize);
	/* transfer to user mode */

	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);

	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0;
#ifdef PAE
	thread0.td_pcb->pcb_cr3 = (int)IdlePDPT;
#else
	thread0.td_pcb->pcb_cr3 = (int)IdlePTD;
#endif
	thread0.td_pcb->pcb_ext = 0;
	thread0.td_frame = &proc0_tf;
	thread0.td_pcb->pcb_fsd = PCPU_GET(fsgs_gdt)[0];
	thread0.td_pcb->pcb_gsd = PCPU_GET(fsgs_gdt)[1];

	cpu_probe_amdc1e();
}

#else
/*
 * Machine-dependent startup for native i386.  'first' is the first free
 * physical address after the loaded kernel (used below to carve out the
 * dynamic per-CPU area and passed on to getmemsize()).
 */
void
init386(first)
	int first;
{
	struct gate_descriptor *gdp;
	int gsel_tss, metadata_missing, x, pa;
	size_t kstack0_sz;
	struct pcpu *pc;

	thread0.td_kstack = proc0kstack;
	thread0.td_kstack_pages = KSTACK_PAGES;
	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
	thread0.td_pcb = (struct pcb *)(thread0.td_kstack + kstack0_sz) - 1;

	/*
	 * This may be done better later if it gets more high level
	 * components in it. If so just link td->td_proc here.
	 */
	proc_linkup0(&proc0, &thread0);

	metadata_missing = 0;
	if (bootinfo.bi_modulep) {
		preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE;
		preload_bootstrap_relocate(KERNBASE);
	} else {
		metadata_missing = 1;
	}
	/* Pick the kernel environment: compiled-in or from the loader. */
	if (envmode == 1)
		kern_envp = static_env;
	else if (bootinfo.bi_envp)
		kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE;

	/* Init basic tunables, hz etc */
	init_param1();

	/*
	 * Make gdt memory segments.
All segments cover the full 4GB
	 * of address space and permissions are enforced at page level.
	 */
	gdt_segs[GCODE_SEL].ssd_limit = atop(0 - 1);
	gdt_segs[GDATA_SEL].ssd_limit = atop(0 - 1);
	gdt_segs[GUCODE_SEL].ssd_limit = atop(0 - 1);
	gdt_segs[GUDATA_SEL].ssd_limit = atop(0 - 1);
	gdt_segs[GUFS_SEL].ssd_limit = atop(0 - 1);
	gdt_segs[GUGS_SEL].ssd_limit = atop(0 - 1);

	pc = &__pcpu[0];
	gdt_segs[GPRIV_SEL].ssd_limit = atop(0 - 1);
	gdt_segs[GPRIV_SEL].ssd_base = (int) pc;
	gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss;

	for (x = 0; x < NGDT; x++)
		ssdtosd(&gdt_segs[x], &gdt[x].sd);

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base =  (int) gdt;
	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN);
	lgdt(&r_gdt);

	/*
	 * Wire the BSP's per-CPU area: map the pages backing the dynamic
	 * per-CPU (DPCPU) region and advance 'first' past it.
	 */
	pcpu_init(pc, 0, sizeof(struct pcpu));
	for (pa = first; pa < first + DPCPU_SIZE; pa += PAGE_SIZE)
		pmap_kenter(pa + KERNBASE, pa);
	dpcpu_init((void *)(first + KERNBASE), 0);
	first += DPCPU_SIZE;
	PCPU_SET(prvspace, pc);
	PCPU_SET(curthread, &thread0);
	PCPU_SET(curpcb, thread0.td_pcb);

	/*
	 * Initialize mutexes.
	 *
	 * icu_lock: in order to allow an interrupt to occur in a critical
	 *	     section, to set pcpu->ipending (etc...) properly, we
	 *	     must be able to get the icu lock, so it can't be
	 *	     under witness.
	 */
	mutex_init();
	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE);

	/* make ldt memory segments */
	ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1);
	ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1);
	for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++)
		ssdtosd(&ldt_segs[x], &ldt[x].sd);

	_default_ldt = GSEL(GLDT_SEL, SEL_KPL);
	lldt(_default_ldt);
	PCPU_SET(currentldt, _default_ldt);

	/* exceptions */
	/* First point every vector at the "reserved" handler, then fill in
	 * the architecturally defined exception vectors below. */
	for (x = 0; x < NIDT; x++)
		setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL,
		    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_DE, &IDTVEC(div),  SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_DB, &IDTVEC(dbg),  SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_NMI, &IDTVEC(nmi),  SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_BP, &IDTVEC(bpt),  SDT_SYS386IGT, SEL_UPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_OF, &IDTVEC(ofl),  SDT_SYS386TGT, SEL_UPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_BR, &IDTVEC(bnd),  SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_UD, &IDTVEC(ill),  SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_NM, &IDTVEC(dna),  SDT_SYS386TGT, SEL_KPL
	    , GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_DF, 0,  SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL));
	setidt(IDT_FPUGP, &IDTVEC(fpusegm),  SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_TS, &IDTVEC(tss),  SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_NP, &IDTVEC(missing),  SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_SS, &IDTVEC(stk),  SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_GP, &IDTVEC(prot),  SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_PF, &IDTVEC(page),  SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_MF, &IDTVEC(fpu),  SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_AC, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_MC, &IDTVEC(mchk),  SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_XF, &IDTVEC(xmm), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), SDT_SYS386TGT, SEL_UPL,
	    GSEL(GCODE_SEL, SEL_KPL));
#ifdef KDTRACE_HOOKS
	setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret), SDT_SYS386TGT, SEL_UPL,
	    GSEL(GCODE_SEL, SEL_KPL));
#endif
#ifdef XENHVM
	setidt(IDT_EVTCHN, &IDTVEC(xen_intr_upcall), SDT_SYS386IGT, SEL_UPL,
	    GSEL(GCODE_SEL, SEL_KPL));
#endif

	/* Load the freshly built IDT. */
	r_idt.rd_limit = sizeof(idt0) - 1;
	r_idt.rd_base = (int) idt;
	lidt(&r_idt);

#ifdef XBOX
	/*
	 * The following code queries the PCI ID of 0:0:0. For the XBOX,
	 * This should be 0x10de / 0x02a5.
	 *
	 * This is exactly what Linux does.
	 */
	outl(0xcf8, 0x80000000);
	if (inl(0xcfc) == 0x02a510de) {
		arch_i386_is_xbox = 1;
		pic16l_setled(XBOX_LED_GREEN);

		/*
		 * We are an XBOX, but we may have either 64MB or 128MB of
		 * memory. The PCI host bridge should be programmed for this,
		 * so we just query it. 
		 */
		outl(0xcf8, 0x80000084);
		arch_i386_xbox_memsize = (inl(0xcfc) == 0x7FFFFFF) ? 128 : 64;
	}
#endif /* XBOX */

	/*
	 * Initialize the i8254 before the console so that console
	 * initialization can use DELAY().
	 */
	i8254_init();

	/*
	 * Initialize the console before we print anything out.
	 */
	cninit();

	if (metadata_missing)
		printf("WARNING: loader(8) metadata is missing!\n");

#ifdef DEV_ISA
#ifdef DEV_ATPIC
	elcr_probe();
	atpic_startup();
#else
	/* Reset and mask the atpics and leave them shut down. */
	atpic_reset();

	/*
	 * Point the ICU spurious interrupt vectors at the APIC spurious
	 * interrupt handler.
	 */
	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
#endif
#endif

#ifdef DDB
	ksym_start = bootinfo.bi_symtab;
	ksym_end = bootinfo.bi_esymtab;
#endif

	kdb_init();

#ifdef KDB
	if (boothowto & RB_KDB)
		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif

	finishidentcpu();	/* Final stage of CPU initialization */
	setidt(IDT_UD, &IDTVEC(ill),  SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_GP, &IDTVEC(prot),  SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	initializecpu();	/* Initialize CPU registers */

	/* make an initial tss so cpu can get interrupt stack on syscall! */
	/* Note: -16 is so we can grow the trapframe if we came from vm86 */
	PCPU_SET(common_tss.tss_esp0, thread0.td_kstack +
	    kstack0_sz - sizeof(struct pcb) - 16);
	PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd);
	PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
	PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16);
	ltr(gsel_tss);

	/* pointer to selector slot for %fs/%gs */
	PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd);

	/* Pre-built execution context for the double fault task gate. */
	dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 =
	    dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)];
	dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 =
	    dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL);
#ifdef PAE
	dblfault_tss.tss_cr3 = (int)IdlePDPT;
#else
	dblfault_tss.tss_cr3 = (int)IdlePTD;
#endif
	dblfault_tss.tss_eip = (int)dblfault_handler;
	dblfault_tss.tss_eflags = PSL_KERNEL;
	dblfault_tss.tss_ds = dblfault_tss.tss_es =
	    dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL);
	dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL);
	dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL);
	dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL);

	vm86_initialize();
	getmemsize(first);
	init_param2(physmem);

	/* now running on new page tables, configured,and u/iom is accessible */

	msgbufinit(msgbufp, msgbufsize);

	/* make a call gate to reenter kernel with */
	gdp = &ldt[LSYS5CALLS_SEL].gd;

	x = (int) &IDTVEC(lcall_syscall);
	gdp->gd_looffset = x;
	gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL);
	gdp->gd_stkcpy = 1;
	gdp->gd_type = SDT_SYS386CGT;
	gdp->gd_dpl = SEL_UPL;
	gdp->gd_p = 1;
	gdp->gd_hioffset = x >> 16;

	/* XXX does this work? */
	/* XXX yes! */
	ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL];
	ldt[LSOL26CALLS_SEL] = ldt[LSYS5CALLS_SEL];

	/* transfer to user mode */

	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);

	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0;
#ifdef PAE
	thread0.td_pcb->pcb_cr3 = (int)IdlePDPT;
#else
	thread0.td_pcb->pcb_cr3 = (int)IdlePTD;
#endif
	thread0.td_pcb->pcb_ext = 0;
	thread0.td_frame = &proc0_tf;

	cpu_probe_amdc1e();

#ifdef FDT
	x86_init_fdt();
#endif
}
#endif

/* Per-CPU MD initialization hook; ACPI id is filled in later (if at all). */
void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{

	pcpu->pc_acpi_id = 0xffffffff;
}

/*
 * Enter a spinlock section: disable interrupts on first (outermost) entry
 * and remember the previous interrupt state so spinlock_exit() can
 * restore it when the count drops back to zero.
 */
void
spinlock_enter(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	if (td->td_md.md_spinlock_count == 0) {
		flags = intr_disable();
		td->td_md.md_spinlock_count = 1;
		td->td_md.md_saved_flags = flags;
	} else
		td->td_md.md_spinlock_count++;
	critical_enter();
}

/*
 * Leave a spinlock section; re-enable interrupts (to the state saved by
 * the outermost spinlock_enter()) when the nesting count reaches zero.
 */
void
spinlock_exit(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	critical_exit();
	flags = td->td_md.md_saved_flags;
	td->td_md.md_spinlock_count--;
	if (td->td_md.md_spinlock_count == 0)
		intr_restore(flags);
}

#if defined(I586_CPU) && !defined(NO_F00F_HACK)
static void f00f_hack(void *unused);
SYSINIT(f00f_hack, SI_SUB_INTRINSIC, SI_ORDER_FIRST, f00f_hack, NULL);

/*
 * Workaround for the Pentium F00F erratum: relocate the IDT so that the
 * vulnerable entry sits at the end of a read-only page.
 */
static void
f00f_hack(void *unused)
{
	struct gate_descriptor *new_idt;
	vm_offset_t tmp;

	if (!has_f00f_bug)
		return;

	GIANT_REQUIRED;

	printf("Intel Pentium detected, installing workaround for F00F bug\n");

	tmp = kmem_malloc(kernel_arena, PAGE_SIZE * 2, M_WAITOK | M_ZERO);
	if (tmp == 0)
		panic("kmem_malloc returned 0");

	/* Put the problematic entry (#6) at the end of the lower
	 * page. */
	new_idt = (struct gate_descriptor*)
	    (tmp + PAGE_SIZE - 7 * sizeof(struct gate_descriptor));
	bcopy(idt, new_idt, sizeof(idt0));
	r_idt.rd_base = (u_int)new_idt;
	lidt(&r_idt);
	idt = new_idt;
	/* Make the page holding the IDT read-only (F00F bug workaround). */
	pmap_protect(kernel_pmap, tmp, tmp + PAGE_SIZE, VM_PROT_READ);
}
#endif /* defined(I586_CPU) && !NO_F00F_HACK */

/*
 * Construct a PCB from a trapframe. This is called from kdb_trap() where
 * we want to start a backtrace from the function that caused us to enter
 * the debugger. We have the context in the trapframe, but base the trace
 * on the PCB. The PCB doesn't have to be perfect, as long as it contains
 * enough for a backtrace.
 */
void
makectx(struct trapframe *tf, struct pcb *pcb)
{

	pcb->pcb_edi = tf->tf_edi;
	pcb->pcb_esi = tf->tf_esi;
	pcb->pcb_ebp = tf->tf_ebp;
	pcb->pcb_ebx = tf->tf_ebx;
	pcb->pcb_eip = tf->tf_eip;
	/*
	 * A trap from user mode pushes esp/ss; a same-privilege (kernel)
	 * trap does not, so there the pre-trap stack pointer is the end
	 * of the trapframe minus the two absent 4-byte esp/ss slots.
	 */
	pcb->pcb_esp = (ISPL(tf->tf_cs)) ? tf->tf_esp : (int)(tf + 1) - 8;
}

/* Set the instruction pointer of a (stopped) traced thread. */
int
ptrace_set_pc(struct thread *td, u_long addr)
{

	td->td_frame->tf_eip = addr;
	return (0);
}

/* Arrange for the thread to trap after executing one instruction. */
int
ptrace_single_step(struct thread *td)
{
	td->td_frame->tf_eflags |= PSL_T;
	return (0);
}

/* Cancel single-stepping by clearing the trace flag. */
int
ptrace_clear_single_step(struct thread *td)
{
	td->td_frame->tf_eflags &= ~PSL_T;
	return (0);
}

/*
 * Copy a stopped thread's register state into *regs.  %gs lives in the
 * PCB rather than in the trapframe; everything else comes from the frame.
 */
int
fill_regs(struct thread *td, struct reg *regs)
{
	struct pcb *pcb;
	struct trapframe *tp;

	tp = td->td_frame;
	pcb = td->td_pcb;
	regs->r_gs = pcb->pcb_gs;
	return (fill_frame_regs(tp, regs));
}

/* Copy a trapframe into a struct reg (everything except %gs). */
int
fill_frame_regs(struct trapframe *tp, struct reg *regs)
{
	regs->r_fs = tp->tf_fs;
	regs->r_es = tp->tf_es;
	regs->r_ds = tp->tf_ds;
	regs->r_edi = tp->tf_edi;
	regs->r_esi = tp->tf_esi;
	regs->r_ebp = tp->tf_ebp;
	regs->r_ebx = tp->tf_ebx;
	regs->r_edx = tp->tf_edx;
	regs->r_ecx = tp->tf_ecx;
	regs->r_eax = tp->tf_eax;
	regs->r_eip = tp->tf_eip;
	regs->r_cs = tp->tf_cs;
	regs->r_eflags = tp->tf_eflags;
	regs->r_esp = tp->tf_esp;
	regs->r_ss = tp->tf_ss;
	return (0);
}

/*
 * Install new register state for a stopped thread.  The eflags and %cs
 * values are vetted first (EFL_SECURE/CS_SECURE) so that a debugger
 * cannot grant the target privileges it should not have.
 */
int
set_regs(struct thread *td, struct reg *regs)
{
	struct pcb *pcb;
	struct trapframe *tp;

	tp = td->td_frame;
	if (!EFL_SECURE(regs->r_eflags, tp->tf_eflags) ||
	    !CS_SECURE(regs->r_cs))
		return (EINVAL);
	pcb = td->td_pcb;
	tp->tf_fs = regs->r_fs;
	tp->tf_es = regs->r_es;
	tp->tf_ds = regs->r_ds;
	tp->tf_edi = regs->r_edi;
	tp->tf_esi = regs->r_esi;
	tp->tf_ebp = regs->r_ebp;
	tp->tf_ebx = regs->r_ebx;
	tp->tf_edx = regs->r_edx;
	tp->tf_ecx = regs->r_ecx;
	tp->tf_eax = regs->r_eax;
	tp->tf_eip = regs->r_eip;
	tp->tf_cs = regs->r_cs;
	tp->tf_eflags = regs->r_eflags;
	tp->tf_esp = regs->r_esp;
	tp->tf_ss = regs->r_ss;
	pcb->pcb_gs = regs->r_gs;
	return (0);
}

#ifdef CPU_ENABLE_SSE
/*
 * Convert an FXSAVE-format save area (savexmm) into the legacy FNSAVE
 * format (save87) for export via struct fpreg.
 *
 * NOTE(review): FXSAVE keeps an abridged (one bit per register) tag
 * word; copying en_tw straight across does not reconstruct the full
 * two-bit-per-register FNSAVE tag word -- confirm consumers tolerate
 * this.
 */
static void
fill_fpregs_xmm(sv_xmm, sv_87)
	struct savexmm *sv_xmm;
	struct save87 *sv_87;
{
	register struct env87 *penv_87 = &sv_87->sv_env;
	register struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	bzero(sv_87, sizeof(*sv_87));

	/* FPU control/status */
	penv_87->en_cw = penv_xmm->en_cw;
	penv_87->en_sw = penv_xmm->en_sw;
	penv_87->en_tw = penv_xmm->en_tw;
	penv_87->en_fip = penv_xmm->en_fip;
	penv_87->en_fcs = penv_xmm->en_fcs;
	penv_87->en_opcode = penv_xmm->en_opcode;
	penv_87->en_foo = penv_xmm->en_foo;
	penv_87->en_fos = penv_xmm->en_fos;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc;
}

/*
 * Convert a legacy FNSAVE-format save area (save87) into FXSAVE format
 * (savexmm); the inverse of fill_fpregs_xmm().
 */
static void
set_fpregs_xmm(sv_87, sv_xmm)
	struct save87 *sv_87;
	struct savexmm *sv_xmm;
{
	register struct env87 *penv_87 = &sv_87->sv_env;
	register struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* FPU control/status */
	penv_xmm->en_cw = penv_87->en_cw;
	penv_xmm->en_sw = penv_87->en_sw;
	penv_xmm->en_tw = penv_87->en_tw;
	penv_xmm->en_fip = penv_87->en_fip;
	penv_xmm->en_fcs = penv_87->en_fcs;
	penv_xmm->en_opcode = penv_87->en_opcode;
	penv_xmm->en_foo = penv_87->en_foo;
	penv_xmm->en_fos = penv_87->en_fos;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i];
}
#endif /* CPU_ENABLE_SSE */

/*
 * Export a stopped thread's FPU state as a struct fpreg (always in the
 * legacy FNSAVE layout; converted from FXSAVE when cpu_fxsr is set).
 */
int
fill_fpregs(struct thread *td, struct fpreg *fpregs)
{

	KASSERT(td == curthread || TD_IS_SUSPENDED(td) ||
	    P_SHOULDSTOP(td->td_proc),
	    ("not suspended thread %p", td));
#ifdef DEV_NPX
	/* Flush any live FPU state into the PCB's user save area. */
	npxgetregs(td);
#else
	bzero(fpregs, sizeof(*fpregs));
#endif
#ifdef CPU_ENABLE_SSE
	if (cpu_fxsr)
		fill_fpregs_xmm(&td->td_pcb->pcb_user_save.sv_xmm,
		    (struct save87 *)fpregs);
	else
#endif /* CPU_ENABLE_SSE */
		bcopy(&td->td_pcb->pcb_user_save.sv_87, fpregs,
		    sizeof(*fpregs));
	return (0);
}

/*
 * Install new FPU state for a stopped thread from a struct fpreg
 * (legacy FNSAVE layout; converted to FXSAVE when cpu_fxsr is set).
 */
int
set_fpregs(struct thread *td, struct fpreg *fpregs)
{

#ifdef CPU_ENABLE_SSE
	if (cpu_fxsr)
		set_fpregs_xmm((struct save87 *)fpregs,
		    &td->td_pcb->pcb_user_save.sv_xmm);
	else
#endif /* CPU_ENABLE_SSE */
		bcopy(fpregs, &td->td_pcb->pcb_user_save.sv_87,
		    sizeof(*fpregs));
#ifdef DEV_NPX
	npxuserinited(td);
#endif
	return (0);
}

/*
 * Get machine context.
 */
int
get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
{
	struct trapframe *tp;
	struct segment_descriptor *sdp;

	tp = td->td_frame;

	/* sigonstack() consults sigaltstack state; the proc lock covers it. */
	PROC_LOCK(curthread->td_proc);
	mcp->mc_onstack = sigonstack(tp->tf_esp);
	PROC_UNLOCK(curthread->td_proc);
	/* %gs is kept in the PCB, not the trapframe. */
	mcp->mc_gs = td->td_pcb->pcb_gs;
	mcp->mc_fs = tp->tf_fs;
	mcp->mc_es = tp->tf_es;
	mcp->mc_ds = tp->tf_ds;
	mcp->mc_edi = tp->tf_edi;
	mcp->mc_esi = tp->tf_esi;
	mcp->mc_ebp = tp->tf_ebp;
	mcp->mc_isp = tp->tf_isp;
	mcp->mc_eflags = tp->tf_eflags;
	if (flags & GET_MC_CLEAR_RET) {
		/* Report zeroed syscall return registers and no carry. */
		mcp->mc_eax = 0;
		mcp->mc_edx = 0;
		mcp->mc_eflags &= ~PSL_C;
	} else {
		mcp->mc_eax = tp->tf_eax;
		mcp->mc_edx = tp->tf_edx;
	}
	mcp->mc_ebx = tp->tf_ebx;
	mcp->mc_ecx = tp->tf_ecx;
	mcp->mc_eip = tp->tf_eip;
	mcp->mc_cs = tp->tf_cs;
	mcp->mc_esp = tp->tf_esp;
	mcp->mc_ss = tp->tf_ss;
	mcp->mc_len = sizeof(*mcp);
	get_fpcontext(td, mcp);
	/* Reassemble the 32-bit segment bases from the split descriptor fields. */
	sdp = &td->td_pcb->pcb_fsd;
	mcp->mc_fsbase = sdp->sd_hibase << 24 | sdp->sd_lobase;
	sdp = &td->td_pcb->pcb_gsd;
	mcp->mc_gsbase = sdp->sd_hibase << 24 | sdp->sd_lobase;
	mcp->mc_flags = 0;
	bzero(mcp->mc_spare2, sizeof(mcp->mc_spare2));
	return (0);
}

/*
 * Set machine context.
 *
 * However, we don't set any but the user modifiable flags, and we won't
 * touch the cs selector.
 */
int
set_mcontext(struct thread *td, const mcontext_t *mcp)
{
	struct trapframe *tp;
	int eflags, ret;

	tp = td->td_frame;
	if (mcp->mc_len != sizeof(*mcp))
		return (EINVAL);
	/* Accept only the user-modifiable eflags bits; preserve the rest. */
	eflags = (mcp->mc_eflags & PSL_USERCHANGE) |
	    (tp->tf_eflags & ~PSL_USERCHANGE);
	if ((ret = set_fpcontext(td, mcp)) == 0) {
		tp->tf_fs = mcp->mc_fs;
		tp->tf_es = mcp->mc_es;
		tp->tf_ds = mcp->mc_ds;
		tp->tf_edi = mcp->mc_edi;
		tp->tf_esi = mcp->mc_esi;
		tp->tf_ebp = mcp->mc_ebp;
		tp->tf_ebx = mcp->mc_ebx;
		tp->tf_edx = mcp->mc_edx;
		tp->tf_ecx = mcp->mc_ecx;
		tp->tf_eax = mcp->mc_eax;
		tp->tf_eip = mcp->mc_eip;
		tp->tf_eflags = eflags;
		tp->tf_esp = mcp->mc_esp;
		tp->tf_ss = mcp->mc_ss;
		/* %gs lives in the PCB; note tf_cs is deliberately untouched. */
		td->td_pcb->pcb_gs = mcp->mc_gs;
		ret = 0;
	}
	return (ret);
}

/* Capture the thread's FPU state (format, ownership, registers) into *mcp. */
static void
get_fpcontext(struct thread *td, mcontext_t *mcp)
{

#ifndef DEV_NPX
	mcp->mc_fpformat = _MC_FPFMT_NODEV;
	mcp->mc_ownedfp = _MC_FPOWNED_NONE;
	bzero(mcp->mc_fpstate, sizeof(mcp->mc_fpstate));
#else
	mcp->mc_ownedfp = npxgetregs(td);
	bcopy(&td->td_pcb->pcb_user_save, &mcp->mc_fpstate[0],
	    sizeof(mcp->mc_fpstate));
	mcp->mc_fpformat = npxformat();
#endif
}

/*
 * Install FPU state from a machine context.  Returns 0 on success or
 * EINVAL when the context's format/ownership fields are unrecognized.
 */
static int
set_fpcontext(struct thread *td, const mcontext_t *mcp)
{

	if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
		return (0);
	else if (mcp->mc_fpformat != _MC_FPFMT_387 &&
	    mcp->mc_fpformat != _MC_FPFMT_XMM)
		return (EINVAL);
	else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE)
		/* We don't care what state is left in the FPU or PCB. */
		fpstate_drop(td);
	else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
	    mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
#ifdef DEV_NPX
#ifdef CPU_ENABLE_SSE
		/* Mask off MXCSR bits this CPU does not support. */
		if (cpu_fxsr)
			((union savefpu *)&mcp->mc_fpstate)->sv_xmm.sv_env.
			    en_mxcsr &= cpu_mxcsr_mask;
#endif
		npxsetregs(td, (union savefpu *)&mcp->mc_fpstate);
#endif
	} else
		return (EINVAL);
	return (0);
}

/*
 * Discard the thread's user FPU state so it is reinitialized on next use.
 */
static void
fpstate_drop(struct thread *td)
{

	KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu"));
	critical_enter();
#ifdef DEV_NPX
	if (PCPU_GET(fpcurthread) == td)
		npxdrop();
#endif
	/*
	 * XXX force a full drop of the npx.  The above only drops it if we
	 * owned it.  npxgetregs() has the same bug in the !cpu_fxsr case.
	 *
	 * XXX I don't much like npxgetregs()'s semantics of doing a full
	 * drop.  Dropping only to the pcb matches fnsave's behaviour.
	 * We only need to drop to !PCB_INITDONE in sendsig().  But
	 * sendsig() is the only caller of npxgetregs()... perhaps we just
	 * have too many layers.
	 *
	 * NOTE(review): the flags are cleared on curthread's PCB although
	 * the KASSERT above checks td's -- harmless while every caller
	 * passes curthread, but confirm before adding new callers.
	 */
	curthread->td_pcb->pcb_flags &= ~(PCB_NPXINITDONE |
	    PCB_NPXUSERINITDONE);
	critical_exit();
}

/*
 * Read the hardware debug registers: the live registers when td == NULL
 * (debugger context), otherwise the per-thread copies saved in the PCB.
 */
int
fill_dbregs(struct thread *td, struct dbreg *dbregs)
{
	struct pcb *pcb;

	if (td == NULL) {
		dbregs->dr[0] = rdr0();
		dbregs->dr[1] = rdr1();
		dbregs->dr[2] = rdr2();
		dbregs->dr[3] = rdr3();
		dbregs->dr[4] = rdr4();
		dbregs->dr[5] = rdr5();
		dbregs->dr[6] = rdr6();
		dbregs->dr[7] = rdr7();
	} else {
		pcb = td->td_pcb;
		dbregs->dr[0] = pcb->pcb_dr0;
		dbregs->dr[1] = pcb->pcb_dr1;
		dbregs->dr[2] = pcb->pcb_dr2;
		dbregs->dr[3] = pcb->pcb_dr3;
		/* dr4/dr5 are obsolete aliases of dr6/dr7; not saved. */
		dbregs->dr[4] = 0;
		dbregs->dr[5] = 0;
		dbregs->dr[6] = pcb->pcb_dr6;
		dbregs->dr[7] = pcb->pcb_dr7;
	}
	return (0);
}

/*
 * Write the hardware debug registers: directly when td == NULL, otherwise
 * into the PCB after validating dr7 and the breakpoint addresses.
 */
int
set_dbregs(struct thread *td, struct dbreg *dbregs)
{
	struct pcb *pcb;
	int i;

	if (td == NULL) {
		load_dr0(dbregs->dr[0]);
		load_dr1(dbregs->dr[1]);
		load_dr2(dbregs->dr[2]);
		load_dr3(dbregs->dr[3]);
		load_dr4(dbregs->dr[4]);
		load_dr5(dbregs->dr[5]);
		load_dr6(dbregs->dr[6]);
		load_dr7(dbregs->dr[7]);
	} else {
		/*
		 * Don't let an illegal value for dr7 get set.  Specifically,
		 * check for undefined settings.  Setting these bit patterns
		 * result in undefined behaviour and can lead to an unexpected
		 * TRCTRAP.
		 */
		for (i = 0; i < 4; i++) {
			if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
				return (EINVAL);
			if (DBREG_DR7_LEN(dbregs->dr[7], i) == 0x02)
				return (EINVAL);
		}

		pcb = td->td_pcb;

		/*
		 * Don't let a process set a breakpoint that is not within the
		 * process's address space.  If a process could do this, it
		 * could halt the system by setting a breakpoint in the kernel
		 * (if ddb was enabled).  Thus, we need to check to make sure
		 * that no breakpoints are being enabled for addresses outside
		 * process's address space.
		 *
		 * XXX - what about when the watched area of the user's
		 * address space is written into from within the kernel
		 * ... wouldn't that still cause a breakpoint to be generated
		 * from within kernel mode?
		 */

		if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
			/* dr0 is enabled */
			if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}

		if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
			/* dr1 is enabled */
			if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}

		if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
			/* dr2 is enabled */
			if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}

		if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
			/* dr3 is enabled */
			if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}

		pcb->pcb_dr0 = dbregs->dr[0];
		pcb->pcb_dr1 = dbregs->dr[1];
		pcb->pcb_dr2 = dbregs->dr[2];
		pcb->pcb_dr3 = dbregs->dr[3];
		pcb->pcb_dr6 = dbregs->dr[6];
		pcb->pcb_dr7 = dbregs->dr[7];

		/* Mark the PCB so context switches restore these registers. */
		pcb->pcb_flags |= PCB_DBREGS;
	}

	return (0);
}

/*
 * Return > 0 if a hardware breakpoint has been hit, and the
 * breakpoint was in user space.  Return 0, otherwise.
3627 */ 3628int 3629user_dbreg_trap(void) 3630{ 3631 u_int32_t dr7, dr6; /* debug registers dr6 and dr7 */ 3632 u_int32_t bp; /* breakpoint bits extracted from dr6 */ 3633 int nbp; /* number of breakpoints that triggered */ 3634 caddr_t addr[4]; /* breakpoint addresses */ 3635 int i; 3636 3637 dr7 = rdr7(); 3638 if ((dr7 & 0x000000ff) == 0) { 3639 /* 3640 * all GE and LE bits in the dr7 register are zero, 3641 * thus the trap couldn't have been caused by the 3642 * hardware debug registers 3643 */ 3644 return 0; 3645 } 3646 3647 nbp = 0; 3648 dr6 = rdr6(); 3649 bp = dr6 & 0x0000000f; 3650 3651 if (!bp) { 3652 /* 3653 * None of the breakpoint bits are set meaning this 3654 * trap was not caused by any of the debug registers 3655 */ 3656 return 0; 3657 } 3658 3659 /* 3660 * at least one of the breakpoints were hit, check to see 3661 * which ones and if any of them are user space addresses 3662 */ 3663 3664 if (bp & 0x01) { 3665 addr[nbp++] = (caddr_t)rdr0(); 3666 } 3667 if (bp & 0x02) { 3668 addr[nbp++] = (caddr_t)rdr1(); 3669 } 3670 if (bp & 0x04) { 3671 addr[nbp++] = (caddr_t)rdr2(); 3672 } 3673 if (bp & 0x08) { 3674 addr[nbp++] = (caddr_t)rdr3(); 3675 } 3676 3677 for (i = 0; i < nbp; i++) { 3678 if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) { 3679 /* 3680 * addr[i] is in user space 3681 */ 3682 return nbp; 3683 } 3684 } 3685 3686 /* 3687 * None of the breakpoints are in user space. 3688 */ 3689 return 0; 3690} 3691 3692#ifdef KDB 3693 3694/* 3695 * Provide inb() and outb() as functions. They are normally only available as 3696 * inline functions, thus cannot be called from the debugger. 3697 */ 3698 3699/* silence compiler warnings */ 3700u_char inb_(u_short); 3701void outb_(u_short, u_char); 3702 3703u_char 3704inb_(u_short port) 3705{ 3706 return inb(port); 3707} 3708 3709void 3710outb_(u_short port, u_char data) 3711{ 3712 outb(port, data); 3713} 3714 3715#endif /* KDB */ 3716