1/* $NetBSD$ */ 2 3/*- 4 * Copyright (c) 1996, 1997, 1998, 2000, 2006, 2007, 2008, 2011 5 * The NetBSD Foundation, Inc. 6 * All rights reserved. 7 * 8 * This code is derived from software contributed to The NetBSD Foundation 9 * by Charles M. Hannum and by Jason R. Thorpe of the Numerical Aerospace 10 * Simulation Facility, NASA Ames Research Center. 11 * 12 * This code is derived from software contributed to The NetBSD Foundation 13 * by Coyote Point Systems, Inc. which was written under contract to Coyote 14 * Point by Jed Davis and Devon O'Dell. 15 * 16 * Redistribution and use in source and binary forms, with or without 17 * modification, are permitted provided that the following conditions 18 * are met: 19 * 1. Redistributions of source code must retain the above copyright 20 * notice, this list of conditions and the following disclaimer. 21 * 2. Redistributions in binary form must reproduce the above copyright 22 * notice, this list of conditions and the following disclaimer in the 23 * documentation and/or other materials provided with the distribution. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 26 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 27 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 28 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 29 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 30 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 31 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 32 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 33 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 34 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 35 * POSSIBILITY OF SUCH DAMAGE. 
36 */ 37 38/* 39 * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr> 40 * 41 * Permission to use, copy, modify, and distribute this software for any 42 * purpose with or without fee is hereby granted, provided that the above 43 * copyright notice and this permission notice appear in all copies. 44 * 45 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 46 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 47 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 48 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 49 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 50 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 51 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 52 */ 53 54/* 55 * Copyright (c) 2007 Manuel Bouyer. 56 * 57 * Redistribution and use in source and binary forms, with or without 58 * modification, are permitted provided that the following conditions 59 * are met: 60 * 1. Redistributions of source code must retain the above copyright 61 * notice, this list of conditions and the following disclaimer. 62 * 2. Redistributions in binary form must reproduce the above copyright 63 * notice, this list of conditions and the following disclaimer in the 64 * documentation and/or other materials provided with the distribution. 65 * 66 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 67 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 68 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
69 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 70 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 71 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 72 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 73 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 74 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 75 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 76 * 77 */ 78 79/*- 80 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. 81 * All rights reserved. 82 * 83 * This code is derived from software contributed to Berkeley by 84 * William Jolitz. 85 * 86 * Redistribution and use in source and binary forms, with or without 87 * modification, are permitted provided that the following conditions 88 * are met: 89 * 1. Redistributions of source code must retain the above copyright 90 * notice, this list of conditions and the following disclaimer. 91 * 2. Redistributions in binary form must reproduce the above copyright 92 * notice, this list of conditions and the following disclaimer in the 93 * documentation and/or other materials provided with the distribution. 94 * 3. Neither the name of the University nor the names of its contributors 95 * may be used to endorse or promote products derived from this software 96 * without specific prior written permission. 97 * 98 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 99 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 100 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 101 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 102 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 103 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 104 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 105 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 106 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 107 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 108 * SUCH DAMAGE. 109 * 110 * @(#)machdep.c 7.4 (Berkeley) 6/3/91 111 */ 112 113#include <sys/cdefs.h> 114__KERNEL_RCSID(0, "$NetBSD$"); 115 116/* #define XENDEBUG_LOW */ 117 118#include "opt_modular.h" 119#include "opt_user_ldt.h" 120#include "opt_ddb.h" 121#include "opt_kgdb.h" 122#include "opt_cpureset_delay.h" 123#include "opt_mtrr.h" 124#include "opt_realmem.h" 125#include "opt_xen.h" 126#ifndef XEN 127#include "opt_physmem.h" 128#endif 129#include "isa.h" 130#include "pci.h" 131 132#include <sys/param.h> 133#include <sys/systm.h> 134#include <sys/signal.h> 135#include <sys/signalvar.h> 136#include <sys/kernel.h> 137#include <sys/cpu.h> 138#include <sys/exec.h> 139#include <sys/exec_aout.h> /* for MID_* */ 140#include <sys/reboot.h> 141#include <sys/conf.h> 142#include <sys/mbuf.h> 143#include <sys/msgbuf.h> 144#include <sys/mount.h> 145#include <sys/core.h> 146#include <sys/kcore.h> 147#include <sys/ucontext.h> 148#include <machine/kcore.h> 149#include <sys/ras.h> 150#include <sys/sa.h> 151#include <sys/savar.h> 152#include <sys/syscallargs.h> 153#include <sys/ksyms.h> 154#include <sys/device.h> 155#include <sys/lwp.h> 156#include <sys/proc.h> 157 158#ifdef KGDB 159#include <sys/kgdb.h> 160#endif 161 162#include <dev/cons.h> 163#include <dev/mm.h> 164 165#include <uvm/uvm.h> 166#include <uvm/uvm_page.h> 167 168#include <sys/sysctl.h> 169 170#include <machine/cpu.h> 171#include <machine/cpufunc.h> 172#include <machine/gdt.h> 173#include 
<machine/intr.h> 174#include <machine/pio.h> 175#include <machine/psl.h> 176#include <machine/reg.h> 177#include <machine/specialreg.h> 178#include <machine/bootinfo.h> 179#include <machine/fpu.h> 180#include <machine/mtrr.h> 181#include <machine/mpbiosvar.h> 182 183#include <x86/cputypes.h> 184#include <x86/cpuvar.h> 185#include <x86/machdep.h> 186 187#include <x86/x86/tsc.h> 188 189#include <dev/isa/isareg.h> 190#include <machine/isa_machdep.h> 191#include <dev/ic/i8042reg.h> 192 193#ifdef XEN 194#include <xen/xen.h> 195#include <xen/hypervisor.h> 196#include <xen/evtchn.h> 197#endif 198 199#ifdef DDB 200#include <machine/db_machdep.h> 201#include <ddb/db_extern.h> 202#include <ddb/db_output.h> 203#include <ddb/db_interface.h> 204#endif 205 206#include "acpica.h" 207 208#if NACPICA > 0 209#include <dev/acpi/acpivar.h> 210#define ACPI_MACHDEP_PRIVATE 211#include <machine/acpi_machdep.h> 212#endif 213 214#include "isa.h" 215#include "isadma.h" 216#include "ksyms.h" 217 218/* the following is used externally (sysctl_hw) */ 219char machine[] = "amd64"; /* CPU "architecture" */ 220char machine_arch[] = "x86_64"; /* machine == machine_arch */ 221 222/* Our exported CPU info; we have only one right now. 
*/ 223struct cpu_info cpu_info_primary; 224struct cpu_info *cpu_info_list; 225 226extern struct bi_devmatch *x86_alldisks; 227extern int x86_ndisks; 228 229#ifdef CPURESET_DELAY 230int cpureset_delay = CPURESET_DELAY; 231#else 232int cpureset_delay = 2000; /* default to 2s */ 233#endif 234 235int cpu_class = CPUCLASS_686; 236 237#ifdef MTRR 238struct mtrr_funcs *mtrr_funcs; 239#endif 240 241int physmem; 242uint64_t dumpmem_low; 243uint64_t dumpmem_high; 244int cpu_class; 245int use_pae; 246 247#ifndef NO_SPARSE_DUMP 248int sparse_dump = 0; 249 250paddr_t max_paddr = 0; 251unsigned char *sparse_dump_physmap; 252#endif 253 254char *dump_headerbuf, *dump_headerbuf_ptr; 255#define dump_headerbuf_size PAGE_SIZE 256#define dump_headerbuf_end (dump_headerbuf + dump_headerbuf_size) 257#define dump_headerbuf_avail (dump_headerbuf_end - dump_headerbuf_ptr) 258daddr_t dump_header_blkno; 259 260size_t dump_nmemsegs; 261size_t dump_npages; 262size_t dump_header_size; 263size_t dump_totalbytesleft; 264 265vaddr_t msgbuf_vaddr; 266paddr_t msgbuf_paddr; 267 268struct { 269 paddr_t paddr; 270 psize_t sz; 271} msgbuf_p_seg[VM_PHYSSEG_MAX]; 272unsigned int msgbuf_p_cnt = 0; 273 274vaddr_t idt_vaddr; 275paddr_t idt_paddr; 276 277vaddr_t lo32_vaddr; 278paddr_t lo32_paddr; 279 280vaddr_t module_start, module_end; 281static struct vm_map module_map_store; 282extern struct vm_map *module_map; 283vaddr_t kern_end; 284 285struct vm_map *phys_map = NULL; 286 287extern paddr_t avail_start, avail_end; 288#ifdef XEN 289extern paddr_t pmap_pa_start, pmap_pa_end; 290#endif 291 292#ifndef XEN 293void (*delay_func)(unsigned int) = i8254_delay; 294void (*initclock_func)(void) = i8254_initclocks; 295#else /* XEN */ 296void (*delay_func)(unsigned int) = xen_delay; 297void (*initclock_func)(void) = xen_initclocks; 298#endif 299 300 301#ifdef MTRR 302struct mtrr_funcs *mtrr_funcs; 303#endif 304 305/* 306 * Size of memory segments, before any memory is stolen. 
307 */ 308phys_ram_seg_t mem_clusters[VM_PHYSSEG_MAX]; 309int mem_cluster_cnt; 310 311char x86_64_doubleflt_stack[4096]; 312 313int cpu_dump(void); 314int cpu_dumpsize(void); 315u_long cpu_dump_mempagecnt(void); 316void dodumpsys(void); 317void dumpsys(void); 318 319extern int time_adjusted; /* XXX no common header */ 320 321void dump_misc_init(void); 322void dump_seg_prep(void); 323int dump_seg_iter(int (*)(paddr_t, paddr_t)); 324 325#ifndef NO_SPARSE_DUMP 326void sparse_dump_reset(void); 327void sparse_dump_mark(vaddr_t, vaddr_t, int); 328void cpu_dump_prep_sparse(void); 329#endif 330 331void dump_header_start(void); 332int dump_header_flush(void); 333int dump_header_addbytes(const void*, size_t); 334int dump_header_addseg(paddr_t, paddr_t); 335int dump_header_finish(void); 336 337int dump_seg_count_range(paddr_t, paddr_t); 338int dumpsys_seg(paddr_t, paddr_t); 339 340void init_x86_64(paddr_t); 341 342/* 343 * Machine-dependent startup code 344 */ 345void 346cpu_startup(void) 347{ 348 int x, y; 349 vaddr_t minaddr, maxaddr; 350 psize_t sz; 351 352 /* 353 * For console drivers that require uvm and pmap to be initialized, 354 * we'll give them one more chance here... 355 */ 356 consinit(); 357 358 /* 359 * Initialize error message buffer (et end of core). 
360 */ 361 if (msgbuf_p_cnt == 0) 362 panic("msgbuf paddr map has not been set up"); 363 for (x = 0, sz = 0; x < msgbuf_p_cnt; sz += msgbuf_p_seg[x++].sz) 364 continue; 365 366 msgbuf_vaddr = uvm_km_alloc(kernel_map, sz, 0, 367 UVM_KMF_VAONLY); 368 if (msgbuf_vaddr == 0) 369 panic("failed to valloc msgbuf_vaddr"); 370 371 /* msgbuf_paddr was init'd in pmap */ 372 for (y = 0, sz = 0; y < msgbuf_p_cnt; y++) { 373 for (x = 0; x < btoc(msgbuf_p_seg[y].sz); x++, sz += PAGE_SIZE) 374 pmap_kenter_pa((vaddr_t)msgbuf_vaddr + sz, 375 msgbuf_p_seg[y].paddr + x * PAGE_SIZE, 376 VM_PROT_READ | UVM_PROT_WRITE, 0); 377 } 378 379 pmap_update(pmap_kernel()); 380 381 initmsgbuf((void *)msgbuf_vaddr, round_page(sz)); 382 383 minaddr = 0; 384 385 /* 386 * Allocate a submap for physio 387 */ 388 phys_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr, 389 VM_PHYS_SIZE, 0, false, NULL); 390 391 uvm_map_setup(&module_map_store, module_start, module_end, 0); 392 module_map_store.pmap = pmap_kernel(); 393 module_map = &module_map_store; 394 395 /* Say hello. */ 396 banner(); 397 398#if NISA > 0 || NPCI > 0 399 /* Safe for i/o port / memory space allocation to use malloc now. 
*/ 400 x86_bus_space_mallocok(); 401#endif 402 403 gdt_init(); 404 x86_64_proc0_tss_ldt_init(); 405 406 cpu_init_tss(&cpu_info_primary); 407#if !defined(XEN) 408 ltr(cpu_info_primary.ci_tss_sel); 409#endif /* !defined(XEN) */ 410 411 x86_startup(); 412} 413 414#ifdef XEN 415/* used in assembly */ 416void hypervisor_callback(void); 417void failsafe_callback(void); 418void x86_64_switch_context(struct pcb *); 419void x86_64_tls_switch(struct lwp *); 420 421void 422x86_64_switch_context(struct pcb *new) 423{ 424 HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL), new->pcb_rsp0); 425 struct physdev_op physop; 426 physop.cmd = PHYSDEVOP_SET_IOPL; 427 physop.u.set_iopl.iopl = new->pcb_iopl; 428 HYPERVISOR_physdev_op(&physop); 429} 430 431void 432x86_64_tls_switch(struct lwp *l) 433{ 434 struct cpu_info *ci = curcpu(); 435 struct pcb *pcb = lwp_getpcb(l); 436 struct trapframe *tf = l->l_md.md_regs; 437 438 /* 439 * Raise the IPL to IPL_HIGH. 440 * FPU IPIs can alter the LWP's saved cr0. Dropping the priority 441 * is deferred until mi_switch(), when cpu_switchto() returns. 442 */ 443 (void)splhigh(); 444 /* 445 * If our floating point registers are on a different CPU, 446 * set CR0_TS so we'll trap rather than reuse bogus state. 447 */ 448 if (l != ci->ci_fpcurlwp) { 449 HYPERVISOR_fpu_taskswitch(1); 450 } 451 452 /* Update TLS segment pointers */ 453 if (pcb->pcb_flags & PCB_COMPAT32) { 454 update_descriptor(&curcpu()->ci_gdt[GUFS_SEL], &pcb->pcb_fs); 455 update_descriptor(&curcpu()->ci_gdt[GUGS_SEL], &pcb->pcb_gs); 456 setfs(tf->tf_fs); 457 HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, tf->tf_gs); 458 } else { 459 setfs(0); 460 HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, 0); 461 HYPERVISOR_set_segment_base(SEGBASE_FS, pcb->pcb_fs); 462 HYPERVISOR_set_segment_base(SEGBASE_GS_USER, pcb->pcb_gs); 463 } 464} 465#endif /* XEN */ 466 467/* 468 * Set up proc0's TSS and LDT. 
 */
void
x86_64_proc0_tss_ldt_init(void)
{
	struct lwp *l = &lwp0;
	struct pcb *pcb = lwp_getpcb(l);

	/* Fresh pcb: no TLS bases, default flags. */
	pcb->pcb_flags = 0;
	pcb->pcb_fs = 0;
	pcb->pcb_gs = 0;
	/* Kernel stack top: end of the uarea, 16-byte aligned. */
	pcb->pcb_rsp0 = (uvm_lwp_getuarea(l) + KSTACK_SIZE - 16) & ~0xf;
	pcb->pcb_iopl = SEL_KPL;

	pmap_kernel()->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
	pcb->pcb_cr0 = rcr0() & ~CR0_TS;
	/* The trapframe sits immediately below the kernel stack top. */
	l->l_md.md_regs = (struct trapframe *)pcb->pcb_rsp0 - 1;

#if !defined(XEN)
	lldt(pmap_kernel()->pm_ldt_sel);
#else
	{
	struct physdev_op physop;
	xen_set_ldt((vaddr_t) ldtstore, LDT_SIZE >> 3);
	/* Reset TS bit and set kernel stack for interrupt handlers */
	HYPERVISOR_fpu_taskswitch(1);
	HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL), pcb->pcb_rsp0);
	physop.cmd = PHYSDEVOP_SET_IOPL;
	physop.u.set_iopl.iopl = pcb->pcb_iopl;
	HYPERVISOR_physdev_op(&physop);
	}
#endif /* XEN */
}

/*
 * Set up TSS and I/O bitmap.
 */
void
cpu_init_tss(struct cpu_info *ci)
{
	struct x86_64_tss *tss = &ci->ci_tss;
	uintptr_t p;

	/* No I/O bitmap: point tss_iobase beyond the TSS limit. */
	tss->tss_iobase = IOMAP_INVALOFF << 16;
	/* tss->tss_ist[0] is filled by cpu_intr_init */

	/* double fault */
	tss->tss_ist[1] = (uint64_t)x86_64_doubleflt_stack + PAGE_SIZE - 16;

	/* NMI: give it its own freshly wired stack page. */
	p = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_WIRED);
	tss->tss_ist[2] = p + PAGE_SIZE - 16;
	ci->ci_tss_sel = tss_alloc(tss);
}

/*
 * machine dependent system variables.
 */
/* sysctl handler: machdep.booted_kernel, from bootloader bootinfo. */
static int
sysctl_machdep_booted_kernel(SYSCTLFN_ARGS)
{
	struct btinfo_bootpath *bibp;
	struct sysctlnode node;

	bibp = lookup_bootinfo(BTINFO_BOOTPATH);
	if(!bibp)
		return(ENOENT); /* ??? */

	node = *rnode;
	node.sysctl_data = bibp->bootpath;
	node.sysctl_size = sizeof(bibp->bootpath);
	return (sysctl_lookup(SYSCTLFN_CALL(&node)));
}

/* sysctl handler: machdep.diskinfo, BIOS disk mapping from the bootloader. */
static int
sysctl_machdep_diskinfo(SYSCTLFN_ARGS)
{
	struct sysctlnode node;

	if (x86_alldisks == NULL)
		return (ENOENT);

	node = *rnode;
	node.sysctl_data = x86_alldisks;
	/* struct disklist already holds one nativedisk_info; add the rest. */
	node.sysctl_size = sizeof(struct disklist) +
	    (x86_ndisks - 1) * sizeof(struct nativedisk_info);
	return (sysctl_lookup(SYSCTLFN_CALL(&node)));
}

/* Create the machdep.* sysctl subtree. */
SYSCTL_SETUP(sysctl_machdep_setup, "sysctl machdep subtree setup")
{
	extern uint64_t tsc_freq;

	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_NODE, "machdep", NULL,
		       NULL, 0, NULL, 0,
		       CTL_MACHDEP, CTL_EOL);

	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_STRUCT, "console_device", NULL,
		       sysctl_consdev, 0, NULL, sizeof(dev_t),
		       CTL_MACHDEP, CPU_CONSDEV, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_STRING, "booted_kernel", NULL,
		       sysctl_machdep_booted_kernel, 0, NULL, 0,
		       CTL_MACHDEP, CPU_BOOTED_KERNEL, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_STRUCT, "diskinfo", NULL,
		       sysctl_machdep_diskinfo, 0, NULL, 0,
		       CTL_MACHDEP, CPU_DISKINFO, CTL_EOL);
	/* fpu/sse/sse2 are always present on amd64: immediate value 1. */
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT | CTLFLAG_IMMEDIATE,
		       CTLTYPE_INT, "fpu_present", NULL,
		       NULL, 1, NULL, 0,
		       CTL_MACHDEP, CPU_FPU_PRESENT, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT | CTLFLAG_IMMEDIATE,
		       CTLTYPE_INT, "sse", NULL,
		       NULL, 1, NULL, 0,
		       CTL_MACHDEP, CPU_SSE, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT | CTLFLAG_IMMEDIATE,
		       CTLTYPE_INT, "sse2", NULL,
		       NULL, 1, NULL, 0,
		       CTL_MACHDEP, CPU_SSE2, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_QUAD, "tsc_freq", NULL,
		       NULL, 0, &tsc_freq, 0,
		       CTL_MACHDEP, CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_INT, "pae",
		       SYSCTL_DESCR("Whether the kernel uses PAE"),
		       NULL, 0, &use_pae, 0,
		       CTL_MACHDEP, CTL_CREATE, CTL_EOL);
#ifndef NO_SPARSE_DUMP
	/* XXXjld Does this really belong under machdep, and not e.g. kern? */
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
		       CTLTYPE_INT, "sparse_dump", NULL,
		       NULL, 0, &sparse_dump, 0,
		       CTL_MACHDEP, CTL_CREATE, CTL_EOL);
#endif
}

/*
 * Load the user-mode register context for signal delivery: user segment
 * selectors, handler entry point (catcher) and stack pointer (f).
 */
void
buildcontext(struct lwp *l, void *catcher, void *f)
{
	struct trapframe *tf = l->l_md.md_regs;

	tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL);
	tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL);
	tf->tf_fs = GSEL(GUDATA_SEL, SEL_UPL);
	tf->tf_gs = GSEL(GUDATA_SEL, SEL_UPL);

	tf->tf_rip = (uint64_t)catcher;
	tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
	tf->tf_rflags &= ~PSL_CLEARSIG;
	tf->tf_rsp = (uint64_t)f;
	tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);

	/* Ensure FP state is reset, if FP is used. */
	l->l_md.md_flags &= ~MDP_USEDFPU;
}

/* Old sigcontext-style delivery is not supported on amd64. */
void
sendsig_sigcontext(const ksiginfo_t *ksi, const sigset_t *mask)
{

	printf("sendsig_sigcontext: illegal\n");
	sigexit(curlwp, SIGILL);
}

/* Deliver a signal to the current lwp with a siginfo-style frame. */
void
sendsig_siginfo(const ksiginfo_t *ksi, const sigset_t *mask)
{
	struct lwp *l = curlwp;
	struct proc *p = l->l_proc;
	struct sigacts *ps = p->p_sigacts;
	int onstack, tocopy, error;
	int sig = ksi->ksi_signo;
	struct sigframe_siginfo *fp, frame;
	sig_t catcher = SIGACTION(p, sig).sa_handler;
	struct trapframe *tf = l->l_md.md_regs;
	char *sp;

	KASSERT(mutex_owned(p->p_lock));

	/* Do we need to jump onto the signal stack? */
	onstack =
	    (l->l_sigstk.ss_flags & (SS_DISABLE | SS_ONSTACK)) == 0 &&
	    (SIGACTION(p, sig).sa_flags & SA_ONSTACK) != 0;

	/* Allocate space for the signal handler context.
 */
	if (onstack)
		sp = ((char *)l->l_sigstk.ss_sp + l->l_sigstk.ss_size);
	else
		/* -128: step over the ABI red zone below the user %rsp. */
		sp = (char *)tf->tf_rsp - 128;

	sp -= sizeof(struct sigframe_siginfo);
	/*
	 * Round down the stackpointer to a multiple of 16 for
	 * fxsave and the ABI.
	 */
	fp = (struct sigframe_siginfo *)(((unsigned long)sp & ~15) - 8);

	/*
	 * Don't bother copying out FP state if there is none.
	 */
	if (l->l_md.md_flags & MDP_USEDFPU)
		tocopy = sizeof (struct sigframe_siginfo);
	else
		tocopy = sizeof (struct sigframe_siginfo) -
		    sizeof (frame.sf_uc.uc_mcontext.__fpregs);

	/* Return address is the signal trampoline. */
	frame.sf_ra = (uint64_t)ps->sa_sigdesc[sig].sd_tramp;
	frame.sf_si._info = ksi->ksi_info;
	frame.sf_uc.uc_flags = _UC_SIGMASK;
	frame.sf_uc.uc_sigmask = *mask;
	frame.sf_uc.uc_link = l->l_ctxlink;
	frame.sf_uc.uc_flags |= (l->l_sigstk.ss_flags & SS_ONSTACK)
	    ? _UC_SETSTACK : _UC_CLRSTACK;
	memset(&frame.sf_uc.uc_stack, 0, sizeof(frame.sf_uc.uc_stack));
	sendsig_reset(l, sig);

	/* copyout() may fault/sleep: drop the proc lock around it. */
	mutex_exit(p->p_lock);
	cpu_getmcontext(l, &frame.sf_uc.uc_mcontext, &frame.sf_uc.uc_flags);
	error = copyout(&frame, fp, tocopy);
	mutex_enter(p->p_lock);

	if (error != 0) {
		/*
		 * Process has trashed its stack; give it an illegal
		 * instruction to halt it in its tracks.
		 */
		sigexit(l, SIGILL);
		/* NOTREACHED */
	}

	buildcontext(l, catcher, fp);

	/* Handler arguments: signo, siginfo pointer, ucontext pointer. */
	tf->tf_rdi = sig;
	tf->tf_rsi = (uint64_t)&fp->sf_si;
	tf->tf_rdx = tf->tf_r15 = (uint64_t)&fp->sf_uc;

	/* Remember that we're now on the signal stack. */
	if (onstack)
		l->l_sigstk.ss_flags |= SS_ONSTACK;

	if ((vaddr_t)catcher >= VM_MAXUSER_ADDRESS) {
		/*
		 * process has given an invalid address for the
		 * handler. Stop it, but do not do it before so
		 * we can return the right info to userland (or in core dump)
		 */
		sigexit(l, SIGILL);
		/* NOTREACHED */
	}
}

/*
 * Arrange for a scheduler-activations upcall to run in userland: load
 * the trapframe with the upcall entry point, its arguments, and an
 * aligned user stack.
 */
void
cpu_upcall(struct lwp *l, int type, int nevents, int ninterrupted, void *sas,
    void *ap, void *sp, sa_upcall_t upcall)
{
	struct trapframe *tf;

	tf = l->l_md.md_regs;

#if 0
	printf("proc %d: upcall to lwp %d, type %d ev %d int %d sas %p to %p\n",
	    (int)l->l_proc->p_pid, (int)l->l_lid, type, nevents, ninterrupted,
	    sas, (void *)upcall);
#endif

	/* Upcall arguments in the SysV argument registers. */
	tf->tf_rdi = type;
	tf->tf_rsi = (u_int64_t)sas;
	tf->tf_rdx = nevents;
	tf->tf_rcx = ninterrupted;
	tf->tf_r8 = (u_int64_t)ap;

	tf->tf_rip = (u_int64_t)upcall;
	/* 16-byte align the stack, biased by 8 as on a call instruction. */
	tf->tf_rsp = ((unsigned long)sp & ~15) - 8;
	tf->tf_rbp = 0;	/* indicate call-frame-top to debuggers */
	tf->tf_gs = GSEL(GUDATA_SEL, SEL_UPL);
	tf->tf_fs = GSEL(GUDATA_SEL, SEL_UPL);
	tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL);
	tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL);
	tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
	tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);
	tf->tf_rflags &= ~(PSL_T|PSL_VM|PSL_AC);

	l->l_md.md_flags |= MDP_IRET;
}

/* pcb saved for use while dumping core. */
struct pcb dumppcb;

/*
 * Halt, reboot, or power down the machine; optionally sync disks and
 * dump core first, as directed by howto.
 */
void
cpu_reboot(int howto, char *bootstr)
{
	static bool syncdone = false;
	int s = IPL_NONE;

	if (cold) {
		/* Too early to sync or dump: just halt. */
		howto |= RB_HALT;
		goto haltsys;
	}

	boothowto = howto;

	/* i386 maybe_dump() */

	/*
	 * If we've panic'd, don't make the situation potentially
	 * worse by syncing or unmounting the file systems.
	 */
	if ((howto & RB_NOSYNC) == 0 && panicstr == NULL) {
		if (!syncdone) {
			syncdone = true;
			/* XXX used to force unmount as well, here */
			vfs_sync_all(curlwp);
			/*
			 * If we've been adjusting the clock, the todr
			 * will be out of synch; adjust it now.
			 *
			 * XXX used to do this after unmounting all
			 * filesystems with vfs_shutdown().
 */
			if (time_adjusted != 0)
				resettodr();
		}

		/* Retry until unmounts, detaches and forced unmount settle. */
		while (vfs_unmountall1(curlwp, false, false) ||
		       config_detach_all(boothowto) ||
		       vfs_unmount_forceone(curlwp))
			;	/* do nothing */
	} else
		suspendsched();

	pmf_system_shutdown(boothowto);

	/* Disable interrupts. */
	s = splhigh();

	/* Do a dump if requested. */
	if ((howto & (RB_DUMP | RB_HALT)) == RB_DUMP)
		dumpsys();

haltsys:
	doshutdownhooks();

	if ((howto & RB_POWERDOWN) == RB_POWERDOWN) {
#ifndef XEN
#if NACPICA > 0
		if (s != IPL_NONE)
			splx(s);

		acpi_enter_sleep_state(ACPI_STATE_S5);
#endif
#else /* XEN */
		HYPERVISOR_shutdown();
#endif /* XEN */
	}

	cpu_broadcast_halt();

	if (howto & RB_HALT) {
#if NACPICA > 0
		acpi_disable();
#endif

		printf("\n");
		printf("The operating system has halted.\n");
		printf("Please press any key to reboot.\n\n");
		cnpollc(1);	/* for proper keyboard command handling */
		cngetc();
		cnpollc(0);
	}

	printf("rebooting...\n");
	if (cpureset_delay > 0)
		delay(cpureset_delay * 1000);
	cpu_reset();
	for(;;) ;
	/*NOTREACHED*/
}

/*
 * XXXfvdl share dumpcode.
 */

/*
 * Perform assorted dump-related initialization tasks.  Assumes that
 * the maximum physical memory address will not increase afterwards.
 */
void
dump_misc_init(void)
{
#ifndef NO_SPARSE_DUMP
	int i;
#endif

	if (dump_headerbuf != NULL)
		return; /* already called */

#ifndef NO_SPARSE_DUMP
	/* Find the highest physical address to size the sparse bitmap. */
	for (i = 0; i < mem_cluster_cnt; ++i) {
		paddr_t top = mem_clusters[i].start + mem_clusters[i].size;
		if (max_paddr < top)
			max_paddr = top;
	}
#ifdef DEBUG
	printf("dump_misc_init: max_paddr = 0x%lx\n",
	    (unsigned long)max_paddr);
#endif
	if (max_paddr == 0) {
		printf("Your machine does not initialize mem_clusters; "
		    "sparse_dumps disabled\n");
		sparse_dump = 0;
	} else {
		/* One bit per physical page, rounded up to whole pages. */
		sparse_dump_physmap = (void *)uvm_km_alloc(kernel_map,
		    roundup(max_paddr / (PAGE_SIZE * NBBY), PAGE_SIZE),
		    PAGE_SIZE, UVM_KMF_WIRED|UVM_KMF_ZERO);
	}
#endif
	dump_headerbuf = (void *)uvm_km_alloc(kernel_map,
	    dump_headerbuf_size,
	    PAGE_SIZE, UVM_KMF_WIRED|UVM_KMF_ZERO);
	/* XXXjld should check for failure here, disable dumps if so. */
}

#ifndef NO_SPARSE_DUMP
/*
 * Clear the set of pages to include in a sparse dump.
 */
void
sparse_dump_reset(void)
{
	memset(sparse_dump_physmap, 0,
	    roundup(max_paddr / (PAGE_SIZE * NBBY), PAGE_SIZE));
}

/*
 * Include or exclude pages in a sparse dump, by half-open virtual
 * address interval (which may wrap around the end of the space).
 */
void
sparse_dump_mark(vaddr_t vbegin, vaddr_t vend, int includep)
{
	pmap_t pmap;
	paddr_t p;
	vaddr_t v;

	/*
	 * If a partial page is called for, the whole page must be included.
 */
	if (includep) {
		/* Widen to enclosing page boundaries. */
		vbegin = rounddown(vbegin, PAGE_SIZE);
		vend = roundup(vend, PAGE_SIZE);
	} else {
		/* Narrow to fully-contained pages only. */
		vbegin = roundup(vbegin, PAGE_SIZE);
		vend = rounddown(vend, PAGE_SIZE);
	}

	pmap = pmap_kernel();
	/* != rather than < : the interval may wrap around the address space. */
	for (v = vbegin; v != vend; v += PAGE_SIZE) {
		if (pmap_extract(pmap, v, &p)) {
			if (includep)
				setbit(sparse_dump_physmap, p/PAGE_SIZE);
			else
				clrbit(sparse_dump_physmap, p/PAGE_SIZE);
		}
	}
}

/*
 * Machine-dependently decides on the contents of a sparse dump, using
 * the above.
 */
void
cpu_dump_prep_sparse(void)
{
	sparse_dump_reset();
	/* XXX could the alternate recursive page table be skipped? */
	sparse_dump_mark((vaddr_t)PTE_BASE, (vaddr_t)KERN_BASE, 1);
	/* Memory for I/O buffers could be unmarked here, for example. */
	/* The kernel text could also be unmarked, but gdb would be upset. */
}
#endif

/*
 * Abstractly iterate over the collection of memory segments to be
 * dumped; the callback lacks the customary environment-pointer
 * argument because none of the current users really need one.
 *
 * To be used only after dump_seg_prep is called to set things up.
 */
int
dump_seg_iter(int (*callback)(paddr_t, paddr_t))
{
	int error, i;

#define CALLBACK(start,size) do {     \
	error = callback(start,size); \
	if (error)                    \
		return error;         \
} while(0)

	for (i = 0; i < mem_cluster_cnt; ++i) {
#ifndef NO_SPARSE_DUMP
		/*
		 * The bitmap is scanned within each memory segment,
		 * rather than over its entire domain, in case any
		 * pages outside of the memory proper have been mapped
		 * into kva; they might be devices that wouldn't
		 * appreciate being arbitrarily read, and including
		 * them could also break the assumption that a sparse
		 * dump will always be smaller than a full one.
		 */
		if (sparse_dump && sparse_dump_physmap) {
			paddr_t p, start, end;
			int lastset;

			start = mem_clusters[i].start;
			end = start + mem_clusters[i].size;
			start = rounddown(start, PAGE_SIZE); /* unnecessary? */
			lastset = 0;
			/* Emit one callback per run of consecutive set bits. */
			for (p = start; p < end; p += PAGE_SIZE) {
				int thisset = isset(sparse_dump_physmap,
				    p/PAGE_SIZE);

				if (!lastset && thisset)
					start = p;
				if (lastset && !thisset)
					CALLBACK(start, p - start);
				lastset = thisset;
			}
			/* Close a run still open at the end of the segment. */
			if (lastset)
				CALLBACK(start, p - start);
		} else
#endif
			CALLBACK(mem_clusters[i].start, mem_clusters[i].size);
	}
	return 0;
#undef CALLBACK
}

/*
 * Prepare for an impending core dump: decide what's being dumped and
 * how much space it will take up.
 */
void
dump_seg_prep(void)
{
#ifndef NO_SPARSE_DUMP
	if (sparse_dump && sparse_dump_physmap)
		cpu_dump_prep_sparse();
#endif

	dump_nmemsegs = 0;
	dump_npages = 0;
	dump_seg_iter(dump_seg_count_range);

	dump_header_size = ALIGN(sizeof(kcore_seg_t)) +
	    ALIGN(sizeof(cpu_kcore_hdr_t)) +
	    ALIGN(dump_nmemsegs * sizeof(phys_ram_seg_t));
	dump_header_size = roundup(dump_header_size, dbtob(1));

	/*
	 * savecore(8) will read this to decide how many pages to
	 * copy, and cpu_dumpconf has already used the pessimistic
	 * value to set dumplo, so it's time to tell the truth.
	 */
	dumpsize = dump_npages; /* XXX could these just be one variable? */
}

/* dump_seg_iter callback: tally segments and pages to be dumped. */
int
dump_seg_count_range(paddr_t start, paddr_t size)
{
	++dump_nmemsegs;
	dump_npages += size / PAGE_SIZE;
	return 0;
}

/*
 * A sparse dump's header may be rather large, due to the number of
 * "segments" emitted.  These routines manage a simple output buffer,
 * so that the header can be written to disk incrementally.
1059 */ 1060void 1061dump_header_start(void) 1062{ 1063 dump_headerbuf_ptr = dump_headerbuf; 1064 dump_header_blkno = dumplo; 1065} 1066 1067int 1068dump_header_flush(void) 1069{ 1070 const struct bdevsw *bdev; 1071 size_t to_write; 1072 int error; 1073 1074 bdev = bdevsw_lookup(dumpdev); 1075 to_write = roundup(dump_headerbuf_ptr - dump_headerbuf, dbtob(1)); 1076 error = bdev->d_dump(dumpdev, dump_header_blkno, 1077 dump_headerbuf, to_write); 1078 dump_header_blkno += btodb(to_write); 1079 dump_headerbuf_ptr = dump_headerbuf; 1080 return error; 1081} 1082 1083int 1084dump_header_addbytes(const void* vptr, size_t n) 1085{ 1086 const char* ptr = vptr; 1087 int error; 1088 1089 while (n > dump_headerbuf_avail) { 1090 memcpy(dump_headerbuf_ptr, ptr, dump_headerbuf_avail); 1091 ptr += dump_headerbuf_avail; 1092 n -= dump_headerbuf_avail; 1093 dump_headerbuf_ptr = dump_headerbuf_end; 1094 error = dump_header_flush(); 1095 if (error) 1096 return error; 1097 } 1098 memcpy(dump_headerbuf_ptr, ptr, n); 1099 dump_headerbuf_ptr += n; 1100 1101 return 0; 1102} 1103 1104int 1105dump_header_addseg(paddr_t start, paddr_t size) 1106{ 1107 phys_ram_seg_t seg = { start, size }; 1108 1109 return dump_header_addbytes(&seg, sizeof(seg)); 1110} 1111 1112int 1113dump_header_finish(void) 1114{ 1115 memset(dump_headerbuf_ptr, 0, dump_headerbuf_avail); 1116 return dump_header_flush(); 1117} 1118 1119 1120/* 1121 * These variables are needed by /sbin/savecore 1122 */ 1123uint32_t dumpmag = 0x8fca0101; /* magic number */ 1124int dumpsize = 0; /* pages */ 1125long dumplo = 0; /* blocks */ 1126 1127/* 1128 * cpu_dumpsize: calculate size of machine-dependent kernel core dump headers 1129 * for a full (non-sparse) dump. 
1130 */ 1131int 1132cpu_dumpsize(void) 1133{ 1134 int size; 1135 1136 size = ALIGN(sizeof(kcore_seg_t)) + ALIGN(sizeof(cpu_kcore_hdr_t)) + 1137 ALIGN(mem_cluster_cnt * sizeof(phys_ram_seg_t)); 1138 if (roundup(size, dbtob(1)) != dbtob(1)) 1139 return (-1); 1140 1141 return (1); 1142} 1143 1144/* 1145 * cpu_dump_mempagecnt: calculate the size of RAM (in pages) to be dumped 1146 * for a full (non-sparse) dump. 1147 */ 1148u_long 1149cpu_dump_mempagecnt(void) 1150{ 1151 u_long i, n; 1152 1153 n = 0; 1154 for (i = 0; i < mem_cluster_cnt; i++) 1155 n += atop(mem_clusters[i].size); 1156 return (n); 1157} 1158 1159/* 1160 * cpu_dump: dump the machine-dependent kernel core dump headers. 1161 */ 1162int 1163cpu_dump(void) 1164{ 1165 int (*dump)(dev_t, daddr_t, void *, size_t); 1166 kcore_seg_t seg; 1167 cpu_kcore_hdr_t cpuhdr; 1168 const struct bdevsw *bdev; 1169 1170 bdev = bdevsw_lookup(dumpdev); 1171 if (bdev == NULL) 1172 return (ENXIO); 1173 1174 dump = bdev->d_dump; 1175 1176 /* 1177 * Generate a segment header. 1178 */ 1179 CORE_SETMAGIC(seg, KCORE_MAGIC, MID_MACHINE, CORE_CPU); 1180 seg.c_size = dump_header_size - ALIGN(sizeof(seg)); 1181 (void)dump_header_addbytes(&seg, ALIGN(sizeof(seg))); 1182 1183 /* 1184 * Add the machine-dependent header info. 1185 */ 1186 cpuhdr.ptdpaddr = PDPpaddr; 1187 cpuhdr.nmemsegs = dump_nmemsegs; 1188 (void)dump_header_addbytes(&cpuhdr, ALIGN(sizeof(cpuhdr))); 1189 1190 /* 1191 * Write out the memory segment descriptors. 1192 */ 1193 return dump_seg_iter(dump_header_addseg); 1194} 1195 1196/* 1197 * Doadump comes here after turning off memory management and 1198 * getting on the dump stack, either when called above, or by 1199 * the auto-restart code. 
1200 */ 1201#define BYTES_PER_DUMP PAGE_SIZE /* must be a multiple of pagesize XXX small */ 1202static vaddr_t dumpspace; 1203 1204vaddr_t 1205reserve_dumppages(vaddr_t p) 1206{ 1207 1208 dumpspace = p; 1209 return (p + BYTES_PER_DUMP); 1210} 1211 1212int 1213dumpsys_seg(paddr_t maddr, paddr_t bytes) 1214{ 1215 u_long i, m, n; 1216 daddr_t blkno; 1217 const struct bdevsw *bdev; 1218 int (*dump)(dev_t, daddr_t, void *, size_t); 1219 int error; 1220 1221 if (dumpdev == NODEV) 1222 return ENODEV; 1223 bdev = bdevsw_lookup(dumpdev); 1224 if (bdev == NULL || bdev->d_psize == NULL) 1225 return ENODEV; 1226 1227 dump = bdev->d_dump; 1228 1229 blkno = dump_header_blkno; 1230 for (i = 0; i < bytes; i += n, dump_totalbytesleft -= n) { 1231 /* Print out how many MBs we have left to go. */ 1232 if ((dump_totalbytesleft % (1024*1024)) == 0) 1233 printf_nolog("%lu ", (unsigned long) 1234 (dump_totalbytesleft / (1024 * 1024))); 1235 1236 /* Limit size for next transfer. */ 1237 n = bytes - i; 1238 if (n > BYTES_PER_DUMP) 1239 n = BYTES_PER_DUMP; 1240 1241 for (m = 0; m < n; m += NBPG) 1242 pmap_kenter_pa(dumpspace + m, maddr + m, 1243 VM_PROT_READ, 0); 1244 pmap_update(pmap_kernel()); 1245 1246 error = (*dump)(dumpdev, blkno, (void *)dumpspace, n); 1247 if (error) 1248 return error; 1249 maddr += n; 1250 blkno += btodb(n); /* XXX? */ 1251 1252#if 0 /* XXX this doesn't work. grr. */ 1253 /* operator aborting dump? */ 1254 if (sget() != NULL) 1255 return EINTR; 1256#endif 1257 } 1258 dump_header_blkno = blkno; 1259 1260 return 0; 1261} 1262 1263void 1264dodumpsys(void) 1265{ 1266 const struct bdevsw *bdev; 1267 int dumpend, psize; 1268 int error; 1269 1270 if (dumpdev == NODEV) 1271 return; 1272 1273 bdev = bdevsw_lookup(dumpdev); 1274 if (bdev == NULL || bdev->d_psize == NULL) 1275 return; 1276 /* 1277 * For dumps during autoconfiguration, 1278 * if dump device has already configured... 
1279 */ 1280 if (dumpsize == 0) 1281 cpu_dumpconf(); 1282 if (dumplo <= 0 || dumpsize == 0) { 1283 printf("\ndump to dev %u,%u not possible\n", major(dumpdev), 1284 minor(dumpdev)); 1285 return; 1286 } 1287 printf("\ndumping to dev %llu,%llu offset %ld\n", 1288 (unsigned long long)major(dumpdev), 1289 (unsigned long long)minor(dumpdev), dumplo); 1290 1291 psize = bdev_size(dumpdev); 1292 printf("dump "); 1293 if (psize == -1) { 1294 printf("area unavailable\n"); 1295 return; 1296 } 1297 1298#if 0 /* XXX this doesn't work. grr. */ 1299 /* toss any characters present prior to dump */ 1300 while (sget() != NULL); /*syscons and pccons differ */ 1301#endif 1302 1303 dump_seg_prep(); 1304 dumpend = dumplo + btodb(dump_header_size) + ctod(dump_npages); 1305 if (dumpend > psize) { 1306 printf("failed: insufficient space (%d < %d)\n", 1307 psize, dumpend); 1308 goto failed; 1309 } 1310 1311 dump_header_start(); 1312 if ((error = cpu_dump()) != 0) 1313 goto err; 1314 if ((error = dump_header_finish()) != 0) 1315 goto err; 1316 1317 if (dump_header_blkno != dumplo + btodb(dump_header_size)) { 1318 printf("BAD header size (%ld [written] != %ld [expected])\n", 1319 (long)(dump_header_blkno - dumplo), 1320 (long)btodb(dump_header_size)); 1321 goto failed; 1322 } 1323 1324 dump_totalbytesleft = roundup(ptoa(dump_npages), BYTES_PER_DUMP); 1325 error = dump_seg_iter(dumpsys_seg); 1326 1327 if (error == 0 && dump_header_blkno != dumpend) { 1328 printf("BAD dump size (%ld [written] != %ld [expected])\n", 1329 (long)(dumpend - dumplo), 1330 (long)(dump_header_blkno - dumplo)); 1331 goto failed; 1332 } 1333 1334err: 1335 switch (error) { 1336 1337 case ENXIO: 1338 printf("device bad\n"); 1339 break; 1340 1341 case EFAULT: 1342 printf("device not ready\n"); 1343 break; 1344 1345 case EINVAL: 1346 printf("area improper\n"); 1347 break; 1348 1349 case EIO: 1350 printf("i/o error\n"); 1351 break; 1352 1353 case EINTR: 1354 printf("aborted from console\n"); 1355 break; 1356 1357 case 0: 
1358 printf("succeeded\n"); 1359 break; 1360 1361 default: 1362 printf("error %d\n", error); 1363 break; 1364 } 1365failed: 1366 printf("\n\n"); 1367 delay(5000000); /* 5 seconds */ 1368} 1369 1370/* 1371 * This is called by main to set dumplo and dumpsize. 1372 * Dumps always skip the first PAGE_SIZE of disk space 1373 * in case there might be a disk label stored there. 1374 * If there is extra space, put dump at the end to 1375 * reduce the chance that swapping trashes it. 1376 * 1377 * Sparse dumps can't placed as close to the end as possible, because 1378 * savecore(8) has to know where to start reading in the dump device 1379 * before it has access to any of the crashed system's state. 1380 * 1381 * Note also that a sparse dump will never be larger than a full one: 1382 * in order to add a phys_ram_seg_t to the header, at least one page 1383 * must be removed. 1384 */ 1385void 1386cpu_dumpconf(void) 1387{ 1388 int nblks, dumpblks; /* size of dump area */ 1389 1390 if (dumpdev == NODEV) 1391 goto bad; 1392 nblks = bdev_size(dumpdev); 1393 if (nblks <= ctod(1)) 1394 goto bad; 1395 1396 dumpblks = cpu_dumpsize(); 1397 if (dumpblks < 0) 1398 goto bad; 1399 dumpblks += ctod(cpu_dump_mempagecnt()); 1400 1401 /* If dump won't fit (incl. room for possible label), punt. */ 1402 if (dumpblks > (nblks - ctod(1))) { 1403#ifndef NO_SPARSE_DUMP 1404 /* A sparse dump might (and hopefully will) fit. */ 1405 dumplo = ctod(1); 1406#else 1407 /* But if we're not configured for that, punt. */ 1408 goto bad; 1409#endif 1410 } else { 1411 /* Put dump at end of partition */ 1412 dumplo = nblks - dumpblks; 1413 } 1414 1415 /* dumpsize is in page units, and doesn't include headers. */ 1416 dumpsize = cpu_dump_mempagecnt(); 1417 1418 /* Now that we've decided this will work, init ancillary stuff. 
*/ 1419 dump_misc_init(); 1420 return; 1421 1422 bad: 1423 dumpsize = 0; 1424} 1425 1426/* 1427 * Clear registers on exec 1428 */ 1429void 1430setregs(struct lwp *l, struct exec_package *pack, vaddr_t stack) 1431{ 1432 struct pcb *pcb = lwp_getpcb(l); 1433 struct trapframe *tf; 1434 1435 /* If we were using the FPU, forget about it. */ 1436 if (pcb->pcb_fpcpu != NULL) { 1437 fpusave_lwp(l, false); 1438 } 1439 1440#ifdef USER_LDT 1441 pmap_ldt_cleanup(l); 1442#endif 1443 1444 l->l_md.md_flags &= ~MDP_USEDFPU; 1445 pcb->pcb_flags = 0; 1446 pcb->pcb_savefpu.fp_fxsave.fx_fcw = __NetBSD_NPXCW__; 1447 pcb->pcb_savefpu.fp_fxsave.fx_mxcsr = __INITIAL_MXCSR__; 1448 pcb->pcb_savefpu.fp_fxsave.fx_mxcsr_mask = __INITIAL_MXCSR_MASK__; 1449 1450 l->l_proc->p_flag &= ~PK_32; 1451 1452 tf = l->l_md.md_regs; 1453 tf->tf_ds = LSEL(LUDATA_SEL, SEL_UPL); 1454 tf->tf_es = LSEL(LUDATA_SEL, SEL_UPL); 1455 cpu_fsgs_zero(l); 1456 tf->tf_rdi = 0; 1457 tf->tf_rsi = 0; 1458 tf->tf_rbp = 0; 1459 tf->tf_rbx = l->l_proc->p_psstrp; 1460 tf->tf_rdx = 0; 1461 tf->tf_rcx = 0; 1462 tf->tf_rax = 0; 1463 tf->tf_rip = pack->ep_entry; 1464 tf->tf_cs = LSEL(LUCODE_SEL, SEL_UPL); 1465 tf->tf_rflags = PSL_USERSET; 1466 tf->tf_rsp = stack; 1467 tf->tf_ss = LSEL(LUDATA_SEL, SEL_UPL); 1468} 1469 1470/* 1471 * Initialize segments and descriptor tables 1472 */ 1473 1474#ifdef XEN 1475struct trap_info *xen_idt; 1476int xen_idt_idx; 1477#endif 1478char *ldtstore; 1479char *gdtstore; 1480 1481void 1482setgate(struct gate_descriptor *gd, void *func, int ist, int type, int dpl, int sel) 1483{ 1484 1485 kpreempt_disable(); 1486 pmap_changeprot_local(idt_vaddr, VM_PROT_READ|VM_PROT_WRITE); 1487 1488 gd->gd_looffset = (uint64_t)func & 0xffff; 1489 gd->gd_selector = sel; 1490 gd->gd_ist = ist; 1491 gd->gd_type = type; 1492 gd->gd_dpl = dpl; 1493 gd->gd_p = 1; 1494 gd->gd_hioffset = (uint64_t)func >> 16; 1495 gd->gd_zero = 0; 1496 gd->gd_xx1 = 0; 1497 gd->gd_xx2 = 0; 1498 gd->gd_xx3 = 0; 1499 1500 
pmap_changeprot_local(idt_vaddr, VM_PROT_READ); 1501 kpreempt_enable(); 1502} 1503 1504void 1505unsetgate(struct gate_descriptor *gd) 1506{ 1507 1508 kpreempt_disable(); 1509 pmap_changeprot_local(idt_vaddr, VM_PROT_READ|VM_PROT_WRITE); 1510 1511 memset(gd, 0, sizeof (*gd)); 1512 1513 pmap_changeprot_local(idt_vaddr, VM_PROT_READ); 1514 kpreempt_enable(); 1515} 1516 1517void 1518setregion(struct region_descriptor *rd, void *base, uint16_t limit) 1519{ 1520 rd->rd_limit = limit; 1521 rd->rd_base = (uint64_t)base; 1522} 1523 1524/* 1525 * Note that the base and limit fields are ignored in long mode. 1526 */ 1527void 1528set_mem_segment(struct mem_segment_descriptor *sd, void *base, size_t limit, 1529 int type, int dpl, int gran, int def32, int is64) 1530{ 1531 sd->sd_lolimit = (unsigned)limit; 1532 sd->sd_lobase = (unsigned long)base; 1533 sd->sd_type = type; 1534 sd->sd_dpl = dpl; 1535 sd->sd_p = 1; 1536 sd->sd_hilimit = (unsigned)limit >> 16; 1537 sd->sd_avl = 0; 1538 sd->sd_long = is64; 1539 sd->sd_def32 = def32; 1540 sd->sd_gran = gran; 1541 sd->sd_hibase = (unsigned long)base >> 24; 1542} 1543 1544void 1545set_sys_segment(struct sys_segment_descriptor *sd, void *base, size_t limit, 1546 int type, int dpl, int gran) 1547{ 1548 memset(sd, 0, sizeof *sd); 1549 sd->sd_lolimit = (unsigned)limit; 1550 sd->sd_lobase = (uint64_t)base; 1551 sd->sd_type = type; 1552 sd->sd_dpl = dpl; 1553 sd->sd_p = 1; 1554 sd->sd_hilimit = (unsigned)limit >> 16; 1555 sd->sd_gran = gran; 1556 sd->sd_hibase = (uint64_t)base >> 24; 1557} 1558 1559void 1560cpu_init_idt(void) 1561{ 1562#ifndef XEN 1563 struct region_descriptor region; 1564 1565 setregion(®ion, idt, NIDT * sizeof(idt[0]) - 1); 1566 lidt(®ion); 1567#else 1568 if (HYPERVISOR_set_trap_table(xen_idt)) 1569 panic("HYPERVISOR_set_trap_table() failed"); 1570#endif 1571} 1572 1573#define IDTVEC(name) __CONCAT(X, name) 1574typedef void (vector)(void); 1575extern vector IDTVEC(syscall); 1576extern vector IDTVEC(syscall32); 1577extern 
vector IDTVEC(osyscall); 1578extern vector IDTVEC(oosyscall); 1579extern vector *IDTVEC(exceptions)[]; 1580 1581static void 1582init_x86_64_msgbuf(void) 1583{ 1584 /* Message buffer is located at end of core. */ 1585 struct vm_physseg *vps; 1586 psize_t sz = round_page(MSGBUFSIZE); 1587 psize_t reqsz = sz; 1588 int x; 1589 1590 search_again: 1591 vps = NULL; 1592 1593 for (x = 0; x < vm_nphysseg; x++) { 1594 vps = VM_PHYSMEM_PTR(x); 1595 if (ctob(vps->avail_end) == avail_end) 1596 break; 1597 } 1598 if (x == vm_nphysseg) 1599 panic("init_x86_64: can't find end of memory"); 1600 1601 /* Shrink so it'll fit in the last segment. */ 1602 if ((vps->avail_end - vps->avail_start) < atop(sz)) 1603 sz = ctob(vps->avail_end - vps->avail_start); 1604 1605 vps->avail_end -= atop(sz); 1606 vps->end -= atop(sz); 1607 msgbuf_p_seg[msgbuf_p_cnt].sz = sz; 1608 msgbuf_p_seg[msgbuf_p_cnt++].paddr = ctob(vps->avail_end); 1609 1610 /* Remove the last segment if it now has no pages. */ 1611 if (vps->start == vps->end) { 1612 for (vm_nphysseg--; x < vm_nphysseg; x++) 1613 VM_PHYSMEM_PTR_SWAP(x, x + 1); 1614 } 1615 1616 /* Now find where the new avail_end is. */ 1617 for (avail_end = 0, x = 0; x < vm_nphysseg; x++) 1618 if (VM_PHYSMEM_PTR(x)->avail_end > avail_end) 1619 avail_end = VM_PHYSMEM_PTR(x)->avail_end; 1620 avail_end = ctob(avail_end); 1621 1622 if (sz == reqsz) 1623 return; 1624 1625 reqsz -= sz; 1626 if (msgbuf_p_cnt == VM_PHYSSEG_MAX) { 1627 /* No more segments available, bail out. 
*/ 1628 printf("WARNING: MSGBUFSIZE (%zu) too large, using %zu.\n", 1629 (size_t)MSGBUFSIZE, (size_t)(MSGBUFSIZE - reqsz)); 1630 return; 1631 } 1632 1633 sz = reqsz; 1634 goto search_again; 1635} 1636 1637static void 1638init_x86_64_ksyms(void) 1639{ 1640#if NKSYMS || defined(DDB) || defined(MODULAR) 1641 extern int end; 1642 extern int *esym; 1643#ifndef XEN 1644 struct btinfo_symtab *symtab; 1645 vaddr_t tssym, tesym; 1646#endif 1647 1648#ifdef DDB 1649 db_machine_init(); 1650#endif 1651 1652#ifndef XEN 1653 symtab = lookup_bootinfo(BTINFO_SYMTAB); 1654 if (symtab) { 1655 tssym = (vaddr_t)symtab->ssym + KERNBASE; 1656 tesym = (vaddr_t)symtab->esym + KERNBASE; 1657 ksyms_addsyms_elf(symtab->nsym, (void *)tssym, (void *)tesym); 1658 } else 1659 ksyms_addsyms_elf(*(long *)(void *)&end, 1660 ((long *)(void *)&end) + 1, esym); 1661#else /* XEN */ 1662 esym = xen_start_info.mod_start ? 1663 (void *)xen_start_info.mod_start : 1664 (void *)xen_start_info.mfn_list; 1665 ksyms_addsyms_elf(*(int *)(void *)&end, 1666 ((int *)(void *)&end) + 1, esym); 1667#endif /* XEN */ 1668#endif 1669} 1670 1671void 1672init_x86_64(paddr_t first_avail) 1673{ 1674 extern void consinit(void); 1675 struct region_descriptor region; 1676 struct mem_segment_descriptor *ldt_segp; 1677 struct pcb *pcb; 1678 int x; 1679#ifndef XEN 1680 int ist; 1681 extern struct extent *iomem_ex; 1682#if !defined(REALEXTMEM) && !defined(REALBASEMEM) 1683 struct btinfo_memmap *bim; 1684#endif 1685#endif /* !XEN */ 1686 1687 cpu_probe(&cpu_info_primary); 1688 1689#ifdef XEN 1690 KASSERT(HYPERVISOR_shared_info != NULL); 1691 cpu_info_primary.ci_vcpu = &HYPERVISOR_shared_info->vcpu_info[0]; 1692 1693 __PRINTK(("init_x86_64(0x%lx)\n", first_avail)); 1694#endif /* XEN */ 1695 1696 cpu_init_msrs(&cpu_info_primary, true); 1697 1698 pcb = lwp_getpcb(&lwp0); 1699 1700 use_pae = 1; /* PAE always enabled in long mode */ 1701 1702#ifdef XEN 1703 mutex_init(&pte_lock, MUTEX_DEFAULT, IPL_VM); 1704 pcb->pcb_cr3 = 
xen_start_info.pt_base - KERNBASE; 1705 __PRINTK(("pcb_cr3 0x%lx\n", xen_start_info.pt_base - KERNBASE)); 1706#endif 1707 1708#if NISA > 0 || NPCI > 0 1709 x86_bus_space_init(); 1710#endif 1711 1712 consinit(); /* XXX SHOULD NOT BE DONE HERE */ 1713 1714 /* 1715 * Initialize PAGE_SIZE-dependent variables. 1716 */ 1717 uvm_setpagesize(); 1718 1719 uvmexp.ncolors = 2; 1720 1721#ifndef XEN 1722 /* 1723 * Low memory reservations: 1724 * Page 0: BIOS data 1725 * Page 1: BIOS callback (not used yet, for symmetry with i386) 1726 * Page 2: MP bootstrap 1727 * Page 3: ACPI wakeup code 1728 * Page 4: Temporary page table for 0MB-4MB 1729 * Page 5: Temporary page directory 1730 * Page 6: Temporary page map level 3 1731 * Page 7: Temporary page map level 4 1732 */ 1733 avail_start = 8 * PAGE_SIZE; 1734 1735#if !defined(REALBASEMEM) && !defined(REALEXTMEM) 1736 1737 /* 1738 * Check to see if we have a memory map from the BIOS (passed 1739 * to us by the boot program. 1740 */ 1741 bim = lookup_bootinfo(BTINFO_MEMMAP); 1742 if (bim != NULL && bim->num > 0) 1743 initx86_parse_memmap(bim, iomem_ex); 1744 1745#endif /* ! REALBASEMEM && ! REALEXTMEM */ 1746 1747 /* 1748 * If the loop above didn't find any valid segment, fall back to 1749 * former code. 1750 */ 1751 if (mem_cluster_cnt == 0) 1752 initx86_fake_memmap(iomem_ex); 1753 1754#else /* XEN */ 1755 /* Parse Xen command line (replace bootinfo */ 1756 xen_parse_cmdline(XEN_PARSE_BOOTFLAGS, NULL); 1757 1758 /* Determine physical address space */ 1759 avail_start = first_avail; 1760 avail_end = ctob(xen_start_info.nr_pages); 1761 pmap_pa_start = (KERNTEXTOFF - KERNBASE); 1762 pmap_pa_end = avail_end; 1763 __PRINTK(("pmap_pa_start 0x%lx avail_start 0x%lx avail_end 0x%lx\n", 1764 pmap_pa_start, avail_start, avail_end)); 1765#endif /* !XEN */ 1766 1767 /* 1768 * Call pmap initialization to make new kernel address space. 1769 * We must do this before loading pages into the VM system. 
1770 */ 1771 pmap_bootstrap(VM_MIN_KERNEL_ADDRESS); 1772 1773 if (avail_start != PAGE_SIZE) 1774 pmap_prealloc_lowmem_ptps(); 1775 1776#ifndef XEN 1777 initx86_load_memmap(first_avail); 1778 1779#else /* XEN */ 1780 kern_end = KERNBASE + first_avail; 1781 physmem = xen_start_info.nr_pages; 1782 1783 uvm_page_physload(atop(avail_start), 1784 atop(avail_end), atop(avail_start), 1785 atop(avail_end), VM_FREELIST_DEFAULT); 1786#endif /* !XEN */ 1787 1788 init_x86_64_msgbuf(); 1789 1790 pmap_growkernel(VM_MIN_KERNEL_ADDRESS + 32 * 1024 * 1024); 1791 1792 kpreempt_disable(); 1793 pmap_kenter_pa(idt_vaddr, idt_paddr, VM_PROT_READ|VM_PROT_WRITE, 0); 1794 pmap_update(pmap_kernel()); 1795 memset((void *)idt_vaddr, 0, PAGE_SIZE); 1796 1797#ifndef XEN 1798 pmap_changeprot_local(idt_vaddr, VM_PROT_READ); 1799#endif 1800 pmap_kenter_pa(idt_vaddr + PAGE_SIZE, idt_paddr + PAGE_SIZE, 1801 VM_PROT_READ|VM_PROT_WRITE, 0); 1802#ifdef XEN 1803 /* Steal one more page for LDT */ 1804 pmap_kenter_pa(idt_vaddr + 2 * PAGE_SIZE, idt_paddr + 2 * PAGE_SIZE, 1805 VM_PROT_READ|VM_PROT_WRITE, 0); 1806#endif 1807 pmap_kenter_pa(lo32_vaddr, lo32_paddr, VM_PROT_READ|VM_PROT_WRITE, 0); 1808 pmap_update(pmap_kernel()); 1809 1810#ifndef XEN 1811 idt_init(); 1812 idt = (struct gate_descriptor *)idt_vaddr; 1813 gdtstore = (char *)(idt + NIDT); 1814 ldtstore = gdtstore + DYNSEL_START; 1815#else 1816 xen_idt = (struct trap_info *)idt_vaddr; 1817 xen_idt_idx = 0; 1818 /* Xen wants page aligned GDT/LDT in separated pages */ 1819 ldtstore = (char *) roundup((vaddr_t) (xen_idt + NIDT), PAGE_SIZE); 1820 gdtstore = (char *) (ldtstore + PAGE_SIZE); 1821#endif /* XEN */ 1822 1823 /* make gdt gates and memory segments */ 1824 set_mem_segment(GDT_ADDR_MEM(gdtstore, GCODE_SEL), 0, 1825 0xfffff, SDT_MEMERA, SEL_KPL, 1, 0, 1); 1826 1827 set_mem_segment(GDT_ADDR_MEM(gdtstore, GDATA_SEL), 0, 1828 0xfffff, SDT_MEMRWA, SEL_KPL, 1, 0, 1); 1829 1830#ifndef XEN 1831 set_sys_segment(GDT_ADDR_SYS(gdtstore, GLDT_SEL), ldtstore, 
1832 LDT_SIZE - 1, SDT_SYSLDT, SEL_KPL, 0); 1833#endif 1834 1835 set_mem_segment(GDT_ADDR_MEM(gdtstore, GUCODE_SEL), 0, 1836 x86_btop(VM_MAXUSER_ADDRESS) - 1, SDT_MEMERA, SEL_UPL, 1, 0, 1); 1837 1838 set_mem_segment(GDT_ADDR_MEM(gdtstore, GUDATA_SEL), 0, 1839 x86_btop(VM_MAXUSER_ADDRESS) - 1, SDT_MEMRWA, SEL_UPL, 1, 0, 1); 1840 1841 /* make ldt gates and memory segments */ 1842 setgate((struct gate_descriptor *)(ldtstore + LSYS5CALLS_SEL), 1843 &IDTVEC(oosyscall), 0, SDT_SYS386CGT, SEL_UPL, 1844 GSEL(GCODE_SEL, SEL_KPL)); 1845 *(struct mem_segment_descriptor *)(ldtstore + LUCODE_SEL) = 1846 *GDT_ADDR_MEM(gdtstore, GUCODE_SEL); 1847 *(struct mem_segment_descriptor *)(ldtstore + LUDATA_SEL) = 1848 *GDT_ADDR_MEM(gdtstore, GUDATA_SEL); 1849 1850 /* 1851 * 32 bit GDT entries. 1852 */ 1853 1854 set_mem_segment(GDT_ADDR_MEM(gdtstore, GUCODE32_SEL), 0, 1855 x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMERA, SEL_UPL, 1, 1, 0); 1856 1857 set_mem_segment(GDT_ADDR_MEM(gdtstore, GUDATA32_SEL), 0, 1858 x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMRWA, SEL_UPL, 1, 1, 0); 1859 1860 set_mem_segment(GDT_ADDR_MEM(gdtstore, GUFS_SEL), 0, 1861 x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMRWA, SEL_UPL, 1, 1, 0); 1862 1863 set_mem_segment(GDT_ADDR_MEM(gdtstore, GUGS_SEL), 0, 1864 x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMRWA, SEL_UPL, 1, 1, 0); 1865 1866 /* 1867 * 32 bit LDT entries. 1868 */ 1869 ldt_segp = (struct mem_segment_descriptor *)(ldtstore + LUCODE32_SEL); 1870 set_mem_segment(ldt_segp, 0, x86_btop(VM_MAXUSER_ADDRESS32) - 1, 1871 SDT_MEMERA, SEL_UPL, 1, 1, 0); 1872 ldt_segp = (struct mem_segment_descriptor *)(ldtstore + LUDATA32_SEL); 1873 set_mem_segment(ldt_segp, 0, x86_btop(VM_MAXUSER_ADDRESS32) - 1, 1874 SDT_MEMRWA, SEL_UPL, 1, 1, 0); 1875 1876 /* 1877 * Other entries. 
1878 */ 1879 memcpy((struct gate_descriptor *)(ldtstore + LSOL26CALLS_SEL), 1880 (struct gate_descriptor *)(ldtstore + LSYS5CALLS_SEL), 1881 sizeof (struct gate_descriptor)); 1882 memcpy((struct gate_descriptor *)(ldtstore + LBSDICALLS_SEL), 1883 (struct gate_descriptor *)(ldtstore + LSYS5CALLS_SEL), 1884 sizeof (struct gate_descriptor)); 1885 1886 /* exceptions */ 1887 for (x = 0; x < 32; x++) { 1888#ifndef XEN 1889 idt_vec_reserve(x); 1890 switch (x) { 1891 case 2: /* NMI */ 1892 ist = 3; 1893 break; 1894 case 8: /* double fault */ 1895 ist = 2; 1896 break; 1897 default: 1898 ist = 0; 1899 break; 1900 } 1901 setgate(&idt[x], IDTVEC(exceptions)[x], ist, SDT_SYS386IGT, 1902 (x == 3 || x == 4) ? SEL_UPL : SEL_KPL, 1903 GSEL(GCODE_SEL, SEL_KPL)); 1904#else /* XEN */ 1905 pmap_changeprot_local(idt_vaddr, VM_PROT_READ|VM_PROT_WRITE); 1906 xen_idt[xen_idt_idx].vector = x; 1907 1908 switch (x) { 1909 case 2: /* NMI */ 1910 case 18: /* MCA */ 1911 TI_SET_IF(&(xen_idt[xen_idt_idx]), 2); 1912 break; 1913 case 3: 1914 case 4: 1915 xen_idt[xen_idt_idx].flags = SEL_UPL; 1916 break; 1917 default: 1918 xen_idt[xen_idt_idx].flags = SEL_KPL; 1919 break; 1920 } 1921 1922 xen_idt[xen_idt_idx].cs = GSEL(GCODE_SEL, SEL_KPL); 1923 xen_idt[xen_idt_idx].address = 1924 (unsigned long)IDTVEC(exceptions)[x]; 1925 xen_idt_idx++; 1926#endif /* XEN */ 1927 } 1928 1929 /* new-style interrupt gate for syscalls */ 1930#ifndef XEN 1931 idt_vec_reserve(128); 1932 setgate(&idt[128], &IDTVEC(osyscall), 0, SDT_SYS386IGT, SEL_UPL, 1933 GSEL(GCODE_SEL, SEL_KPL)); 1934#else 1935 xen_idt[xen_idt_idx].vector = 128; 1936 xen_idt[xen_idt_idx].flags = SEL_KPL; 1937 xen_idt[xen_idt_idx].cs = GSEL(GCODE_SEL, SEL_KPL); 1938 xen_idt[xen_idt_idx].address = (unsigned long) &IDTVEC(osyscall); 1939 xen_idt_idx++; 1940 pmap_changeprot_local(idt_vaddr, VM_PROT_READ); 1941#endif /* XEN */ 1942 kpreempt_enable(); 1943 1944 setregion(®ion, gdtstore, DYNSEL_START - 1); 1945 lgdt(®ion); 1946 1947#ifdef XEN 1948 /* Init Xen 
callbacks and syscall handlers */ 1949 if (HYPERVISOR_set_callbacks( 1950 (unsigned long) hypervisor_callback, 1951 (unsigned long) failsafe_callback, 1952 (unsigned long) Xsyscall)) 1953 panic("HYPERVISOR_set_callbacks() failed"); 1954#endif /* XEN */ 1955 cpu_init_idt(); 1956 1957 init_x86_64_ksyms(); 1958 1959#ifndef XEN 1960 intr_default_setup(); 1961#else 1962 events_default_setup(); 1963#endif 1964 1965 splraise(IPL_HIGH); 1966 x86_enable_intr(); 1967 1968#ifdef DDB 1969 if (boothowto & RB_KDB) 1970 Debugger(); 1971#endif 1972#ifdef KGDB 1973 kgdb_port_init(); 1974 if (boothowto & RB_KDB) { 1975 kgdb_debug_init = 1; 1976 kgdb_connect(1); 1977 } 1978#endif 1979} 1980 1981void 1982cpu_reset(void) 1983{ 1984 x86_disable_intr(); 1985 1986#ifdef XEN 1987 HYPERVISOR_reboot(); 1988#else 1989 1990 x86_reset(); 1991 1992 /* 1993 * Try to cause a triple fault and watchdog reset by making the IDT 1994 * invalid and causing a fault. 1995 */ 1996 kpreempt_disable(); 1997 pmap_changeprot_local(idt_vaddr, VM_PROT_READ|VM_PROT_WRITE); 1998 pmap_changeprot_local(idt_vaddr + PAGE_SIZE, 1999 VM_PROT_READ|VM_PROT_WRITE); 2000 memset((void *)idt, 0, NIDT * sizeof(idt[0])); 2001 kpreempt_enable(); 2002 breakpoint(); 2003 2004#if 0 2005 /* 2006 * Try to cause a triple fault and watchdog reset by unmapping the 2007 * entire address space and doing a TLB flush. 
2008 */ 2009 memset((void *)PTD, 0, PAGE_SIZE); 2010 tlbflush(); 2011#endif 2012#endif /* XEN */ 2013 2014 for (;;); 2015} 2016 2017void 2018cpu_getmcontext(struct lwp *l, mcontext_t *mcp, unsigned int *flags) 2019{ 2020 const struct trapframe *tf = l->l_md.md_regs; 2021 __greg_t ras_rip; 2022 2023 /* Copy general registers member by member */ 2024#define copy_from_tf(reg, REG, idx) mcp->__gregs[_REG_##REG] = tf->tf_##reg; 2025 _FRAME_GREG(copy_from_tf) 2026#undef copy_from_tf 2027 2028 if ((ras_rip = (__greg_t)ras_lookup(l->l_proc, 2029 (void *) mcp->__gregs[_REG_RIP])) != -1) 2030 mcp->__gregs[_REG_RIP] = ras_rip; 2031 2032 *flags |= _UC_CPU; 2033 2034 mcp->_mc_tlsbase = (uintptr_t)l->l_private;; 2035 *flags |= _UC_TLSBASE; 2036 2037 if ((l->l_md.md_flags & MDP_USEDFPU) != 0) { 2038 struct pcb *pcb = lwp_getpcb(l); 2039 2040 if (pcb->pcb_fpcpu) { 2041 fpusave_lwp(l, true); 2042 } 2043 memcpy(mcp->__fpregs, &pcb->pcb_savefpu.fp_fxsave, 2044 sizeof (mcp->__fpregs)); 2045 *flags |= _UC_FPU; 2046 } 2047} 2048 2049int 2050cpu_setmcontext(struct lwp *l, const mcontext_t *mcp, unsigned int flags) 2051{ 2052 struct trapframe *tf = l->l_md.md_regs; 2053 const __greg_t *gr = mcp->__gregs; 2054 struct pcb *pcb = lwp_getpcb(l); 2055 struct proc *p = l->l_proc; 2056 int error; 2057 int err, trapno; 2058 int64_t rflags; 2059 2060 if ((flags & _UC_CPU) != 0) { 2061 error = cpu_mcontext_validate(l, mcp); 2062 if (error != 0) 2063 return error; 2064 /* 2065 * save and restore some values we don't want to change. 2066 * _FRAME_GREG(copy_to_tf) below overwrites them. 2067 * 2068 * XXX maybe inline this. 2069 */ 2070 rflags = tf->tf_rflags; 2071 err = tf->tf_err; 2072 trapno = tf->tf_trapno; 2073 2074 /* Copy general registers member by member */ 2075#define copy_to_tf(reg, REG, idx) tf->tf_##reg = gr[_REG_##REG]; 2076 _FRAME_GREG(copy_to_tf) 2077#undef copy_to_tf 2078 2079#ifdef XEN 2080 /* 2081 * Xen has its own way of dealing with %cs and %ss, 2082 * reset it to proper values. 
2083 */ 2084 tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL); 2085 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 2086#endif 2087 rflags &= ~PSL_USER; 2088 tf->tf_rflags = rflags | (gr[_REG_RFLAGS] & PSL_USER); 2089 tf->tf_err = err; 2090 tf->tf_trapno = trapno; 2091 2092 l->l_md.md_flags |= MDP_IRET; 2093 } 2094 2095 if (pcb->pcb_fpcpu != NULL) 2096 fpusave_lwp(l, false); 2097 2098 if ((flags & _UC_FPU) != 0) { 2099 memcpy(&pcb->pcb_savefpu.fp_fxsave, mcp->__fpregs, 2100 sizeof (mcp->__fpregs)); 2101 l->l_md.md_flags |= MDP_USEDFPU; 2102 } 2103 2104 if ((flags & _UC_TLSBASE) != 0) 2105 lwp_setprivate(l, (void *)(uintptr_t)mcp->_mc_tlsbase); 2106 2107 mutex_enter(p->p_lock); 2108 if (flags & _UC_SETSTACK) 2109 l->l_sigstk.ss_flags |= SS_ONSTACK; 2110 if (flags & _UC_CLRSTACK) 2111 l->l_sigstk.ss_flags &= ~SS_ONSTACK; 2112 mutex_exit(p->p_lock); 2113 2114 return 0; 2115} 2116 2117int 2118cpu_mcontext_validate(struct lwp *l, const mcontext_t *mcp) 2119{ 2120 const __greg_t *gr; 2121 uint16_t sel; 2122 int error; 2123 struct pmap *pmap = l->l_proc->p_vmspace->vm_map.pmap; 2124 struct proc *p = l->l_proc; 2125 struct trapframe *tf = l->l_md.md_regs; 2126 2127 gr = mcp->__gregs; 2128 2129 if (((gr[_REG_RFLAGS] ^ tf->tf_rflags) & PSL_USERSTATIC) != 0) 2130 return EINVAL; 2131 2132 if (__predict_false(pmap->pm_ldt != NULL)) { 2133 error = valid_user_selector(l, gr[_REG_ES], NULL, 0); 2134 if (error != 0) 2135 return error; 2136 2137 error = valid_user_selector(l, gr[_REG_FS], NULL, 0); 2138 if (error != 0) 2139 return error; 2140 2141 error = valid_user_selector(l, gr[_REG_GS], NULL, 0); 2142 if (error != 0) 2143 return error; 2144 2145 if ((gr[_REG_DS] & 0xffff) == 0) 2146 return EINVAL; 2147 error = valid_user_selector(l, gr[_REG_DS], NULL, 0); 2148 if (error != 0) 2149 return error; 2150 2151#ifndef XEN 2152 if ((gr[_REG_SS] & 0xffff) == 0) 2153 return EINVAL; 2154 error = valid_user_selector(l, gr[_REG_SS], NULL, 0); 2155 if (error != 0) 2156 return error; 2157#endif 2158 } else { 
2159#define VUD(sel) \ 2160 ((p->p_flag & PK_32) ? VALID_USER_DSEL32(sel) : VALID_USER_DSEL(sel)) 2161 sel = gr[_REG_ES] & 0xffff; 2162 if (sel != 0 && !VUD(sel)) 2163 return EINVAL; 2164 2165/* XXX: Shouldn't this be FSEL32? */ 2166#define VUF(sel) \ 2167 ((p->p_flag & PK_32) ? VALID_USER_DSEL32(sel) : VALID_USER_DSEL(sel)) 2168 sel = gr[_REG_FS] & 0xffff; 2169 if (sel != 0 && !VUF(sel)) 2170 return EINVAL; 2171 2172#define VUG(sel) \ 2173 ((p->p_flag & PK_32) ? VALID_USER_GSEL32(sel) : VALID_USER_DSEL(sel)) 2174 sel = gr[_REG_GS] & 0xffff; 2175 if (sel != 0 && !VUG(sel)) 2176 return EINVAL; 2177 2178 sel = gr[_REG_DS] & 0xffff; 2179 if (!VUD(sel)) 2180 return EINVAL; 2181 2182#ifndef XEN 2183 sel = gr[_REG_SS] & 0xffff; 2184 if (!VUD(sel)) 2185 return EINVAL; 2186#endif 2187 2188 } 2189 2190#ifndef XEN 2191#define VUC(sel) \ 2192 ((p->p_flag & PK_32) ? VALID_USER_CSEL32(sel) : VALID_USER_CSEL(sel)) 2193 sel = gr[_REG_CS] & 0xffff; 2194 if (!VUC(sel)) 2195 return EINVAL; 2196#endif 2197 2198 if (gr[_REG_RIP] >= VM_MAXUSER_ADDRESS) 2199 return EINVAL; 2200 return 0; 2201} 2202 2203void 2204cpu_initclocks(void) 2205{ 2206 (*initclock_func)(); 2207} 2208 2209int 2210memseg_baseaddr(struct lwp *l, uint64_t seg, char *ldtp, int llen, 2211 uint64_t *addr) 2212{ 2213 int off, len; 2214 char *dt; 2215 struct mem_segment_descriptor *sdp; 2216 struct proc *p = l->l_proc; 2217 struct pmap *pmap= p->p_vmspace->vm_map.pmap; 2218 uint64_t base; 2219 2220 seg &= 0xffff; 2221 2222 if (seg == 0) { 2223 if (addr != NULL) 2224 *addr = 0; 2225 return 0; 2226 } 2227 2228 off = (seg & 0xfff8); 2229 if (seg & SEL_LDT) { 2230 if (ldtp != NULL) { 2231 dt = ldtp; 2232 len = llen; 2233 } else if (pmap->pm_ldt != NULL) { 2234 len = pmap->pm_ldt_len; /* XXX broken */ 2235 dt = (char *)pmap->pm_ldt; 2236 } else { 2237 dt = ldtstore; 2238 len = LDT_SIZE; 2239 } 2240 2241 if (off > (len - 8)) 2242 return EINVAL; 2243 } else { 2244 if (seg != GUDATA_SEL || seg != GUDATA32_SEL) 2245 return EINVAL; 
2246 } 2247 2248 sdp = (struct mem_segment_descriptor *)(dt + off); 2249 if (sdp->sd_type < SDT_MEMRO || sdp->sd_p == 0) 2250 return EINVAL; 2251 2252 base = ((uint64_t)sdp->sd_hibase << 32) | ((uint64_t)sdp->sd_lobase); 2253 if (sdp->sd_gran == 1) 2254 base <<= PAGE_SHIFT; 2255 2256 if (base >= VM_MAXUSER_ADDRESS) 2257 return EINVAL; 2258 2259 if (addr == NULL) 2260 return 0; 2261 2262 *addr = base; 2263 2264 return 0; 2265} 2266 2267int 2268valid_user_selector(struct lwp *l, uint64_t seg, char *ldtp, int len) 2269{ 2270 return memseg_baseaddr(l, seg, ldtp, len, NULL); 2271} 2272 2273int 2274mm_md_kernacc(void *ptr, vm_prot_t prot, bool *handled) 2275{ 2276 extern int start, __data_start; 2277 const vaddr_t v = (vaddr_t)ptr; 2278 2279 if (v >= (vaddr_t)&start && v < (vaddr_t)kern_end) { 2280 *handled = true; 2281 if (v < (vaddr_t)&__data_start && (prot & VM_PROT_WRITE)) 2282 return EFAULT; 2283 2284 } else if (v >= module_start && v < module_end) { 2285 *handled = true; 2286 if (!uvm_map_checkprot(module_map, v, v + 1, prot)) 2287 return EFAULT; 2288 } else { 2289 *handled = false; 2290 } 2291 return 0; 2292} 2293 2294/* 2295 * Zero out an LWP's TLS context (%fs and %gs and associated stuff). 2296 * Used when exec'ing a new program. 
 */

void
cpu_fsgs_zero(struct lwp *l)
{
	struct trapframe * const tf = l->l_md.md_regs;
	struct pcb *pcb;
	uint64_t zero = 0;

	pcb = lwp_getpcb(l);
	if (l == curlwp) {
		/*
		 * Zeroing the current LWP: the live CPU state (segment
		 * registers, base MSRs / hypervisor bases, and the per-CPU
		 * GDT slots) must be cleared as well as the saved state.
		 * Disable preemption so we stay on this CPU throughout.
		 */
		kpreempt_disable();
		tf->tf_fs = 0;
		tf->tf_gs = 0;
		setfs(0);
#ifndef XEN
		setusergs(0);
#else
		HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, 0);
#endif
		/* 64-bit processes also carry TLS bases in MSRs. */
		if ((l->l_proc->p_flag & PK_32) == 0) {
#ifndef XEN
			wrmsr(MSR_FSBASE, 0);
			wrmsr(MSR_KERNELGSBASE, 0);
#else
			HYPERVISOR_set_segment_base(SEGBASE_FS, 0);
			HYPERVISOR_set_segment_base(SEGBASE_GS_USER, 0);
#endif
		}
		pcb->pcb_fs = 0;
		pcb->pcb_gs = 0;
		/* Clear the per-CPU GDT descriptors used for 32-bit TLS. */
		update_descriptor(&curcpu()->ci_gdt[GUFS_SEL], &zero);
		update_descriptor(&curcpu()->ci_gdt[GUGS_SEL], &zero);
		kpreempt_enable();
	} else {
		/*
		 * Not running: clearing the saved trapframe and PCB is
		 * enough; the state is loaded when the LWP next runs.
		 */
		tf->tf_fs = 0;
		tf->tf_gs = 0;
		pcb->pcb_fs = 0;
		pcb->pcb_gs = 0;
	}

}

/*
 * Load an LWP's TLS context, possibly changing the %fs and %gs selectors.
 * Used only for 32-bit processes.
 */

void
cpu_fsgs_reload(struct lwp *l, int fssel, int gssel)
{
	struct trapframe *tf;
	struct pcb *pcb;

	/* 32-bit processes only; 64-bit TLS goes through the base MSRs. */
	KASSERT(l->l_proc->p_flag & PK_32);
	tf = l->l_md.md_regs;
	if (l == curlwp) {
		/*
		 * Reloading the current LWP: refresh the per-CPU GDT
		 * descriptors from the PCB before loading the selectors,
		 * with preemption disabled so we stay on this CPU.
		 */
		pcb = lwp_getpcb(l);
		kpreempt_disable();
		update_descriptor(&curcpu()->ci_gdt[GUFS_SEL], &pcb->pcb_fs);
		update_descriptor(&curcpu()->ci_gdt[GUGS_SEL], &pcb->pcb_gs);
		setfs(fssel);
#ifndef XEN
		setusergs(gssel);
#else
		HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, gssel);
#endif
		tf->tf_fs = fssel;
		tf->tf_gs = gssel;
		kpreempt_enable();
	} else {
		/* Not running: the trapframe is loaded at next return. */
		tf->tf_fs = fssel;
		tf->tf_gs = gssel;
	}
}


#ifdef __HAVE_DIRECT_MAP
/*
 * If 'addr' lies in the direct map, return its physical address via
 * *paddr and true; otherwise false.  Used by mm(4).
 */
bool
mm_md_direct_mapped_io(void *addr, paddr_t *paddr)
{
	vaddr_t va = (vaddr_t)addr;

	if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) {
		*paddr = PMAP_DIRECT_UNMAP(va);
		return true;
	}
	return false;
}

/*
 * Translate a physical address to its direct-map virtual address.
 * Always succeeds when the direct map is configured.
 */
bool
mm_md_direct_mapped_phys(paddr_t paddr, vaddr_t *vaddr)
{
	*vaddr = PMAP_DIRECT_MAP(paddr);
	return true;
}
#endif