1/*	$NetBSD$	*/
2
3/*-
4 * Copyright (c) 1996, 1997, 1998, 2000, 2006, 2007, 2008, 2011
5 *     The NetBSD Foundation, Inc.
6 * All rights reserved.
7 *
8 * This code is derived from software contributed to The NetBSD Foundation
9 * by Charles M. Hannum and by Jason R. Thorpe of the Numerical Aerospace
10 * Simulation Facility, NASA Ames Research Center.
11 *
12 * This code is derived from software contributed to The NetBSD Foundation
13 * by Coyote Point Systems, Inc. which was written under contract to Coyote
14 * Point by Jed Davis and Devon O'Dell.
15 *
16 * Redistribution and use in source and binary forms, with or without
17 * modification, are permitted provided that the following conditions
18 * are met:
19 * 1. Redistributions of source code must retain the above copyright
20 *    notice, this list of conditions and the following disclaimer.
21 * 2. Redistributions in binary form must reproduce the above copyright
22 *    notice, this list of conditions and the following disclaimer in the
23 *    documentation and/or other materials provided with the distribution.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
26 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
27 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
29 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
30 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
31 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
32 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
33 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35 * POSSIBILITY OF SUCH DAMAGE.
36 */
37
38/*
39 * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr>
40 *
41 * Permission to use, copy, modify, and distribute this software for any
42 * purpose with or without fee is hereby granted, provided that the above
43 * copyright notice and this permission notice appear in all copies.
44 *
45 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
46 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
47 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
48 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
49 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
50 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
51 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
52 */
53
54/*
55 * Copyright (c) 2007 Manuel Bouyer.
56 *
57 * Redistribution and use in source and binary forms, with or without
58 * modification, are permitted provided that the following conditions
59 * are met:
60 * 1. Redistributions of source code must retain the above copyright
61 *    notice, this list of conditions and the following disclaimer.
62 * 2. Redistributions in binary form must reproduce the above copyright
63 *    notice, this list of conditions and the following disclaimer in the
64 *    documentation and/or other materials provided with the distribution.
65 *
66 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
67 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
68 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
69 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
70 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
71 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
72 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
73 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
74 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
75 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
76 *
77 */
78
79/*-
80 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
81 * All rights reserved.
82 *
83 * This code is derived from software contributed to Berkeley by
84 * William Jolitz.
85 *
86 * Redistribution and use in source and binary forms, with or without
87 * modification, are permitted provided that the following conditions
88 * are met:
89 * 1. Redistributions of source code must retain the above copyright
90 *    notice, this list of conditions and the following disclaimer.
91 * 2. Redistributions in binary form must reproduce the above copyright
92 *    notice, this list of conditions and the following disclaimer in the
93 *    documentation and/or other materials provided with the distribution.
94 * 3. Neither the name of the University nor the names of its contributors
95 *    may be used to endorse or promote products derived from this software
96 *    without specific prior written permission.
97 *
98 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
99 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
100 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
101 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
102 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
103 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
104 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
105 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
106 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
107 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
108 * SUCH DAMAGE.
109 *
110 *	@(#)machdep.c	7.4 (Berkeley) 6/3/91
111 */
112
113#include <sys/cdefs.h>
114__KERNEL_RCSID(0, "$NetBSD$");
115
116/* #define XENDEBUG_LOW  */
117
118#include "opt_modular.h"
119#include "opt_user_ldt.h"
120#include "opt_ddb.h"
121#include "opt_kgdb.h"
122#include "opt_cpureset_delay.h"
123#include "opt_mtrr.h"
124#include "opt_realmem.h"
125#include "opt_xen.h"
126#ifndef XEN
127#include "opt_physmem.h"
128#endif
129#include "isa.h"
130#include "pci.h"
131
132#include <sys/param.h>
133#include <sys/systm.h>
134#include <sys/signal.h>
135#include <sys/signalvar.h>
136#include <sys/kernel.h>
137#include <sys/cpu.h>
138#include <sys/exec.h>
139#include <sys/exec_aout.h>	/* for MID_* */
140#include <sys/reboot.h>
141#include <sys/conf.h>
142#include <sys/mbuf.h>
143#include <sys/msgbuf.h>
144#include <sys/mount.h>
145#include <sys/core.h>
146#include <sys/kcore.h>
147#include <sys/ucontext.h>
148#include <machine/kcore.h>
149#include <sys/ras.h>
150#include <sys/sa.h>
151#include <sys/savar.h>
152#include <sys/syscallargs.h>
153#include <sys/ksyms.h>
154#include <sys/device.h>
155#include <sys/lwp.h>
156#include <sys/proc.h>
157
158#ifdef KGDB
159#include <sys/kgdb.h>
160#endif
161
162#include <dev/cons.h>
163#include <dev/mm.h>
164
165#include <uvm/uvm.h>
166#include <uvm/uvm_page.h>
167
168#include <sys/sysctl.h>
169
170#include <machine/cpu.h>
171#include <machine/cpufunc.h>
172#include <machine/gdt.h>
173#include <machine/intr.h>
174#include <machine/pio.h>
175#include <machine/psl.h>
176#include <machine/reg.h>
177#include <machine/specialreg.h>
178#include <machine/bootinfo.h>
179#include <machine/fpu.h>
180#include <machine/mtrr.h>
181#include <machine/mpbiosvar.h>
182
183#include <x86/cputypes.h>
184#include <x86/cpuvar.h>
185#include <x86/machdep.h>
186
187#include <x86/x86/tsc.h>
188
189#include <dev/isa/isareg.h>
190#include <machine/isa_machdep.h>
191#include <dev/ic/i8042reg.h>
192
193#ifdef XEN
194#include <xen/xen.h>
195#include <xen/hypervisor.h>
196#include <xen/evtchn.h>
197#endif
198
199#ifdef DDB
200#include <machine/db_machdep.h>
201#include <ddb/db_extern.h>
202#include <ddb/db_output.h>
203#include <ddb/db_interface.h>
204#endif
205
206#include "acpica.h"
207
208#if NACPICA > 0
209#include <dev/acpi/acpivar.h>
210#define ACPI_MACHDEP_PRIVATE
211#include <machine/acpi_machdep.h>
212#endif
213
215#include "isadma.h"
216#include "ksyms.h"
217
218/* the following is used externally (sysctl_hw) */
char machine[] = "amd64";		/* CPU "architecture" */
char machine_arch[] = "x86_64";		/* machine arch string; not the same as machine */
221
222/* Our exported CPU info; we have only one right now. */
223struct cpu_info cpu_info_primary;
224struct cpu_info *cpu_info_list;
225
226extern struct bi_devmatch *x86_alldisks;
227extern int x86_ndisks;
228
229#ifdef CPURESET_DELAY
230int	cpureset_delay = CPURESET_DELAY;
231#else
int	cpureset_delay = 2000; /* default to 2s */
233#endif
234
235int	cpu_class = CPUCLASS_686;
236
237#ifdef MTRR
238struct mtrr_funcs *mtrr_funcs;
239#endif
240
241int	physmem;
242uint64_t	dumpmem_low;
243uint64_t	dumpmem_high;
245int	use_pae;
246
247#ifndef NO_SPARSE_DUMP
248int sparse_dump = 0;
249
250paddr_t max_paddr = 0;
251unsigned char *sparse_dump_physmap;
252#endif
253
254char *dump_headerbuf, *dump_headerbuf_ptr;
255#define dump_headerbuf_size PAGE_SIZE
256#define dump_headerbuf_end (dump_headerbuf + dump_headerbuf_size)
257#define dump_headerbuf_avail (dump_headerbuf_end - dump_headerbuf_ptr)
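/*
 * dump_headerbuf is a one-page staging window for the dump header:
 * dump_headerbuf_ptr tracks the current fill point and
 * dump_headerbuf_avail is the space left before the page has to be
 * flushed to the dump device (see dump_header_flush() below).
 */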
258daddr_t dump_header_blkno;
259
260size_t dump_nmemsegs;
261size_t dump_npages;
262size_t dump_header_size;
263size_t dump_totalbytesleft;
264
265vaddr_t	msgbuf_vaddr;
266paddr_t msgbuf_paddr;
267
268struct {
269	paddr_t paddr;
270	psize_t sz;
271} msgbuf_p_seg[VM_PHYSSEG_MAX];
272unsigned int msgbuf_p_cnt = 0;
273
274vaddr_t	idt_vaddr;
275paddr_t	idt_paddr;
276
277vaddr_t lo32_vaddr;
278paddr_t lo32_paddr;
279
280vaddr_t module_start, module_end;
281static struct vm_map module_map_store;
282extern struct vm_map *module_map;
283vaddr_t kern_end;
284
285struct vm_map *phys_map = NULL;
286
287extern	paddr_t avail_start, avail_end;
288#ifdef XEN
289extern  paddr_t pmap_pa_start, pmap_pa_end;
290#endif
291
292#ifndef XEN
293void (*delay_func)(unsigned int) = i8254_delay;
294void (*initclock_func)(void) = i8254_initclocks;
295#else /* XEN */
296void (*delay_func)(unsigned int) = xen_delay;
297void (*initclock_func)(void) = xen_initclocks;
298#endif
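
/*
 * Note on the indirection above: time-related callers go through these
 * pointers (e.g. the delay() macro normally dispatches via delay_func),
 * so the same code paths work whether timing is provided by the i8254
 * on native hardware or by the hypervisor under Xen.
 */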
299
304
305/*
306 * Size of memory segments, before any memory is stolen.
307 */
308phys_ram_seg_t mem_clusters[VM_PHYSSEG_MAX];
309int	mem_cluster_cnt;
310
311char	x86_64_doubleflt_stack[4096];
312
313int	cpu_dump(void);
314int	cpu_dumpsize(void);
315u_long	cpu_dump_mempagecnt(void);
316void	dodumpsys(void);
317void	dumpsys(void);
318
319extern int time_adjusted;	/* XXX no common header */
320
321void dump_misc_init(void);
322void dump_seg_prep(void);
323int dump_seg_iter(int (*)(paddr_t, paddr_t));
324
325#ifndef NO_SPARSE_DUMP
326void sparse_dump_reset(void);
327void sparse_dump_mark(vaddr_t, vaddr_t, int);
328void cpu_dump_prep_sparse(void);
329#endif
330
331void dump_header_start(void);
332int dump_header_flush(void);
333int dump_header_addbytes(const void*, size_t);
334int dump_header_addseg(paddr_t, paddr_t);
335int dump_header_finish(void);
336
337int dump_seg_count_range(paddr_t, paddr_t);
338int dumpsys_seg(paddr_t, paddr_t);
339
340void	init_x86_64(paddr_t);
341
342/*
343 * Machine-dependent startup code
344 */
345void
346cpu_startup(void)
347{
348	int x, y;
349	vaddr_t minaddr, maxaddr;
350	psize_t sz;
351
352	/*
353	 * For console drivers that require uvm and pmap to be initialized,
354	 * we'll give them one more chance here...
355	 */
356	consinit();
357
358	/*
	 * Initialize error message buffer (at end of core).
360	 */
361	if (msgbuf_p_cnt == 0)
362		panic("msgbuf paddr map has not been set up");
363	for (x = 0, sz = 0; x < msgbuf_p_cnt; sz += msgbuf_p_seg[x++].sz)
364		continue;
365
366	msgbuf_vaddr = uvm_km_alloc(kernel_map, sz, 0,
367	    UVM_KMF_VAONLY);
368	if (msgbuf_vaddr == 0)
369		panic("failed to valloc msgbuf_vaddr");
370
371	/* msgbuf_paddr was init'd in pmap */
372	for (y = 0, sz = 0; y < msgbuf_p_cnt; y++) {
373		for (x = 0; x < btoc(msgbuf_p_seg[y].sz); x++, sz += PAGE_SIZE)
374			pmap_kenter_pa((vaddr_t)msgbuf_vaddr + sz,
375				       msgbuf_p_seg[y].paddr + x * PAGE_SIZE,
				       VM_PROT_READ | VM_PROT_WRITE, 0);
377	}
378
379	pmap_update(pmap_kernel());
380
381	initmsgbuf((void *)msgbuf_vaddr, round_page(sz));
382
383	minaddr = 0;
384
385	/*
386	 * Allocate a submap for physio
387	 */
388	phys_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
389				   VM_PHYS_SIZE, 0, false, NULL);
390
391	uvm_map_setup(&module_map_store, module_start, module_end, 0);
392	module_map_store.pmap = pmap_kernel();
393	module_map = &module_map_store;
394
395	/* Say hello. */
396	banner();
397
398#if NISA > 0 || NPCI > 0
399	/* Safe for i/o port / memory space allocation to use malloc now. */
400	x86_bus_space_mallocok();
401#endif
402
403	gdt_init();
404	x86_64_proc0_tss_ldt_init();
405
406	cpu_init_tss(&cpu_info_primary);
407#if !defined(XEN)
408	ltr(cpu_info_primary.ci_tss_sel);
409#endif /* !defined(XEN) */
410
411	x86_startup();
412}
413
414#ifdef XEN
415/* used in assembly */
416void hypervisor_callback(void);
417void failsafe_callback(void);
418void x86_64_switch_context(struct pcb *);
419void x86_64_tls_switch(struct lwp *);
420
421void
422x86_64_switch_context(struct pcb *new)
{
	struct physdev_op physop;

	HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL), new->pcb_rsp0);
	physop.cmd = PHYSDEVOP_SET_IOPL;
	physop.u.set_iopl.iopl = new->pcb_iopl;
	HYPERVISOR_physdev_op(&physop);
429}
430
431void
432x86_64_tls_switch(struct lwp *l)
433{
434	struct cpu_info *ci = curcpu();
435	struct pcb *pcb = lwp_getpcb(l);
436	struct trapframe *tf = l->l_md.md_regs;
437
438	/*
439	 * Raise the IPL to IPL_HIGH.
440	 * FPU IPIs can alter the LWP's saved cr0.  Dropping the priority
441	 * is deferred until mi_switch(), when cpu_switchto() returns.
442	 */
443	(void)splhigh();
444	/*
445	 * If our floating point registers are on a different CPU,
446	 * set CR0_TS so we'll trap rather than reuse bogus state.
447	 */
448	if (l != ci->ci_fpcurlwp) {
449		HYPERVISOR_fpu_taskswitch(1);
450	}
451
452	/* Update TLS segment pointers */
453	if (pcb->pcb_flags & PCB_COMPAT32) {
454		update_descriptor(&curcpu()->ci_gdt[GUFS_SEL], &pcb->pcb_fs);
455		update_descriptor(&curcpu()->ci_gdt[GUGS_SEL], &pcb->pcb_gs);
456		setfs(tf->tf_fs);
457		HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, tf->tf_gs);
458	} else {
459		setfs(0);
460		HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, 0);
461		HYPERVISOR_set_segment_base(SEGBASE_FS, pcb->pcb_fs);
462		HYPERVISOR_set_segment_base(SEGBASE_GS_USER, pcb->pcb_gs);
463	}
464}
465#endif /* XEN */
466
467/*
468 * Set up proc0's TSS and LDT.
469 */
470void
471x86_64_proc0_tss_ldt_init(void)
472{
473	struct lwp *l = &lwp0;
474	struct pcb *pcb = lwp_getpcb(l);
475
476	pcb->pcb_flags = 0;
477	pcb->pcb_fs = 0;
478	pcb->pcb_gs = 0;
479	pcb->pcb_rsp0 = (uvm_lwp_getuarea(l) + KSTACK_SIZE - 16) & ~0xf;
480	pcb->pcb_iopl = SEL_KPL;
481
482	pmap_kernel()->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
483	pcb->pcb_cr0 = rcr0() & ~CR0_TS;
484	l->l_md.md_regs = (struct trapframe *)pcb->pcb_rsp0 - 1;
485
486#if !defined(XEN)
487	lldt(pmap_kernel()->pm_ldt_sel);
488#else
489	{
490	struct physdev_op physop;
491	xen_set_ldt((vaddr_t) ldtstore, LDT_SIZE >> 3);
492	/* Reset TS bit and set kernel stack for interrupt handlers */
493	HYPERVISOR_fpu_taskswitch(1);
494	HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL), pcb->pcb_rsp0);
495	physop.cmd = PHYSDEVOP_SET_IOPL;
496	physop.u.set_iopl.iopl = pcb->pcb_iopl;
497	HYPERVISOR_physdev_op(&physop);
498	}
499#endif /* XEN */
500}
501
502/*
503 * Set up TSS and I/O bitmap.
504 */
505void
506cpu_init_tss(struct cpu_info *ci)
507{
508	struct x86_64_tss *tss = &ci->ci_tss;
509	uintptr_t p;
510
511	tss->tss_iobase = IOMAP_INVALOFF << 16;
512	/* tss->tss_ist[0] is filled by cpu_intr_init */
513
514	/* double fault */
515	tss->tss_ist[1] = (uint64_t)x86_64_doubleflt_stack + PAGE_SIZE - 16;
516
517	/* NMI */
518	p = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_WIRED);
519	tss->tss_ist[2] = p + PAGE_SIZE - 16;
520	ci->ci_tss_sel = tss_alloc(tss);
521}
522
523/*
524 * machine dependent system variables.
525 */
526static int
527sysctl_machdep_booted_kernel(SYSCTLFN_ARGS)
528{
529	struct btinfo_bootpath *bibp;
530	struct sysctlnode node;
531
532	bibp = lookup_bootinfo(BTINFO_BOOTPATH);
	if (!bibp)
		return (ENOENT); /* ??? */
535
536	node = *rnode;
537	node.sysctl_data = bibp->bootpath;
538	node.sysctl_size = sizeof(bibp->bootpath);
539	return (sysctl_lookup(SYSCTLFN_CALL(&node)));
540}
541
542static int
543sysctl_machdep_diskinfo(SYSCTLFN_ARGS)
544{
	struct sysctlnode node;

	if (x86_alldisks == NULL)
		return (ENOENT);

	node = *rnode;
	node.sysctl_data = x86_alldisks;
	node.sysctl_size = sizeof(struct disklist) +
	    (x86_ndisks - 1) * sizeof(struct nativedisk_info);
	return (sysctl_lookup(SYSCTLFN_CALL(&node)));
555}
556
557SYSCTL_SETUP(sysctl_machdep_setup, "sysctl machdep subtree setup")
558{
559	extern uint64_t tsc_freq;
560
561	sysctl_createv(clog, 0, NULL, NULL,
562		       CTLFLAG_PERMANENT,
563		       CTLTYPE_NODE, "machdep", NULL,
564		       NULL, 0, NULL, 0,
565		       CTL_MACHDEP, CTL_EOL);
566
567	sysctl_createv(clog, 0, NULL, NULL,
568		       CTLFLAG_PERMANENT,
569		       CTLTYPE_STRUCT, "console_device", NULL,
570		       sysctl_consdev, 0, NULL, sizeof(dev_t),
571		       CTL_MACHDEP, CPU_CONSDEV, CTL_EOL);
572	sysctl_createv(clog, 0, NULL, NULL,
573		       CTLFLAG_PERMANENT,
574		       CTLTYPE_STRING, "booted_kernel", NULL,
575		       sysctl_machdep_booted_kernel, 0, NULL, 0,
576		       CTL_MACHDEP, CPU_BOOTED_KERNEL, CTL_EOL);
577	sysctl_createv(clog, 0, NULL, NULL,
578		       CTLFLAG_PERMANENT,
579		       CTLTYPE_STRUCT, "diskinfo", NULL,
580		       sysctl_machdep_diskinfo, 0, NULL, 0,
581		       CTL_MACHDEP, CPU_DISKINFO, CTL_EOL);
582	sysctl_createv(clog, 0, NULL, NULL,
583		       CTLFLAG_PERMANENT | CTLFLAG_IMMEDIATE,
584		       CTLTYPE_INT, "fpu_present", NULL,
585		       NULL, 1, NULL, 0,
586		       CTL_MACHDEP, CPU_FPU_PRESENT, CTL_EOL);
587	sysctl_createv(clog, 0, NULL, NULL,
588		       CTLFLAG_PERMANENT | CTLFLAG_IMMEDIATE,
589		       CTLTYPE_INT, "sse", NULL,
590		       NULL, 1, NULL, 0,
591		       CTL_MACHDEP, CPU_SSE, CTL_EOL);
592	sysctl_createv(clog, 0, NULL, NULL,
593		       CTLFLAG_PERMANENT | CTLFLAG_IMMEDIATE,
594		       CTLTYPE_INT, "sse2", NULL,
595		       NULL, 1, NULL, 0,
596		       CTL_MACHDEP, CPU_SSE2, CTL_EOL);
597	sysctl_createv(clog, 0, NULL, NULL,
598		       CTLFLAG_PERMANENT,
599		       CTLTYPE_QUAD, "tsc_freq", NULL,
600		       NULL, 0, &tsc_freq, 0,
601		       CTL_MACHDEP, CTL_CREATE, CTL_EOL);
602	sysctl_createv(clog, 0, NULL, NULL,
603		       CTLFLAG_PERMANENT,
604		       CTLTYPE_INT, "pae",
605		       SYSCTL_DESCR("Whether the kernel uses PAE"),
606		       NULL, 0, &use_pae, 0,
607		       CTL_MACHDEP, CTL_CREATE, CTL_EOL);
608#ifndef NO_SPARSE_DUMP
609	/* XXXjld Does this really belong under machdep, and not e.g. kern? */
610	sysctl_createv(clog, 0, NULL, NULL,
611		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
612		       CTLTYPE_INT, "sparse_dump", NULL,
613		       NULL, 0, &sparse_dump, 0,
614		       CTL_MACHDEP, CTL_CREATE, CTL_EOL);
615#endif
616}
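
/*
 * Illustrative userland sketch (not kernel code): the nodes created
 * above are read with sysctl(3)/sysctlbyname(3), for example:
 *
 *	uint64_t freq;
 *	size_t len = sizeof(freq);
 *	if (sysctlbyname("machdep.tsc_freq", &freq, &len, NULL, 0) == 0)
 *		printf("TSC frequency: %" PRIu64 " Hz\n", freq);
 *
 * "machdep.tsc_freq" matches the CTLTYPE_QUAD node registered above;
 * error handling is deliberately minimal in this sketch.
 */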
617
618void
619buildcontext(struct lwp *l, void *catcher, void *f)
620{
621	struct trapframe *tf = l->l_md.md_regs;
622
623	tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL);
624	tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL);
625	tf->tf_fs = GSEL(GUDATA_SEL, SEL_UPL);
626	tf->tf_gs = GSEL(GUDATA_SEL, SEL_UPL);
627
628	tf->tf_rip = (uint64_t)catcher;
629	tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
630	tf->tf_rflags &= ~PSL_CLEARSIG;
631	tf->tf_rsp = (uint64_t)f;
632	tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);
633
634	/* Ensure FP state is reset, if FP is used. */
635	l->l_md.md_flags &= ~MDP_USEDFPU;
636}
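
/*
 * Note that buildcontext() only rewrites the saved trapframe; the jump
 * into the signal handler happens when the LWP next returns to
 * userland, at which point the new %rip/%rsp take effect.
 * sendsig_siginfo() below relies on this after copying out the frame.
 */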
637
638void
639sendsig_sigcontext(const ksiginfo_t *ksi, const sigset_t *mask)
640{
641
642	printf("sendsig_sigcontext: illegal\n");
643	sigexit(curlwp, SIGILL);
644}
645
646void
647sendsig_siginfo(const ksiginfo_t *ksi, const sigset_t *mask)
648{
649	struct lwp *l = curlwp;
650	struct proc *p = l->l_proc;
651	struct sigacts *ps = p->p_sigacts;
652	int onstack, tocopy, error;
653	int sig = ksi->ksi_signo;
654	struct sigframe_siginfo *fp, frame;
655	sig_t catcher = SIGACTION(p, sig).sa_handler;
656	struct trapframe *tf = l->l_md.md_regs;
657	char *sp;
658
659	KASSERT(mutex_owned(p->p_lock));
660
661	/* Do we need to jump onto the signal stack? */
662	onstack =
663	    (l->l_sigstk.ss_flags & (SS_DISABLE | SS_ONSTACK)) == 0 &&
664	    (SIGACTION(p, sig).sa_flags & SA_ONSTACK) != 0;
665
666	/* Allocate space for the signal handler context. */
667	if (onstack)
668		sp = ((char *)l->l_sigstk.ss_sp + l->l_sigstk.ss_size);
669	else
670		sp = (char *)tf->tf_rsp - 128;
671
672	sp -= sizeof(struct sigframe_siginfo);
673	/*
674	 * Round down the stackpointer to a multiple of 16 for
675	 * fxsave and the ABI.
676	 */
677	fp = (struct sigframe_siginfo *)(((unsigned long)sp & ~15) - 8);
678
679	/*
680	 * Don't bother copying out FP state if there is none.
681	 */
682	if (l->l_md.md_flags & MDP_USEDFPU)
683		tocopy = sizeof (struct sigframe_siginfo);
684	else
685		tocopy = sizeof (struct sigframe_siginfo) -
686		    sizeof (frame.sf_uc.uc_mcontext.__fpregs);
687
688	frame.sf_ra = (uint64_t)ps->sa_sigdesc[sig].sd_tramp;
689	frame.sf_si._info = ksi->ksi_info;
690	frame.sf_uc.uc_flags = _UC_SIGMASK;
691	frame.sf_uc.uc_sigmask = *mask;
692	frame.sf_uc.uc_link = l->l_ctxlink;
693	frame.sf_uc.uc_flags |= (l->l_sigstk.ss_flags & SS_ONSTACK)
694	    ? _UC_SETSTACK : _UC_CLRSTACK;
695	memset(&frame.sf_uc.uc_stack, 0, sizeof(frame.sf_uc.uc_stack));
696	sendsig_reset(l, sig);
697
698	mutex_exit(p->p_lock);
699	cpu_getmcontext(l, &frame.sf_uc.uc_mcontext, &frame.sf_uc.uc_flags);
700	error = copyout(&frame, fp, tocopy);
701	mutex_enter(p->p_lock);
702
703	if (error != 0) {
704		/*
705		 * Process has trashed its stack; give it an illegal
706		 * instruction to halt it in its tracks.
707		 */
708		sigexit(l, SIGILL);
709		/* NOTREACHED */
710	}
711
712	buildcontext(l, catcher, fp);
713
714	tf->tf_rdi = sig;
715	tf->tf_rsi = (uint64_t)&fp->sf_si;
716	tf->tf_rdx = tf->tf_r15 = (uint64_t)&fp->sf_uc;
717
718	/* Remember that we're now on the signal stack. */
719	if (onstack)
720		l->l_sigstk.ss_flags |= SS_ONSTACK;
721
722	if ((vaddr_t)catcher >= VM_MAXUSER_ADDRESS) {
723		/*
		 * The process has given an invalid address for the
		 * handler.  Stop it, but don't do so any earlier, so the
		 * right info can still reach userland (or the core dump).
727		 */
728		sigexit(l, SIGILL);
729		/* NOTREACHED */
730	}
731}
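
/*
 * The register setup above gives the handler the usual SA_SIGINFO
 * calling convention, i.e. it is entered as if called as
 *
 *	void handler(int sig, siginfo_t *info, void *uc);
 *
 * with sig in %rdi, info in %rsi and the ucontext in %rdx, and with
 * sf_ra on the stack serving as the return address into the signal
 * trampoline.
 */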
732
733void
734cpu_upcall(struct lwp *l, int type, int nevents, int ninterrupted, void *sas, void *ap, void *sp, sa_upcall_t upcall)
735{
736	struct trapframe *tf;
737
738	tf = l->l_md.md_regs;
739
740#if 0
741	printf("proc %d: upcall to lwp %d, type %d ev %d int %d sas %p to %p\n",
742	    (int)l->l_proc->p_pid, (int)l->l_lid, type, nevents, ninterrupted,
743	    sas, (void *)upcall);
744#endif
745
746	tf->tf_rdi = type;
747	tf->tf_rsi = (u_int64_t)sas;
748	tf->tf_rdx = nevents;
749	tf->tf_rcx = ninterrupted;
750	tf->tf_r8 = (u_int64_t)ap;
751
752	tf->tf_rip = (u_int64_t)upcall;
753	tf->tf_rsp = ((unsigned long)sp & ~15) - 8;
754	tf->tf_rbp = 0; /* indicate call-frame-top to debuggers */
755	tf->tf_gs = GSEL(GUDATA_SEL, SEL_UPL);
756	tf->tf_fs = GSEL(GUDATA_SEL, SEL_UPL);
757	tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL);
758	tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL);
759	tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
760	tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);
761	tf->tf_rflags &= ~(PSL_T|PSL_VM|PSL_AC);
762
763	l->l_md.md_flags |= MDP_IRET;
764}
765
766struct pcb dumppcb;
767
768void
769cpu_reboot(int howto, char *bootstr)
770{
771	static bool syncdone = false;
772	int s = IPL_NONE;
773
774	if (cold) {
775		howto |= RB_HALT;
776		goto haltsys;
777	}
778
779	boothowto = howto;
780
781	/* i386 maybe_dump() */
782
783	/*
784	 * If we've panic'd, don't make the situation potentially
785	 * worse by syncing or unmounting the file systems.
786	 */
787	if ((howto & RB_NOSYNC) == 0 && panicstr == NULL) {
788		if (!syncdone) {
789			syncdone = true;
790			/* XXX used to force unmount as well, here */
791			vfs_sync_all(curlwp);
792			/*
793			 * If we've been adjusting the clock, the todr
794			 * will be out of synch; adjust it now.
795			 *
796			 * XXX used to do this after unmounting all
797			 * filesystems with vfs_shutdown().
798			 */
799			if (time_adjusted != 0)
800				resettodr();
801		}
802
803		while (vfs_unmountall1(curlwp, false, false) ||
804		       config_detach_all(boothowto) ||
805		       vfs_unmount_forceone(curlwp))
806			;	/* do nothing */
807	} else
808		suspendsched();
809
810	pmf_system_shutdown(boothowto);
811
812	/* Disable interrupts. */
813	s = splhigh();
814
815	/* Do a dump if requested. */
816	if ((howto & (RB_DUMP | RB_HALT)) == RB_DUMP)
817		dumpsys();
818
819haltsys:
820	doshutdownhooks();
821
	if ((howto & RB_POWERDOWN) == RB_POWERDOWN) {
823#ifndef XEN
824#if NACPICA > 0
825		if (s != IPL_NONE)
826			splx(s);
827
828		acpi_enter_sleep_state(ACPI_STATE_S5);
829#endif
830#else /* XEN */
831		HYPERVISOR_shutdown();
832#endif /* XEN */
833	}
834
835	cpu_broadcast_halt();
836
837	if (howto & RB_HALT) {
838#if NACPICA > 0
839		acpi_disable();
840#endif
841
842		printf("\n");
843		printf("The operating system has halted.\n");
844		printf("Please press any key to reboot.\n\n");
845		cnpollc(1);	/* for proper keyboard command handling */
846		cngetc();
847		cnpollc(0);
848	}
849
850	printf("rebooting...\n");
851	if (cpureset_delay > 0)
852		delay(cpureset_delay * 1000);
853	cpu_reset();
854	for(;;) ;
855	/*NOTREACHED*/
856}
857
858/*
859 * XXXfvdl share dumpcode.
860 */
861
/*
863 * Perform assorted dump-related initialization tasks.  Assumes that
864 * the maximum physical memory address will not increase afterwards.
865 */
866void
867dump_misc_init(void)
868{
869#ifndef NO_SPARSE_DUMP
870	int i;
871#endif
872
873	if (dump_headerbuf != NULL)
874		return; /* already called */
875
876#ifndef NO_SPARSE_DUMP
877	for (i = 0; i < mem_cluster_cnt; ++i) {
878		paddr_t top = mem_clusters[i].start + mem_clusters[i].size;
879		if (max_paddr < top)
880			max_paddr = top;
881	}
882#ifdef DEBUG
883	printf("dump_misc_init: max_paddr = 0x%lx\n",
884	    (unsigned long)max_paddr);
885#endif
886	if (max_paddr == 0) {
887		printf("Your machine does not initialize mem_clusters; "
888		    "sparse_dumps disabled\n");
889		sparse_dump = 0;
890	} else {
891		sparse_dump_physmap = (void *)uvm_km_alloc(kernel_map,
892		    roundup(max_paddr / (PAGE_SIZE * NBBY), PAGE_SIZE),
893		    PAGE_SIZE, UVM_KMF_WIRED|UVM_KMF_ZERO);
894	}
895#endif
896	dump_headerbuf = (void *)uvm_km_alloc(kernel_map,
897	    dump_headerbuf_size,
898	    PAGE_SIZE, UVM_KMF_WIRED|UVM_KMF_ZERO);
899	/* XXXjld should check for failure here, disable dumps if so. */
900}
901
902#ifndef NO_SPARSE_DUMP
903/*
904 * Clear the set of pages to include in a sparse dump.
905 */
906void
907sparse_dump_reset(void)
908{
909	memset(sparse_dump_physmap, 0,
910	    roundup(max_paddr / (PAGE_SIZE * NBBY), PAGE_SIZE));
911}
912
913/*
914 * Include or exclude pages in a sparse dump, by half-open virtual
915 * address interval (which may wrap around the end of the space).
916 */
917void
918sparse_dump_mark(vaddr_t vbegin, vaddr_t vend, int includep)
919{
920	pmap_t pmap;
921	paddr_t p;
922	vaddr_t v;
923
924	/*
925	 * If a partial page is called for, the whole page must be included.
926	 */
927	if (includep) {
928		vbegin = rounddown(vbegin, PAGE_SIZE);
929		vend = roundup(vend, PAGE_SIZE);
930	} else {
931		vbegin = roundup(vbegin, PAGE_SIZE);
932		vend = rounddown(vend, PAGE_SIZE);
933	}
934
935	pmap = pmap_kernel();
936	for (v = vbegin; v != vend; v += PAGE_SIZE) {
937		if (pmap_extract(pmap, v, &p)) {
938			if (includep)
939				setbit(sparse_dump_physmap, p/PAGE_SIZE);
940			else
941				clrbit(sparse_dump_physmap, p/PAGE_SIZE);
942		}
943	}
944}
945
946/*
947 * Machine-dependently decides on the contents of a sparse dump, using
948 * the above.
949 */
950void
951cpu_dump_prep_sparse(void)
952{
953	sparse_dump_reset();
954	/* XXX could the alternate recursive page table be skipped? */
955	sparse_dump_mark((vaddr_t)PTE_BASE, (vaddr_t)KERN_BASE, 1);
956	/* Memory for I/O buffers could be unmarked here, for example. */
957	/* The kernel text could also be unmarked, but gdb would be upset. */
958}
959#endif
960
961/*
962 * Abstractly iterate over the collection of memory segments to be
963 * dumped; the callback lacks the customary environment-pointer
964 * argument because none of the current users really need one.
965 *
966 * To be used only after dump_seg_prep is called to set things up.
967 */
968int
969dump_seg_iter(int (*callback)(paddr_t, paddr_t))
970{
971	int error, i;
972
973#define CALLBACK(start,size) do {     \
974	error = callback(start,size); \
975	if (error)                    \
976		return error;         \
977} while(0)
978
979	for (i = 0; i < mem_cluster_cnt; ++i) {
980#ifndef NO_SPARSE_DUMP
981		/*
982		 * The bitmap is scanned within each memory segment,
983		 * rather than over its entire domain, in case any
984		 * pages outside of the memory proper have been mapped
985		 * into kva; they might be devices that wouldn't
986		 * appreciate being arbitrarily read, and including
987		 * them could also break the assumption that a sparse
988		 * dump will always be smaller than a full one.
989		 */
990		if (sparse_dump && sparse_dump_physmap) {
991			paddr_t p, start, end;
992			int lastset;
993
994			start = mem_clusters[i].start;
995			end = start + mem_clusters[i].size;
996			start = rounddown(start, PAGE_SIZE); /* unnecessary? */
997			lastset = 0;
998			for (p = start; p < end; p += PAGE_SIZE) {
999				int thisset = isset(sparse_dump_physmap,
1000				    p/PAGE_SIZE);
1001
1002				if (!lastset && thisset)
1003					start = p;
1004				if (lastset && !thisset)
1005					CALLBACK(start, p - start);
1006				lastset = thisset;
1007			}
1008			if (lastset)
1009				CALLBACK(start, p - start);
1010		} else
1011#endif
1012			CALLBACK(mem_clusters[i].start, mem_clusters[i].size);
1013	}
1014	return 0;
1015#undef CALLBACK
1016}
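
/*
 * Callbacks handed to dump_seg_iter() elsewhere in this file:
 * dump_seg_count_range() to size the dump, dump_header_addseg() to
 * emit the segment descriptors, and dumpsys_seg() to copy the pages
 * out to the dump device.
 */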
1017
1018/*
1019 * Prepare for an impending core dump: decide what's being dumped and
1020 * how much space it will take up.
1021 */
1022void
1023dump_seg_prep(void)
1024{
1025#ifndef NO_SPARSE_DUMP
1026	if (sparse_dump && sparse_dump_physmap)
1027		cpu_dump_prep_sparse();
1028#endif
1029
1030	dump_nmemsegs = 0;
1031	dump_npages = 0;
1032	dump_seg_iter(dump_seg_count_range);
1033
1034	dump_header_size = ALIGN(sizeof(kcore_seg_t)) +
1035	    ALIGN(sizeof(cpu_kcore_hdr_t)) +
1036	    ALIGN(dump_nmemsegs * sizeof(phys_ram_seg_t));
1037	dump_header_size = roundup(dump_header_size, dbtob(1));
1038
1039	/*
1040	 * savecore(8) will read this to decide how many pages to
1041	 * copy, and cpu_dumpconf has already used the pessimistic
1042	 * value to set dumplo, so it's time to tell the truth.
1043	 */
1044	dumpsize = dump_npages; /* XXX could these just be one variable? */
1045}
1046
1047int
1048dump_seg_count_range(paddr_t start, paddr_t size)
1049{
1050	++dump_nmemsegs;
1051	dump_npages += size / PAGE_SIZE;
1052	return 0;
1053}
1054
1055/*
1056 * A sparse dump's header may be rather large, due to the number of
1057 * "segments" emitted.  These routines manage a simple output buffer,
1058 * so that the header can be written to disk incrementally.
1059 */
1060void
1061dump_header_start(void)
1062{
1063	dump_headerbuf_ptr = dump_headerbuf;
1064	dump_header_blkno = dumplo;
1065}
1066
1067int
1068dump_header_flush(void)
1069{
1070	const struct bdevsw *bdev;
1071	size_t to_write;
1072	int error;
1073
1074	bdev = bdevsw_lookup(dumpdev);
1075	to_write = roundup(dump_headerbuf_ptr - dump_headerbuf, dbtob(1));
1076	error = bdev->d_dump(dumpdev, dump_header_blkno,
1077	    dump_headerbuf, to_write);
1078	dump_header_blkno += btodb(to_write);
1079	dump_headerbuf_ptr = dump_headerbuf;
1080	return error;
1081}
1082
1083int
1084dump_header_addbytes(const void* vptr, size_t n)
1085{
1086	const char* ptr = vptr;
1087	int error;
1088
1089	while (n > dump_headerbuf_avail) {
1090		memcpy(dump_headerbuf_ptr, ptr, dump_headerbuf_avail);
1091		ptr += dump_headerbuf_avail;
1092		n -= dump_headerbuf_avail;
1093		dump_headerbuf_ptr = dump_headerbuf_end;
1094		error = dump_header_flush();
1095		if (error)
1096			return error;
1097	}
1098	memcpy(dump_headerbuf_ptr, ptr, n);
1099	dump_headerbuf_ptr += n;
1100
1101	return 0;
1102}
1103
1104int
1105dump_header_addseg(paddr_t start, paddr_t size)
1106{
1107	phys_ram_seg_t seg = { start, size };
1108
1109	return dump_header_addbytes(&seg, sizeof(seg));
1110}
1111
1112int
1113dump_header_finish(void)
1114{
1115	memset(dump_headerbuf_ptr, 0, dump_headerbuf_avail);
1116	return dump_header_flush();
1117}
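
/*
 * Typical use of the header routines, as in dodumpsys() below
 * (sketch only, error handling omitted):
 *
 *	dump_header_start();
 *	(void)cpu_dump();		-- kcore seg, cpu header, segments
 *	(void)dump_header_finish();	-- zero-pad and flush the last block
 */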
1118
1119
1120/*
1121 * These variables are needed by /sbin/savecore
1122 */
1123uint32_t	dumpmag = 0x8fca0101;	/* magic number */
1124int 	dumpsize = 0;		/* pages */
1125long	dumplo = 0; 		/* blocks */
1126
1127/*
1128 * cpu_dumpsize: calculate size of machine-dependent kernel core dump headers
1129 * for a full (non-sparse) dump.
1130 */
1131int
1132cpu_dumpsize(void)
1133{
1134	int size;
1135
1136	size = ALIGN(sizeof(kcore_seg_t)) + ALIGN(sizeof(cpu_kcore_hdr_t)) +
1137	    ALIGN(mem_cluster_cnt * sizeof(phys_ram_seg_t));
1138	if (roundup(size, dbtob(1)) != dbtob(1))
1139		return (-1);
1140
1141	return (1);
1142}
1143
1144/*
1145 * cpu_dump_mempagecnt: calculate the size of RAM (in pages) to be dumped
1146 * for a full (non-sparse) dump.
1147 */
1148u_long
1149cpu_dump_mempagecnt(void)
1150{
1151	u_long i, n;
1152
1153	n = 0;
1154	for (i = 0; i < mem_cluster_cnt; i++)
1155		n += atop(mem_clusters[i].size);
1156	return (n);
1157}
1158
1159/*
1160 * cpu_dump: dump the machine-dependent kernel core dump headers.
1161 */
1162int
1163cpu_dump(void)
1164{
1165	int (*dump)(dev_t, daddr_t, void *, size_t);
1166	kcore_seg_t seg;
1167	cpu_kcore_hdr_t cpuhdr;
1168	const struct bdevsw *bdev;
1169
1170	bdev = bdevsw_lookup(dumpdev);
1171	if (bdev == NULL)
1172		return (ENXIO);
1173
1174	dump = bdev->d_dump;
1175
1176	/*
1177	 * Generate a segment header.
1178	 */
1179	CORE_SETMAGIC(seg, KCORE_MAGIC, MID_MACHINE, CORE_CPU);
1180	seg.c_size = dump_header_size - ALIGN(sizeof(seg));
1181	(void)dump_header_addbytes(&seg, ALIGN(sizeof(seg)));
1182
1183	/*
1184	 * Add the machine-dependent header info.
1185	 */
1186	cpuhdr.ptdpaddr = PDPpaddr;
1187	cpuhdr.nmemsegs = dump_nmemsegs;
1188	(void)dump_header_addbytes(&cpuhdr, ALIGN(sizeof(cpuhdr)));
1189
1190	/*
1191	 * Write out the memory segment descriptors.
1192	 */
1193	return dump_seg_iter(dump_header_addseg);
1194}
1195
1196/*
1197 * Doadump comes here after turning off memory management and
1198 * getting on the dump stack, either when called above, or by
1199 * the auto-restart code.
1200 */
1201#define BYTES_PER_DUMP  PAGE_SIZE /* must be a multiple of pagesize XXX small */
1202static vaddr_t dumpspace;
1203
1204vaddr_t
1205reserve_dumppages(vaddr_t p)
1206{
1207
1208	dumpspace = p;
1209	return (p + BYTES_PER_DUMP);
1210}
1211
1212int
1213dumpsys_seg(paddr_t maddr, paddr_t bytes)
1214{
1215	u_long i, m, n;
1216	daddr_t blkno;
1217	const struct bdevsw *bdev;
1218	int (*dump)(dev_t, daddr_t, void *, size_t);
1219	int error;
1220
1221	if (dumpdev == NODEV)
1222		return ENODEV;
1223	bdev = bdevsw_lookup(dumpdev);
1224	if (bdev == NULL || bdev->d_psize == NULL)
1225		return ENODEV;
1226
1227	dump = bdev->d_dump;
1228
1229	blkno = dump_header_blkno;
1230	for (i = 0; i < bytes; i += n, dump_totalbytesleft -= n) {
1231		/* Print out how many MBs we have left to go. */
1232		if ((dump_totalbytesleft % (1024*1024)) == 0)
1233			printf_nolog("%lu ", (unsigned long)
1234			    (dump_totalbytesleft / (1024 * 1024)));
1235
1236		/* Limit size for next transfer. */
1237		n = bytes - i;
1238		if (n > BYTES_PER_DUMP)
1239			n = BYTES_PER_DUMP;
1240
1241		for (m = 0; m < n; m += NBPG)
1242			pmap_kenter_pa(dumpspace + m, maddr + m,
1243			    VM_PROT_READ, 0);
1244		pmap_update(pmap_kernel());
1245
1246		error = (*dump)(dumpdev, blkno, (void *)dumpspace, n);
1247		if (error)
1248			return error;
1249		maddr += n;
1250		blkno += btodb(n);		/* XXX? */
1251
1252#if 0	/* XXX this doesn't work.  grr. */
1253		/* operator aborting dump? */
1254		if (sget() != NULL)
1255			return EINTR;
1256#endif
1257	}
1258	dump_header_blkno = blkno;
1259
1260	return 0;
1261}
1262
1263void
1264dodumpsys(void)
1265{
1266	const struct bdevsw *bdev;
1267	int dumpend, psize;
1268	int error;
1269
1270	if (dumpdev == NODEV)
1271		return;
1272
1273	bdev = bdevsw_lookup(dumpdev);
1274	if (bdev == NULL || bdev->d_psize == NULL)
1275		return;
1276	/*
1277	 * For dumps during autoconfiguration,
	 * if the dump device has already been configured...
1279	 */
1280	if (dumpsize == 0)
1281		cpu_dumpconf();
1282	if (dumplo <= 0 || dumpsize == 0) {
1283		printf("\ndump to dev %u,%u not possible\n", major(dumpdev),
1284		    minor(dumpdev));
1285		return;
1286	}
1287	printf("\ndumping to dev %llu,%llu offset %ld\n",
1288	    (unsigned long long)major(dumpdev),
1289	    (unsigned long long)minor(dumpdev), dumplo);
1290
1291	psize = bdev_size(dumpdev);
1292	printf("dump ");
1293	if (psize == -1) {
1294		printf("area unavailable\n");
1295		return;
1296	}
1297
1298#if 0	/* XXX this doesn't work.  grr. */
1299	/* toss any characters present prior to dump */
1300	while (sget() != NULL); /*syscons and pccons differ */
1301#endif
1302
1303	dump_seg_prep();
1304	dumpend = dumplo + btodb(dump_header_size) + ctod(dump_npages);
1305	if (dumpend > psize) {
1306		printf("failed: insufficient space (%d < %d)\n",
1307		    psize, dumpend);
1308		goto failed;
1309	}
1310
1311	dump_header_start();
1312	if ((error = cpu_dump()) != 0)
1313		goto err;
1314	if ((error = dump_header_finish()) != 0)
1315		goto err;
1316
1317	if (dump_header_blkno != dumplo + btodb(dump_header_size)) {
1318		printf("BAD header size (%ld [written] != %ld [expected])\n",
1319		    (long)(dump_header_blkno - dumplo),
1320		    (long)btodb(dump_header_size));
1321		goto failed;
1322	}
1323
1324	dump_totalbytesleft = roundup(ptoa(dump_npages), BYTES_PER_DUMP);
1325	error = dump_seg_iter(dumpsys_seg);
1326
1327	if (error == 0 && dump_header_blkno != dumpend) {
		printf("BAD dump size (%ld [written] != %ld [expected])\n",
		    (long)(dump_header_blkno - dumplo),
		    (long)(dumpend - dumplo));
1331		goto failed;
1332	}
1333
1334err:
1335	switch (error) {
1336
1337	case ENXIO:
1338		printf("device bad\n");
1339		break;
1340
1341	case EFAULT:
1342		printf("device not ready\n");
1343		break;
1344
1345	case EINVAL:
1346		printf("area improper\n");
1347		break;
1348
1349	case EIO:
1350		printf("i/o error\n");
1351		break;
1352
1353	case EINTR:
1354		printf("aborted from console\n");
1355		break;
1356
1357	case 0:
1358		printf("succeeded\n");
1359		break;
1360
1361	default:
1362		printf("error %d\n", error);
1363		break;
1364	}
1365failed:
1366	printf("\n\n");
1367	delay(5000000);		/* 5 seconds */
1368}
1369
1370/*
1371 * This is called by main to set dumplo and dumpsize.
1372 * Dumps always skip the first PAGE_SIZE of disk space
1373 * in case there might be a disk label stored there.
1374 * If there is extra space, put dump at the end to
1375 * reduce the chance that swapping trashes it.
1376 *
 * Sparse dumps can't be placed as close to the end as possible, because
1378 * savecore(8) has to know where to start reading in the dump device
1379 * before it has access to any of the crashed system's state.
1380 *
1381 * Note also that a sparse dump will never be larger than a full one:
1382 * in order to add a phys_ram_seg_t to the header, at least one page
1383 * must be removed.
1384 */
1385void
1386cpu_dumpconf(void)
1387{
1388	int nblks, dumpblks;	/* size of dump area */
1389
1390	if (dumpdev == NODEV)
1391		goto bad;
1392	nblks = bdev_size(dumpdev);
1393	if (nblks <= ctod(1))
1394		goto bad;
1395
1396	dumpblks = cpu_dumpsize();
1397	if (dumpblks < 0)
1398		goto bad;
1399	dumpblks += ctod(cpu_dump_mempagecnt());
1400
1401	/* If dump won't fit (incl. room for possible label), punt. */
1402	if (dumpblks > (nblks - ctod(1))) {
1403#ifndef NO_SPARSE_DUMP
1404		/* A sparse dump might (and hopefully will) fit. */
1405		dumplo = ctod(1);
1406#else
1407		/* But if we're not configured for that, punt. */
1408		goto bad;
1409#endif
1410	} else {
1411		/* Put dump at end of partition */
1412		dumplo = nblks - dumpblks;
1413	}
1414
1415	/* dumpsize is in page units, and doesn't include headers. */
1416	dumpsize = cpu_dump_mempagecnt();
1417
1418	/* Now that we've decided this will work, init ancillary stuff. */
1419	dump_misc_init();
1420	return;
1421
1422 bad:
1423	dumpsize = 0;
1424}
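
/*
 * Worked example (hypothetical numbers): with a 1 GB dump partition
 * (nblks = 2097152 DEV_BSIZE blocks) and 768 MB of RAM, a full dump
 * needs ctod(cpu_dump_mempagecnt()) = 1572864 blocks plus one header
 * block, so it fits and dumplo ends up near the end of the partition.
 * With more RAM than the partition can hold, dumplo falls back to
 * ctod(1) and only a sparse dump can succeed.
 */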
1425
1426/*
1427 * Clear registers on exec
1428 */
1429void
1430setregs(struct lwp *l, struct exec_package *pack, vaddr_t stack)
1431{
1432	struct pcb *pcb = lwp_getpcb(l);
1433	struct trapframe *tf;
1434
1435	/* If we were using the FPU, forget about it. */
1436	if (pcb->pcb_fpcpu != NULL) {
1437		fpusave_lwp(l, false);
1438	}
1439
1440#ifdef USER_LDT
1441	pmap_ldt_cleanup(l);
1442#endif
1443
1444	l->l_md.md_flags &= ~MDP_USEDFPU;
1445	pcb->pcb_flags = 0;
1446	pcb->pcb_savefpu.fp_fxsave.fx_fcw = __NetBSD_NPXCW__;
1447	pcb->pcb_savefpu.fp_fxsave.fx_mxcsr = __INITIAL_MXCSR__;
1448	pcb->pcb_savefpu.fp_fxsave.fx_mxcsr_mask = __INITIAL_MXCSR_MASK__;
1449
1450	l->l_proc->p_flag &= ~PK_32;
1451
1452	tf = l->l_md.md_regs;
1453	tf->tf_ds = LSEL(LUDATA_SEL, SEL_UPL);
1454	tf->tf_es = LSEL(LUDATA_SEL, SEL_UPL);
1455	cpu_fsgs_zero(l);
1456	tf->tf_rdi = 0;
1457	tf->tf_rsi = 0;
1458	tf->tf_rbp = 0;
1459	tf->tf_rbx = l->l_proc->p_psstrp;
1460	tf->tf_rdx = 0;
1461	tf->tf_rcx = 0;
1462	tf->tf_rax = 0;
1463	tf->tf_rip = pack->ep_entry;
1464	tf->tf_cs = LSEL(LUCODE_SEL, SEL_UPL);
1465	tf->tf_rflags = PSL_USERSET;
1466	tf->tf_rsp = stack;
1467	tf->tf_ss = LSEL(LUDATA_SEL, SEL_UPL);
1468}
1469
1470/*
1471 * Initialize segments and descriptor tables
1472 */
1473
1474#ifdef XEN
1475struct trap_info *xen_idt;
1476int xen_idt_idx;
1477#endif
1478char *ldtstore;
1479char *gdtstore;
1480
1481void
1482setgate(struct gate_descriptor *gd, void *func, int ist, int type, int dpl, int sel)
1483{
1484
1485	kpreempt_disable();
1486	pmap_changeprot_local(idt_vaddr, VM_PROT_READ|VM_PROT_WRITE);
1487
1488	gd->gd_looffset = (uint64_t)func & 0xffff;
1489	gd->gd_selector = sel;
1490	gd->gd_ist = ist;
1491	gd->gd_type = type;
1492	gd->gd_dpl = dpl;
1493	gd->gd_p = 1;
1494	gd->gd_hioffset = (uint64_t)func >> 16;
1495	gd->gd_zero = 0;
1496	gd->gd_xx1 = 0;
1497	gd->gd_xx2 = 0;
1498	gd->gd_xx3 = 0;
1499
1500	pmap_changeprot_local(idt_vaddr, VM_PROT_READ);
1501	kpreempt_enable();
1502}
1503
1504void
1505unsetgate(struct gate_descriptor *gd)
1506{
1507
1508	kpreempt_disable();
1509	pmap_changeprot_local(idt_vaddr, VM_PROT_READ|VM_PROT_WRITE);
1510
1511	memset(gd, 0, sizeof (*gd));
1512
1513	pmap_changeprot_local(idt_vaddr, VM_PROT_READ);
1514	kpreempt_enable();
1515}
1516
1517void
1518setregion(struct region_descriptor *rd, void *base, uint16_t limit)
1519{
1520	rd->rd_limit = limit;
1521	rd->rd_base = (uint64_t)base;
1522}
1523
1524/*
1525 * Note that the base and limit fields are ignored in long mode.
1526 */
1527void
1528set_mem_segment(struct mem_segment_descriptor *sd, void *base, size_t limit,
1529	int type, int dpl, int gran, int def32, int is64)
1530{
1531	sd->sd_lolimit = (unsigned)limit;
1532	sd->sd_lobase = (unsigned long)base;
1533	sd->sd_type = type;
1534	sd->sd_dpl = dpl;
1535	sd->sd_p = 1;
1536	sd->sd_hilimit = (unsigned)limit >> 16;
1537	sd->sd_avl = 0;
1538	sd->sd_long = is64;
1539	sd->sd_def32 = def32;
1540	sd->sd_gran = gran;
1541	sd->sd_hibase = (unsigned long)base >> 24;
1542}
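
/*
 * Example of the field split above: the kernel code segment set up in
 * init_x86_64() uses base = 0, limit = 0xfffff, gran = 1, giving
 * sd_lolimit = 0xffff, sd_hilimit = 0xf and, with 4 KB granularity, a
 * descriptor that nominally spans 4 GB -- though in long mode the CPU
 * ignores base and limit anyway, as noted above.
 */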
1543
1544void
1545set_sys_segment(struct sys_segment_descriptor *sd, void *base, size_t limit,
1546	int type, int dpl, int gran)
1547{
1548	memset(sd, 0, sizeof *sd);
1549	sd->sd_lolimit = (unsigned)limit;
1550	sd->sd_lobase = (uint64_t)base;
1551	sd->sd_type = type;
1552	sd->sd_dpl = dpl;
1553	sd->sd_p = 1;
1554	sd->sd_hilimit = (unsigned)limit >> 16;
1555	sd->sd_gran = gran;
1556	sd->sd_hibase = (uint64_t)base >> 24;
1557}
1558
1559void
1560cpu_init_idt(void)
1561{
1562#ifndef XEN
1563	struct region_descriptor region;
1564
1565	setregion(&region, idt, NIDT * sizeof(idt[0]) - 1);
1566	lidt(&region);
1567#else
1568	if (HYPERVISOR_set_trap_table(xen_idt))
1569		panic("HYPERVISOR_set_trap_table() failed");
1570#endif
1571}
1572
1573#define	IDTVEC(name)	__CONCAT(X, name)
1574typedef void (vector)(void);
1575extern vector IDTVEC(syscall);
1576extern vector IDTVEC(syscall32);
1577extern vector IDTVEC(osyscall);
1578extern vector IDTVEC(oosyscall);
1579extern vector *IDTVEC(exceptions)[];
1580
1581static void
1582init_x86_64_msgbuf(void)
1583{
1584	/* Message buffer is located at end of core. */
1585	struct vm_physseg *vps;
1586	psize_t sz = round_page(MSGBUFSIZE);
1587	psize_t reqsz = sz;
1588	int x;
1589
1590 search_again:
1591	vps = NULL;
1592
1593	for (x = 0; x < vm_nphysseg; x++) {
1594		vps = VM_PHYSMEM_PTR(x);
1595		if (ctob(vps->avail_end) == avail_end)
1596			break;
1597	}
1598	if (x == vm_nphysseg)
1599		panic("init_x86_64: can't find end of memory");
1600
1601	/* Shrink so it'll fit in the last segment. */
1602	if ((vps->avail_end - vps->avail_start) < atop(sz))
1603		sz = ctob(vps->avail_end - vps->avail_start);
1604
1605	vps->avail_end -= atop(sz);
1606	vps->end -= atop(sz);
	msgbuf_p_seg[msgbuf_p_cnt].sz = sz;
	msgbuf_p_seg[msgbuf_p_cnt++].paddr = ctob(vps->avail_end);
1609
1610	/* Remove the last segment if it now has no pages. */
1611	if (vps->start == vps->end) {
1612		for (vm_nphysseg--; x < vm_nphysseg; x++)
1613			VM_PHYSMEM_PTR_SWAP(x, x + 1);
1614	}
1615
1616	/* Now find where the new avail_end is. */
1617	for (avail_end = 0, x = 0; x < vm_nphysseg; x++)
1618		if (VM_PHYSMEM_PTR(x)->avail_end > avail_end)
1619			avail_end = VM_PHYSMEM_PTR(x)->avail_end;
1620	avail_end = ctob(avail_end);
1621
1622	if (sz == reqsz)
1623		return;
1624
1625	reqsz -= sz;
1626	if (msgbuf_p_cnt == VM_PHYSSEG_MAX) {
1627		/* No more segments available, bail out. */
1628		printf("WARNING: MSGBUFSIZE (%zu) too large, using %zu.\n",
1629		    (size_t)MSGBUFSIZE, (size_t)(MSGBUFSIZE - reqsz));
1630		return;
1631	}
1632
1633	sz = reqsz;
1634	goto search_again;
1635}
1636
1637static void
1638init_x86_64_ksyms(void)
1639{
1640#if NKSYMS || defined(DDB) || defined(MODULAR)
1641	extern int end;
1642	extern int *esym;
1643#ifndef XEN
1644	struct btinfo_symtab *symtab;
1645	vaddr_t tssym, tesym;
1646#endif
1647
1648#ifdef DDB
1649	db_machine_init();
1650#endif
1651
1652#ifndef XEN
1653	symtab = lookup_bootinfo(BTINFO_SYMTAB);
1654	if (symtab) {
1655		tssym = (vaddr_t)symtab->ssym + KERNBASE;
1656		tesym = (vaddr_t)symtab->esym + KERNBASE;
1657		ksyms_addsyms_elf(symtab->nsym, (void *)tssym, (void *)tesym);
1658	} else
1659		ksyms_addsyms_elf(*(long *)(void *)&end,
1660		    ((long *)(void *)&end) + 1, esym);
1661#else  /* XEN */
1662	esym = xen_start_info.mod_start ?
1663	    (void *)xen_start_info.mod_start :
1664	    (void *)xen_start_info.mfn_list;
1665	ksyms_addsyms_elf(*(int *)(void *)&end,
1666	    ((int *)(void *)&end) + 1, esym);
1667#endif /* XEN */
1668#endif
1669}
1670
1671void
1672init_x86_64(paddr_t first_avail)
1673{
1674	extern void consinit(void);
1675	struct region_descriptor region;
1676	struct mem_segment_descriptor *ldt_segp;
1677	struct pcb *pcb;
1678	int x;
1679#ifndef XEN
1680	int ist;
1681	extern struct extent *iomem_ex;
1682#if !defined(REALEXTMEM) && !defined(REALBASEMEM)
1683	struct btinfo_memmap *bim;
1684#endif
1685#endif /* !XEN */
1686
1687	cpu_probe(&cpu_info_primary);
1688
1689#ifdef XEN
1690	KASSERT(HYPERVISOR_shared_info != NULL);
1691	cpu_info_primary.ci_vcpu = &HYPERVISOR_shared_info->vcpu_info[0];
1692
1693	__PRINTK(("init_x86_64(0x%lx)\n", first_avail));
1694#endif /* XEN */
1695
1696	cpu_init_msrs(&cpu_info_primary, true);
1697
1698	pcb = lwp_getpcb(&lwp0);
1699
1700	use_pae = 1; /* PAE always enabled in long mode */
1701
1702#ifdef XEN
1703	mutex_init(&pte_lock, MUTEX_DEFAULT, IPL_VM);
1704	pcb->pcb_cr3 = xen_start_info.pt_base - KERNBASE;
1705	__PRINTK(("pcb_cr3 0x%lx\n", xen_start_info.pt_base - KERNBASE));
1706#endif
1707
1708#if NISA > 0 || NPCI > 0
1709	x86_bus_space_init();
1710#endif
1711
1712	consinit();	/* XXX SHOULD NOT BE DONE HERE */
1713
1714	/*
1715	 * Initialize PAGE_SIZE-dependent variables.
1716	 */
1717	uvm_setpagesize();
1718
1719	uvmexp.ncolors = 2;
1720
1721#ifndef XEN
1722	/*
1723	 * Low memory reservations:
1724	 * Page 0:	BIOS data
1725	 * Page 1:	BIOS callback (not used yet, for symmetry with i386)
1726	 * Page 2:	MP bootstrap
1727	 * Page 3:	ACPI wakeup code
1728	 * Page 4:	Temporary page table for 0MB-4MB
1729	 * Page 5:	Temporary page directory
1730	 * Page 6:	Temporary page map level 3
1731	 * Page 7:	Temporary page map level 4
1732	 */
1733	avail_start = 8 * PAGE_SIZE;
1734
1735#if !defined(REALBASEMEM) && !defined(REALEXTMEM)
1736
1737	/*
1738	 * Check to see if we have a memory map from the BIOS (passed
	 * to us by the boot program).
1740	 */
1741	bim = lookup_bootinfo(BTINFO_MEMMAP);
1742	if (bim != NULL && bim->num > 0)
1743		initx86_parse_memmap(bim, iomem_ex);
1744
1745#endif	/* ! REALBASEMEM && ! REALEXTMEM */
1746
1747	/*
1748	 * If the loop above didn't find any valid segment, fall back to
1749	 * former code.
1750	 */
1751	if (mem_cluster_cnt == 0)
1752		initx86_fake_memmap(iomem_ex);
1753
1754#else	/* XEN */
	/* Parse Xen command line (replaces bootinfo) */
1756	xen_parse_cmdline(XEN_PARSE_BOOTFLAGS, NULL);
1757
1758	/* Determine physical address space */
1759	avail_start = first_avail;
1760	avail_end = ctob(xen_start_info.nr_pages);
1761	pmap_pa_start = (KERNTEXTOFF - KERNBASE);
1762	pmap_pa_end = avail_end;
1763	__PRINTK(("pmap_pa_start 0x%lx avail_start 0x%lx avail_end 0x%lx\n",
1764	    pmap_pa_start, avail_start, avail_end));
1765#endif	/* !XEN */
1766
1767	/*
1768	 * Call pmap initialization to make new kernel address space.
1769	 * We must do this before loading pages into the VM system.
1770	 */
1771	pmap_bootstrap(VM_MIN_KERNEL_ADDRESS);
1772
1773	if (avail_start != PAGE_SIZE)
1774		pmap_prealloc_lowmem_ptps();
1775
1776#ifndef XEN
1777	initx86_load_memmap(first_avail);
1778
1779#else	/* XEN */
1780	kern_end = KERNBASE + first_avail;
1781	physmem = xen_start_info.nr_pages;
1782
1783	uvm_page_physload(atop(avail_start),
1784		atop(avail_end), atop(avail_start),
1785		atop(avail_end), VM_FREELIST_DEFAULT);
1786#endif	/* !XEN */
1787
1788	init_x86_64_msgbuf();
1789
1790	pmap_growkernel(VM_MIN_KERNEL_ADDRESS + 32 * 1024 * 1024);
1791
1792	kpreempt_disable();
1793	pmap_kenter_pa(idt_vaddr, idt_paddr, VM_PROT_READ|VM_PROT_WRITE, 0);
1794	pmap_update(pmap_kernel());
1795	memset((void *)idt_vaddr, 0, PAGE_SIZE);
1796
1797#ifndef XEN
1798	pmap_changeprot_local(idt_vaddr, VM_PROT_READ);
1799#endif
1800	pmap_kenter_pa(idt_vaddr + PAGE_SIZE, idt_paddr + PAGE_SIZE,
1801	    VM_PROT_READ|VM_PROT_WRITE, 0);
1802#ifdef XEN
1803	/* Steal one more page for LDT */
1804	pmap_kenter_pa(idt_vaddr + 2 * PAGE_SIZE, idt_paddr + 2 * PAGE_SIZE,
1805	    VM_PROT_READ|VM_PROT_WRITE, 0);
1806#endif
1807	pmap_kenter_pa(lo32_vaddr, lo32_paddr, VM_PROT_READ|VM_PROT_WRITE, 0);
1808	pmap_update(pmap_kernel());
1809
1810#ifndef XEN
1811	idt_init();
1812	idt = (struct gate_descriptor *)idt_vaddr;
1813	gdtstore = (char *)(idt + NIDT);
1814	ldtstore = gdtstore + DYNSEL_START;
1815#else
1816	xen_idt = (struct trap_info *)idt_vaddr;
1817	xen_idt_idx = 0;
	/* Xen wants page-aligned GDT/LDT in separate pages */
1819	ldtstore = (char *) roundup((vaddr_t) (xen_idt + NIDT), PAGE_SIZE);
1820	gdtstore = (char *) (ldtstore + PAGE_SIZE);
1821#endif /* XEN */
1822
1823	/* make gdt gates and memory segments */
1824	set_mem_segment(GDT_ADDR_MEM(gdtstore, GCODE_SEL), 0,
1825	    0xfffff, SDT_MEMERA, SEL_KPL, 1, 0, 1);
1826
1827	set_mem_segment(GDT_ADDR_MEM(gdtstore, GDATA_SEL), 0,
1828	    0xfffff, SDT_MEMRWA, SEL_KPL, 1, 0, 1);
1829
1830#ifndef XEN
1831	set_sys_segment(GDT_ADDR_SYS(gdtstore, GLDT_SEL), ldtstore,
1832	    LDT_SIZE - 1, SDT_SYSLDT, SEL_KPL, 0);
1833#endif
1834
1835	set_mem_segment(GDT_ADDR_MEM(gdtstore, GUCODE_SEL), 0,
1836	    x86_btop(VM_MAXUSER_ADDRESS) - 1, SDT_MEMERA, SEL_UPL, 1, 0, 1);
1837
1838	set_mem_segment(GDT_ADDR_MEM(gdtstore, GUDATA_SEL), 0,
1839	    x86_btop(VM_MAXUSER_ADDRESS) - 1, SDT_MEMRWA, SEL_UPL, 1, 0, 1);
1840
1841	/* make ldt gates and memory segments */
1842	setgate((struct gate_descriptor *)(ldtstore + LSYS5CALLS_SEL),
1843	    &IDTVEC(oosyscall), 0, SDT_SYS386CGT, SEL_UPL,
1844	    GSEL(GCODE_SEL, SEL_KPL));
1845	*(struct mem_segment_descriptor *)(ldtstore + LUCODE_SEL) =
1846	    *GDT_ADDR_MEM(gdtstore, GUCODE_SEL);
1847	*(struct mem_segment_descriptor *)(ldtstore + LUDATA_SEL) =
1848	    *GDT_ADDR_MEM(gdtstore, GUDATA_SEL);
1849
1850	/*
1851	 * 32 bit GDT entries.
1852	 */
1853
1854	set_mem_segment(GDT_ADDR_MEM(gdtstore, GUCODE32_SEL), 0,
1855	    x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMERA, SEL_UPL, 1, 1, 0);
1856
1857	set_mem_segment(GDT_ADDR_MEM(gdtstore, GUDATA32_SEL), 0,
1858	    x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMRWA, SEL_UPL, 1, 1, 0);
1859
1860	set_mem_segment(GDT_ADDR_MEM(gdtstore, GUFS_SEL), 0,
1861	    x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMRWA, SEL_UPL, 1, 1, 0);
1862
1863	set_mem_segment(GDT_ADDR_MEM(gdtstore, GUGS_SEL), 0,
1864	    x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMRWA, SEL_UPL, 1, 1, 0);
1865
1866	/*
1867	 * 32 bit LDT entries.
1868	 */
1869	ldt_segp = (struct mem_segment_descriptor *)(ldtstore + LUCODE32_SEL);
1870	set_mem_segment(ldt_segp, 0, x86_btop(VM_MAXUSER_ADDRESS32) - 1,
1871	    SDT_MEMERA, SEL_UPL, 1, 1, 0);
1872	ldt_segp = (struct mem_segment_descriptor *)(ldtstore + LUDATA32_SEL);
1873	set_mem_segment(ldt_segp, 0, x86_btop(VM_MAXUSER_ADDRESS32) - 1,
1874	    SDT_MEMRWA, SEL_UPL, 1, 1, 0);
1875
1876	/*
1877	 * Other entries.
1878	 */
1879	memcpy((struct gate_descriptor *)(ldtstore + LSOL26CALLS_SEL),
1880	    (struct gate_descriptor *)(ldtstore + LSYS5CALLS_SEL),
1881	    sizeof (struct gate_descriptor));
1882	memcpy((struct gate_descriptor *)(ldtstore + LBSDICALLS_SEL),
1883	    (struct gate_descriptor *)(ldtstore + LSYS5CALLS_SEL),
1884	    sizeof (struct gate_descriptor));
1885
1886	/* exceptions */
1887	for (x = 0; x < 32; x++) {
1888#ifndef XEN
1889		idt_vec_reserve(x);
1890		switch (x) {
1891		case 2:	/* NMI */
1892			ist = 3;
1893			break;
1894		case 8:	/* double fault */
1895			ist = 2;
1896			break;
1897		default:
1898			ist = 0;
1899			break;
1900		}
1901		setgate(&idt[x], IDTVEC(exceptions)[x], ist, SDT_SYS386IGT,
1902		    (x == 3 || x == 4) ? SEL_UPL : SEL_KPL,
1903		    GSEL(GCODE_SEL, SEL_KPL));
1904#else /* XEN */
1905		pmap_changeprot_local(idt_vaddr, VM_PROT_READ|VM_PROT_WRITE);
1906		xen_idt[xen_idt_idx].vector = x;
1907
1908		switch (x) {
1909		case 2:  /* NMI */
1910		case 18: /* MCA */
1911			TI_SET_IF(&(xen_idt[xen_idt_idx]), 2);
1912			break;
1913		case 3:
1914		case 4:
1915			xen_idt[xen_idt_idx].flags = SEL_UPL;
1916			break;
1917		default:
1918			xen_idt[xen_idt_idx].flags = SEL_KPL;
1919			break;
1920		}
1921
1922		xen_idt[xen_idt_idx].cs = GSEL(GCODE_SEL, SEL_KPL);
1923		xen_idt[xen_idt_idx].address =
1924		    (unsigned long)IDTVEC(exceptions)[x];
1925		xen_idt_idx++;
1926#endif /* XEN */
1927	}
1928
1929	/* new-style interrupt gate for syscalls */
1930#ifndef XEN
1931	idt_vec_reserve(128);
1932	setgate(&idt[128], &IDTVEC(osyscall), 0, SDT_SYS386IGT, SEL_UPL,
1933	    GSEL(GCODE_SEL, SEL_KPL));
1934#else
1935	xen_idt[xen_idt_idx].vector = 128;
1936	xen_idt[xen_idt_idx].flags = SEL_KPL;
1937	xen_idt[xen_idt_idx].cs = GSEL(GCODE_SEL, SEL_KPL);
1938	xen_idt[xen_idt_idx].address =  (unsigned long) &IDTVEC(osyscall);
1939	xen_idt_idx++;
1940	pmap_changeprot_local(idt_vaddr, VM_PROT_READ);
1941#endif /* XEN */
1942	kpreempt_enable();
1943
1944	setregion(&region, gdtstore, DYNSEL_START - 1);
1945	lgdt(&region);
1946
1947#ifdef XEN
1948	/* Init Xen callbacks and syscall handlers */
1949	if (HYPERVISOR_set_callbacks(
1950	    (unsigned long) hypervisor_callback,
1951	    (unsigned long) failsafe_callback,
1952	    (unsigned long) Xsyscall))
1953		panic("HYPERVISOR_set_callbacks() failed");
1954#endif /* XEN */
1955	cpu_init_idt();
1956
1957	init_x86_64_ksyms();
1958
1959#ifndef XEN
1960	intr_default_setup();
1961#else
1962	events_default_setup();
1963#endif
1964
1965	splraise(IPL_HIGH);
1966	x86_enable_intr();
1967
1968#ifdef DDB
1969	if (boothowto & RB_KDB)
1970		Debugger();
1971#endif
1972#ifdef KGDB
1973	kgdb_port_init();
1974	if (boothowto & RB_KDB) {
1975		kgdb_debug_init = 1;
1976		kgdb_connect(1);
1977	}
1978#endif
1979}
1980
1981void
1982cpu_reset(void)
1983{
1984	x86_disable_intr();
1985
1986#ifdef XEN
1987	HYPERVISOR_reboot();
1988#else
1989
1990	x86_reset();
1991
1992	/*
1993	 * Try to cause a triple fault and watchdog reset by making the IDT
1994	 * invalid and causing a fault.
1995	 */
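	/*
	 * Once the IDT is zeroed, the breakpoint below cannot be delivered;
	 * the fault escalates to a double and then a triple fault, which
	 * resets the processor.
	 */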
1996	kpreempt_disable();
1997	pmap_changeprot_local(idt_vaddr, VM_PROT_READ|VM_PROT_WRITE);
1998	pmap_changeprot_local(idt_vaddr + PAGE_SIZE,
1999	    VM_PROT_READ|VM_PROT_WRITE);
2000	memset((void *)idt, 0, NIDT * sizeof(idt[0]));
2001	kpreempt_enable();
2002	breakpoint();
2003
2004#if 0
2005	/*
2006	 * Try to cause a triple fault and watchdog reset by unmapping the
2007	 * entire address space and doing a TLB flush.
2008	 */
2009	memset((void *)PTD, 0, PAGE_SIZE);
2010	tlbflush();
2011#endif
2012#endif	/* XEN */
2013
2014	for (;;);
2015}
2016
2017void
2018cpu_getmcontext(struct lwp *l, mcontext_t *mcp, unsigned int *flags)
2019{
2020	const struct trapframe *tf = l->l_md.md_regs;
2021	__greg_t ras_rip;
2022
2023	/* Copy general registers member by member */
2024#define copy_from_tf(reg, REG, idx) mcp->__gregs[_REG_##REG] = tf->tf_##reg;
2025	_FRAME_GREG(copy_from_tf)
2026#undef copy_from_tf
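	/*
	 * The _FRAME_GREG(copy_from_tf) expansion above produces one
	 * assignment per register, e.g.:
	 *	mcp->__gregs[_REG_RDI] = tf->tf_rdi;
	 */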
2027
2028	if ((ras_rip = (__greg_t)ras_lookup(l->l_proc,
2029	    (void *) mcp->__gregs[_REG_RIP])) != -1)
2030		mcp->__gregs[_REG_RIP] = ras_rip;
2031
2032	*flags |= _UC_CPU;
2033
	mcp->_mc_tlsbase = (uintptr_t)l->l_private;
2035	*flags |= _UC_TLSBASE;
2036
2037	if ((l->l_md.md_flags & MDP_USEDFPU) != 0) {
2038		struct pcb *pcb = lwp_getpcb(l);
2039
2040		if (pcb->pcb_fpcpu) {
2041			fpusave_lwp(l, true);
2042		}
2043		memcpy(mcp->__fpregs, &pcb->pcb_savefpu.fp_fxsave,
2044		    sizeof (mcp->__fpregs));
2045		*flags |= _UC_FPU;
2046	}
2047}
2048
2049int
2050cpu_setmcontext(struct lwp *l, const mcontext_t *mcp, unsigned int flags)
2051{
2052	struct trapframe *tf = l->l_md.md_regs;
2053	const __greg_t *gr = mcp->__gregs;
2054	struct pcb *pcb = lwp_getpcb(l);
2055	struct proc *p = l->l_proc;
2056	int error;
2057	int err, trapno;
2058	int64_t rflags;
2059
2060	if ((flags & _UC_CPU) != 0) {
2061		error = cpu_mcontext_validate(l, mcp);
2062		if (error != 0)
2063			return error;
2064		/*
2065		 * save and restore some values we don't want to change.
2066		 * _FRAME_GREG(copy_to_tf) below overwrites them.
2067		 *
2068		 * XXX maybe inline this.
2069		 */
2070		rflags = tf->tf_rflags;
2071		err = tf->tf_err;
2072		trapno = tf->tf_trapno;
2073
2074		/* Copy general registers member by member */
2075#define copy_to_tf(reg, REG, idx) tf->tf_##reg = gr[_REG_##REG];
2076		_FRAME_GREG(copy_to_tf)
2077#undef copy_to_tf
2078
2079#ifdef XEN
2080		/*
		 * Xen has its own way of dealing with %cs and %ss;
		 * reset them to the proper values.
2083		 */
2084		tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);
2085		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
2086#endif
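		/*
		 * Take only the user-modifiable flag bits (PSL_USER) from
		 * the supplied context; the privileged bits keep the
		 * values saved from the trapframe above.
		 */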
2087		rflags &= ~PSL_USER;
2088		tf->tf_rflags = rflags | (gr[_REG_RFLAGS] & PSL_USER);
2089		tf->tf_err = err;
2090		tf->tf_trapno = trapno;
2091
2092		l->l_md.md_flags |= MDP_IRET;
2093	}
2094
2095	if (pcb->pcb_fpcpu != NULL)
2096		fpusave_lwp(l, false);
2097
2098	if ((flags & _UC_FPU) != 0) {
2099		memcpy(&pcb->pcb_savefpu.fp_fxsave, mcp->__fpregs,
2100		    sizeof (mcp->__fpregs));
2101		l->l_md.md_flags |= MDP_USEDFPU;
2102	}
2103
2104	if ((flags & _UC_TLSBASE) != 0)
2105		lwp_setprivate(l, (void *)(uintptr_t)mcp->_mc_tlsbase);
2106
2107	mutex_enter(p->p_lock);
2108	if (flags & _UC_SETSTACK)
2109		l->l_sigstk.ss_flags |= SS_ONSTACK;
2110	if (flags & _UC_CLRSTACK)
2111		l->l_sigstk.ss_flags &= ~SS_ONSTACK;
2112	mutex_exit(p->p_lock);
2113
2114	return 0;
2115}
2116
2117int
2118cpu_mcontext_validate(struct lwp *l, const mcontext_t *mcp)
2119{
2120	const __greg_t *gr;
2121	uint16_t sel;
2122	int error;
2123	struct pmap *pmap = l->l_proc->p_vmspace->vm_map.pmap;
2124	struct proc *p = l->l_proc;
2125	struct trapframe *tf = l->l_md.md_regs;
2126
2127	gr = mcp->__gregs;
2128
2129	if (((gr[_REG_RFLAGS] ^ tf->tf_rflags) & PSL_USERSTATIC) != 0)
2130		return EINVAL;
2131
2132	if (__predict_false(pmap->pm_ldt != NULL)) {
2133		error = valid_user_selector(l, gr[_REG_ES], NULL, 0);
2134		if (error != 0)
2135			return error;
2136
2137		error = valid_user_selector(l, gr[_REG_FS], NULL, 0);
2138		if (error != 0)
2139			return error;
2140
2141		error = valid_user_selector(l, gr[_REG_GS], NULL, 0);
2142		if (error != 0)
2143			return error;
2144
2145		if ((gr[_REG_DS] & 0xffff) == 0)
2146			return EINVAL;
2147		error = valid_user_selector(l, gr[_REG_DS], NULL, 0);
2148		if (error != 0)
2149			return error;
2150
2151#ifndef XEN
2152		if ((gr[_REG_SS] & 0xffff) == 0)
2153			return EINVAL;
2154		error = valid_user_selector(l, gr[_REG_SS], NULL, 0);
2155		if (error != 0)
2156			return error;
2157#endif
2158	} else {
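		/*
		 * No private LDT, so the selectors must be the standard
		 * user segments.  VUD/VUF/VUG pick the 32-bit or 64-bit
		 * notion of a valid user selector based on PK_32.
		 */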
2159#define VUD(sel) \
2160    ((p->p_flag & PK_32) ? VALID_USER_DSEL32(sel) : VALID_USER_DSEL(sel))
2161		sel = gr[_REG_ES] & 0xffff;
2162		if (sel != 0 && !VUD(sel))
2163			return EINVAL;
2164
2165/* XXX: Shouldn't this be FSEL32? */
2166#define VUF(sel) \
2167    ((p->p_flag & PK_32) ? VALID_USER_DSEL32(sel) : VALID_USER_DSEL(sel))
2168		sel = gr[_REG_FS] & 0xffff;
2169		if (sel != 0 && !VUF(sel))
2170			return EINVAL;
2171
2172#define VUG(sel) \
2173    ((p->p_flag & PK_32) ? VALID_USER_GSEL32(sel) : VALID_USER_DSEL(sel))
2174		sel = gr[_REG_GS] & 0xffff;
2175		if (sel != 0 && !VUG(sel))
2176			return EINVAL;
2177
2178		sel = gr[_REG_DS] & 0xffff;
2179		if (!VUD(sel))
2180			return EINVAL;
2181
2182#ifndef XEN
2183		sel = gr[_REG_SS] & 0xffff;
2184		if (!VUD(sel))
2185			return EINVAL;
2186#endif
2187
2188	}
2189
2190#ifndef XEN
2191#define VUC(sel) \
2192    ((p->p_flag & PK_32) ? VALID_USER_CSEL32(sel) : VALID_USER_CSEL(sel))
2193	sel = gr[_REG_CS] & 0xffff;
2194	if (!VUC(sel))
2195		return EINVAL;
2196#endif
2197
2198	if (gr[_REG_RIP] >= VM_MAXUSER_ADDRESS)
2199		return EINVAL;
2200	return 0;
2201}
2202
2203void
2204cpu_initclocks(void)
2205{
2206	(*initclock_func)();
2207}
2208
2209int
2210memseg_baseaddr(struct lwp *l, uint64_t seg, char *ldtp, int llen,
2211		uint64_t *addr)
2212{
2213	int off, len;
2214	char *dt;
2215	struct mem_segment_descriptor *sdp;
2216	struct proc *p = l->l_proc;
	struct pmap *pmap = p->p_vmspace->vm_map.pmap;
2218	uint64_t base;
2219
2220	seg &= 0xffff;
2221
2222	if (seg == 0) {
2223		if (addr != NULL)
2224			*addr = 0;
2225		return 0;
2226	}
2227
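	/*
	 * Masking with 0xfff8 turns the selector into a byte offset into
	 * the descriptor table; bit 2 (SEL_LDT) selects the LDT instead of
	 * the GDT, and the low two bits are the RPL.
	 */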
2228	off = (seg & 0xfff8);
2229	if (seg & SEL_LDT) {
2230		if (ldtp != NULL) {
2231			dt = ldtp;
2232			len = llen;
2233		} else if (pmap->pm_ldt != NULL) {
2234			len = pmap->pm_ldt_len; /* XXX broken */
2235			dt = (char *)pmap->pm_ldt;
2236		} else {
2237			dt = ldtstore;
2238			len = LDT_SIZE;
2239		}
2240
2241		if (off > (len - 8))
2242			return EINVAL;
2243	} else {
		if (seg != GSEL(GUDATA_SEL, SEL_UPL) &&
		    seg != GSEL(GUDATA32_SEL, SEL_UPL))
			return EINVAL;
		dt = gdtstore;
2246	}
2247
2248	sdp = (struct mem_segment_descriptor *)(dt + off);
2249	if (sdp->sd_type < SDT_MEMRO || sdp->sd_p == 0)
2250		return EINVAL;
2251
2252	base = ((uint64_t)sdp->sd_hibase << 32) | ((uint64_t)sdp->sd_lobase);
2253	if (sdp->sd_gran == 1)
2254		base <<= PAGE_SHIFT;
2255
2256	if (base >= VM_MAXUSER_ADDRESS)
2257		return EINVAL;
2258
2259	if (addr == NULL)
2260		return 0;
2261
2262	*addr = base;
2263
2264	return 0;
2265}
2266
2267int
2268valid_user_selector(struct lwp *l, uint64_t seg, char *ldtp, int len)
2269{
2270	return memseg_baseaddr(l, seg, ldtp, len, NULL);
2271}
2272
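/*
 * Machine-dependent check for kernel-address access: the kernel image
 * below __data_start (text/rodata) must not be written, module space is
 * checked against its mapping protection, and any other address is left
 * for the caller to handle (*handled = false).
 */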
2273int
2274mm_md_kernacc(void *ptr, vm_prot_t prot, bool *handled)
2275{
2276	extern int start, __data_start;
2277	const vaddr_t v = (vaddr_t)ptr;
2278
2279	if (v >= (vaddr_t)&start && v < (vaddr_t)kern_end) {
2280		*handled = true;
2281		if (v < (vaddr_t)&__data_start && (prot & VM_PROT_WRITE))
2282			return EFAULT;
2283
2284	} else if (v >= module_start && v < module_end) {
2285		*handled = true;
2286		if (!uvm_map_checkprot(module_map, v, v + 1, prot))
2287			return EFAULT;
2288	} else {
2289		*handled = false;
2290	}
2291	return 0;
2292}
2293
2294/*
 * Zero out an LWP's TLS context: the %fs and %gs selectors and their bases.
2296 * Used when exec'ing a new program.
2297 */
2298
2299void
2300cpu_fsgs_zero(struct lwp *l)
2301{
2302	struct trapframe * const tf = l->l_md.md_regs;
2303	struct pcb *pcb;
2304	uint64_t zero = 0;
2305
2306	pcb = lwp_getpcb(l);
2307	if (l == curlwp) {
2308		kpreempt_disable();
2309		tf->tf_fs = 0;
2310		tf->tf_gs = 0;
2311		setfs(0);
2312#ifndef XEN
2313		setusergs(0);
2314#else
2315		HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, 0);
2316#endif
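		/*
		 * 64-bit processes keep their %fs/%gs bases in the
		 * FSBASE/KERNELGSBASE MSRs (set through the hypervisor
		 * under Xen), so those must be cleared explicitly; 32-bit
		 * processes take their bases from the GUFS/GUGS
		 * descriptors updated below.
		 */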
2317		if ((l->l_proc->p_flag & PK_32) == 0) {
2318#ifndef XEN
2319			wrmsr(MSR_FSBASE, 0);
2320			wrmsr(MSR_KERNELGSBASE, 0);
2321#else
2322			HYPERVISOR_set_segment_base(SEGBASE_FS, 0);
2323			HYPERVISOR_set_segment_base(SEGBASE_GS_USER, 0);
2324#endif
2325		}
2326		pcb->pcb_fs = 0;
2327		pcb->pcb_gs = 0;
2328		update_descriptor(&curcpu()->ci_gdt[GUFS_SEL], &zero);
2329		update_descriptor(&curcpu()->ci_gdt[GUGS_SEL], &zero);
2330		kpreempt_enable();
2331	} else {
2332		tf->tf_fs = 0;
2333		tf->tf_gs = 0;
2334		pcb->pcb_fs = 0;
2335		pcb->pcb_gs = 0;
2336	}
}
2339
2340/*
2341 * Load an LWP's TLS context, possibly changing the %fs and %gs selectors.
2342 * Used only for 32-bit processes.
2343 */
2344
2345void
2346cpu_fsgs_reload(struct lwp *l, int fssel, int gssel)
2347{
2348	struct trapframe *tf;
2349	struct pcb *pcb;
2350
2351	KASSERT(l->l_proc->p_flag & PK_32);
2352	tf = l->l_md.md_regs;
2353	if (l == curlwp) {
2354		pcb = lwp_getpcb(l);
2355		kpreempt_disable();
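		/*
		 * Refresh this CPU's GUFS/GUGS descriptors from the pcb
		 * before loading the new selectors, so that they refer to
		 * the current base addresses.
		 */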
2356		update_descriptor(&curcpu()->ci_gdt[GUFS_SEL], &pcb->pcb_fs);
2357		update_descriptor(&curcpu()->ci_gdt[GUGS_SEL], &pcb->pcb_gs);
2358		setfs(fssel);
2359#ifndef XEN
2360		setusergs(gssel);
2361#else
2362		HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, gssel);
2363#endif
2364		tf->tf_fs = fssel;
2365		tf->tf_gs = gssel;
2366		kpreempt_enable();
2367	} else {
2368		tf->tf_fs = fssel;
2369		tf->tf_gs = gssel;
2370	}
2371}
2374#ifdef __HAVE_DIRECT_MAP
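/*
 * The direct map linearly maps physical memory at PMAP_DIRECT_BASE..
 * PMAP_DIRECT_END, so translating between the two is simple arithmetic
 * done by PMAP_DIRECT_MAP()/PMAP_DIRECT_UNMAP().
 */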
2375bool
2376mm_md_direct_mapped_io(void *addr, paddr_t *paddr)
2377{
2378	vaddr_t va = (vaddr_t)addr;
2379
2380	if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) {
2381		*paddr = PMAP_DIRECT_UNMAP(va);
2382		return true;
2383	}
2384	return false;
2385}
2386
2387bool
2388mm_md_direct_mapped_phys(paddr_t paddr, vaddr_t *vaddr)
2389{
2390	*vaddr = PMAP_DIRECT_MAP(paddr);
2391	return true;
2392}
2393#endif
2394