/*-
 * Copyright (c) 2003,2004 Marcel Moolenaar
 * Copyright (c) 2000,2001 Doug Rabson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/sys/ia64/ia64/machdep.c 271211 2014-09-06 22:17:54Z marcel $");

#include "opt_compat.h"
#include "opt_ddb.h"
#include "opt_kstack_pages.h"
#include "opt_sched.h"
#include "opt_xtrace.h"

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/bus.h>
#include <sys/cons.h>
#include <sys/cpu.h>
#include <sys/efi.h>
#include <sys/eventhandler.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/msgbuf.h>
#include <sys/pcpu.h>
#include <sys/ptrace.h>
#include <sys/random.h>
#include <sys/reboot.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#include <sys/syscall.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/ucontext.h>
#include <sys/uio.h>
#include <sys/uuid.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <ddb/ddb.h>

#include <net/netisr.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>

#include <machine/bootinfo.h>
#include <machine/cpu.h>
#include <machine/elf.h>
#include <machine/fpu.h>
#include <machine/intr.h>
#include <machine/kdb.h>
#include <machine/mca.h>
#include <machine/md_var.h>
#include <machine/pal.h>
#include <machine/pcb.h>
#include <machine/reg.h>
#include <machine/sal.h>
#include <machine/sigframe.h>
#ifdef SMP
#include <machine/smp.h>
#endif
#include <machine/unwind.h>
#include <machine/vmparam.h>

/*
 * For atomicity reasons, we demand that pc_curthread is the first
 * field in the struct pcpu. It allows us to read the pointer with
 * a single atomic instruction:
 *	ld8 %curthread = [r13]
 * Otherwise we would first have to calculate the load address, store
 * it in a temporary register and then use that register for the load:
 *	add %temp = %offsetof(struct pcpu), r13
 *	ld8 %curthread = [%temp]
 * A context switch in between the add and the ld8 could have the
 * thread migrate to a different core. In that case, %curthread
 * would be the thread running on the original core and not actually
 * the current thread.
 */
CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);

static SYSCTL_NODE(_hw, OID_AUTO, freq, CTLFLAG_RD, 0, "");
static SYSCTL_NODE(_machdep, OID_AUTO, cpu, CTLFLAG_RD, 0, "");

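/*
 * Clock frequencies in MHz, derived from the firmware by
 * calculate_frequencies() below and exported read-only as the
 * hw.freq.* sysctls (e.g. "sysctl hw.freq.itc" with sysctl(8)).
 */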
static u_int bus_freq;
SYSCTL_UINT(_hw_freq, OID_AUTO, bus, CTLFLAG_RD, &bus_freq, 0,
    "Bus clock frequency");

static u_int cpu_freq;
SYSCTL_UINT(_hw_freq, OID_AUTO, cpu, CTLFLAG_RD, &cpu_freq, 0,
    "CPU clock frequency");

static u_int itc_freq;
SYSCTL_UINT(_hw_freq, OID_AUTO, itc, CTLFLAG_RD, &itc_freq, 0,
    "ITC frequency");

int cold = 1;
int unmapped_buf_allowed = 0;

struct bootinfo *bootinfo;

struct pcpu pcpu0;

extern u_int64_t kernel_text[], _end[];

extern u_int64_t ia64_gateway_page[];
extern u_int64_t break_sigtramp[];
extern u_int64_t epc_sigtramp[];

struct fpswa_iface *fpswa_iface;

vm_size_t ia64_pal_size;
vm_paddr_t ia64_pal_base;
vm_offset_t ia64_port_base;

u_int64_t ia64_lapic_addr = PAL_PIB_DEFAULT_ADDR;

struct ia64_pib *ia64_pib;

static int ia64_sync_icache_needed;

char machine[] = MACHINE;
SYSCTL_STRING(_hw, HW_MACHINE, machine, CTLFLAG_RD, machine, 0, "");

static char cpu_model[64];
SYSCTL_STRING(_hw, HW_MODEL, model, CTLFLAG_RD, cpu_model, 0,
    "The CPU model name");

static char cpu_family[64];
SYSCTL_STRING(_hw, OID_AUTO, family, CTLFLAG_RD, cpu_family, 0,
    "The CPU family name");

#ifdef DDB
extern vm_offset_t ksym_start, ksym_end;
#endif

struct msgbuf *msgbufp = NULL;

/* Other subsystems (e.g., ACPI) can hook this later. */
void (*cpu_idle_hook)(sbintime_t) = NULL;

struct kva_md_info kmi;

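/*
 * Decode the CPUID registers to determine the CPU family and model,
 * set the hw.model and hw.family sysctl strings accordingly, and
 * print a boot-time summary of vendor, frequency and features.
 */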
static void
identifycpu(void)
{
	char vendor[17];
	char *family_name, *model_name;
	u_int64_t features, tmp;
	int number, revision, model, family, archrev;

	/*
	 * Assumes little-endian.
	 */
	*(u_int64_t *) &vendor[0] = ia64_get_cpuid(0);
	*(u_int64_t *) &vendor[8] = ia64_get_cpuid(1);
	vendor[16] = '\0';

	tmp = ia64_get_cpuid(3);
	number = (tmp >> 0) & 0xff;
	revision = (tmp >> 8) & 0xff;
	model = (tmp >> 16) & 0xff;
	family = (tmp >> 24) & 0xff;
	archrev = (tmp >> 32) & 0xff;

	family_name = model_name = "unknown";
	switch (family) {
	case 0x07:
		family_name = "Itanium";
		model_name = "Merced";
		break;
	case 0x1f:
		family_name = "Itanium 2";
		switch (model) {
		case 0x00:
			model_name = "McKinley";
			break;
		case 0x01:
			/*
			 * Deerfield is a low-voltage variant based on the
			 * Madison core. We need circumstantial evidence
			 * (i.e. the clock frequency) to identify those.
			 * Allow for roughly 1% error margin.
			 */
			if (cpu_freq > 990 && cpu_freq < 1010)
				model_name = "Deerfield";
			else
				model_name = "Madison";
			break;
		case 0x02:
			model_name = "Madison II";
			break;
		}
		break;
	case 0x20:
		ia64_sync_icache_needed = 1;

		family_name = "Itanium 2";
		switch (model) {
		case 0x00:
			model_name = "Montecito";
			break;
		case 0x01:
			model_name = "Montvale";
			break;
		}
		break;
	}
	snprintf(cpu_family, sizeof(cpu_family), "%s", family_name);
	snprintf(cpu_model, sizeof(cpu_model), "%s", model_name);

	features = ia64_get_cpuid(4);

	printf("CPU: %s (", model_name);
	if (cpu_freq)
		printf("%u MHz ", cpu_freq);
	printf("%s)\n", family_name);
	printf("  Origin = \"%s\"  Revision = %d\n", vendor, revision);
	printf("  Features = 0x%b\n", (u_int32_t) features,
	    "\020"
	    "\001LB"	/* long branch (brl) instruction. */
	    "\002SD"	/* Spontaneous deferral. */
	    "\003AO"	/* 16-byte atomic operations (ld, st, cmpxchg). */ );
}

static void
cpu_startup(void *dummy)
{
	char nodename[16];
	struct pcpu *pc;
	struct pcpu_stats *pcs;

	/*
	 * Good {morning,afternoon,evening,night}.
	 */
	identifycpu();

#ifdef PERFMON
	perfmon_init();
#endif
	printf("real memory  = %ld (%ld MB)\n", ptoa(realmem),
	    ptoa(realmem) / 1048576);

	vm_ksubmap_init(&kmi);

	printf("avail memory = %ld (%ld MB)\n", ptoa(cnt.v_free_count),
	    ptoa(cnt.v_free_count) / 1048576);

	if (fpswa_iface == NULL)
		printf("Warning: no FPSWA package supplied\n");
	else
		printf("FPSWA Revision = 0x%lx, Entry = %p\n",
		    (long)fpswa_iface->if_rev, (void *)fpswa_iface->if_fpswa);

	/*
	 * Set up buffers, so they can be used to read disk labels.
	 */
	bufinit();
	vm_pager_bufferinit();

	/*
	 * Traverse the MADT to discover IOSAPIC and Local SAPIC
	 * information.
	 */
	ia64_probe_sapics();
	ia64_pib = pmap_mapdev(ia64_lapic_addr, sizeof(*ia64_pib));

	ia64_mca_init();

	/*
	 * Create sysctl tree for per-CPU information.
	 */
	STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
		snprintf(nodename, sizeof(nodename), "%u", pc->pc_cpuid);
		sysctl_ctx_init(&pc->pc_md.sysctl_ctx);
		pc->pc_md.sysctl_tree = SYSCTL_ADD_NODE(&pc->pc_md.sysctl_ctx,
		    SYSCTL_STATIC_CHILDREN(_machdep_cpu), OID_AUTO, nodename,
		    CTLFLAG_RD, NULL, "");
		if (pc->pc_md.sysctl_tree == NULL)
			continue;

		pcs = &pc->pc_md.stats;

		SYSCTL_ADD_ULONG(&pc->pc_md.sysctl_ctx,
		    SYSCTL_CHILDREN(pc->pc_md.sysctl_tree), OID_AUTO,
		    "nasts", CTLFLAG_RD, &pcs->pcs_nasts,
		    "Number of IPI_AST interrupts");

		SYSCTL_ADD_ULONG(&pc->pc_md.sysctl_ctx,
		    SYSCTL_CHILDREN(pc->pc_md.sysctl_tree), OID_AUTO,
		    "nclks", CTLFLAG_RD, &pcs->pcs_nclks,
		    "Number of clock interrupts");

		SYSCTL_ADD_ULONG(&pc->pc_md.sysctl_ctx,
		    SYSCTL_CHILDREN(pc->pc_md.sysctl_tree), OID_AUTO,
		    "nextints", CTLFLAG_RD, &pcs->pcs_nextints,
		    "Number of ExtINT interrupts");

		SYSCTL_ADD_ULONG(&pc->pc_md.sysctl_ctx,
		    SYSCTL_CHILDREN(pc->pc_md.sysctl_tree), OID_AUTO,
		    "nhardclocks", CTLFLAG_RD, &pcs->pcs_nhardclocks,
		    "Number of IPI_HARDCLOCK interrupts");

		SYSCTL_ADD_ULONG(&pc->pc_md.sysctl_ctx,
		    SYSCTL_CHILDREN(pc->pc_md.sysctl_tree), OID_AUTO,
		    "nhighfps", CTLFLAG_RD, &pcs->pcs_nhighfps,
		    "Number of IPI_HIGH_FP interrupts");

		SYSCTL_ADD_ULONG(&pc->pc_md.sysctl_ctx,
		    SYSCTL_CHILDREN(pc->pc_md.sysctl_tree), OID_AUTO,
		    "nhwints", CTLFLAG_RD, &pcs->pcs_nhwints,
		    "Number of hardware (device) interrupts");

		SYSCTL_ADD_ULONG(&pc->pc_md.sysctl_ctx,
		    SYSCTL_CHILDREN(pc->pc_md.sysctl_tree), OID_AUTO,
		    "npreempts", CTLFLAG_RD, &pcs->pcs_npreempts,
		    "Number of IPI_PREEMPT interrupts");

		SYSCTL_ADD_ULONG(&pc->pc_md.sysctl_ctx,
		    SYSCTL_CHILDREN(pc->pc_md.sysctl_tree), OID_AUTO,
		    "nrdvs", CTLFLAG_RD, &pcs->pcs_nrdvs,
		    "Number of IPI_RENDEZVOUS interrupts");

		SYSCTL_ADD_ULONG(&pc->pc_md.sysctl_ctx,
		    SYSCTL_CHILDREN(pc->pc_md.sysctl_tree), OID_AUTO,
		    "nstops", CTLFLAG_RD, &pcs->pcs_nstops,
		    "Number of IPI_STOP interrupts");

		SYSCTL_ADD_ULONG(&pc->pc_md.sysctl_ctx,
		    SYSCTL_CHILDREN(pc->pc_md.sysctl_tree), OID_AUTO,
		    "nstrays", CTLFLAG_RD, &pcs->pcs_nstrays,
		    "Number of stray interrupts");
	}
}
SYSINIT(cpu_startup, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);

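/*
 * Write back the data cache for the given range, one 32-byte cache
 * line at a time using the fc instruction, then issue srlz.d so the
 * flushes are architecturally visible before returning.
 */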
void
cpu_flush_dcache(void *ptr, size_t len)
{
	vm_offset_t lim, va;

	va = (uintptr_t)ptr & ~31;
	lim = (uintptr_t)ptr + len;
	while (va < lim) {
		ia64_fc(va);
		va += 32;
	}

	ia64_srlz_d();
}

/* Get current clock frequency for the given cpu id. */
int
cpu_est_clockrate(int cpu_id, uint64_t *rate)
{

	if (pcpu_find(cpu_id) == NULL || rate == NULL)
		return (EINVAL);
	*rate = (u_long)cpu_freq * 1000000ul;
	return (0);
}

void
cpu_halt()
{

	efi_reset_system();
}

void
cpu_idle(int busy)
{
	register_t ie;
	sbintime_t sbt = -1;

	if (!busy) {
		critical_enter();
		sbt = cpu_idleclock();
	}

	ie = intr_disable();
	KASSERT(ie != 0, ("%s called with interrupts disabled\n", __func__));

	if (sched_runnable())
		ia64_enable_intr();
	else if (cpu_idle_hook != NULL) {
		(*cpu_idle_hook)(sbt);
		/* The hook must enable interrupts! */
	} else {
		ia64_call_pal_static(PAL_HALT_LIGHT, 0, 0, 0);
		ia64_enable_intr();
	}

	if (!busy) {
		cpu_activeclock();
		critical_exit();
	}
}

int
cpu_idle_wakeup(int cpu)
{

	return (0);
}

void
cpu_reset()
{

	efi_reset_system();
}

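/*
 * cpu_switch() relies on savectx() behaving like setjmp(): a zero
 * return means the context was just saved, so we switch the pmap and
 * jump into the new thread with restorectx(), which does not return.
 * The old thread continues, with a non-zero return from savectx(),
 * when some later cpu_switch() restores its PCB.
 */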
void
cpu_switch(struct thread *old, struct thread *new, struct mtx *mtx)
{
	struct pcb *oldpcb, *newpcb;

	oldpcb = old->td_pcb;
#ifdef COMPAT_FREEBSD32
	ia32_savectx(oldpcb);
#endif
	if (pcpup->pc_fpcurthread == old)
		old->td_frame->tf_special.psr |= IA64_PSR_DFH;
	if (!savectx(oldpcb)) {
		newpcb = new->td_pcb;
		oldpcb->pcb_current_pmap =
		    pmap_switch(newpcb->pcb_current_pmap);

		atomic_store_rel_ptr(&old->td_lock, mtx);

#if defined(SCHED_ULE) && defined(SMP)
		while (atomic_load_acq_ptr(&new->td_lock) == &blocked_lock)
			cpu_spinwait();
#endif

		pcpup->pc_curthread = new;

#ifdef COMPAT_FREEBSD32
		ia32_restorectx(newpcb);
#endif

		if (pcpup->pc_fpcurthread == new)
			new->td_frame->tf_special.psr &= ~IA64_PSR_DFH;
		restorectx(newpcb);
		/* We should not get here. */
		panic("cpu_switch: restorectx() returned");
		/* NOTREACHED */
	}
}

void
cpu_throw(struct thread *old __unused, struct thread *new)
{
	struct pcb *newpcb;

	newpcb = new->td_pcb;
	(void)pmap_switch(newpcb->pcb_current_pmap);

#if defined(SCHED_ULE) && defined(SMP)
	while (atomic_load_acq_ptr(&new->td_lock) == &blocked_lock)
		cpu_spinwait();
#endif

	pcpup->pc_curthread = new;

#ifdef COMPAT_FREEBSD32
	ia32_restorectx(newpcb);
#endif

	restorectx(newpcb);
	/* We should not get here. */
	panic("cpu_throw: restorectx() returned");
	/* NOTREACHED */
}

void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{

	/*
	 * Set pc_acpi_id to "uninitialized".
	 * See sys/dev/acpica/acpi_cpu.c
	 */
	pcpu->pc_acpi_id = 0xffffffff;
}

void
cpu_pcpu_setup(struct pcpu *pc, u_int acpi_id, u_int sapic_id)
{

	pc->pc_acpi_id = acpi_id;
	pc->pc_md.lid = IA64_LID_SET_SAPIC_ID(sapic_id);
}

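/*
 * Spinlock enter/exit: interrupts are disabled on the outermost
 * entry and the saved interrupt state is restored on the matching
 * outermost exit; md_spinlock_count tracks the nesting depth.
 */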
void
spinlock_enter(void)
{
	struct thread *td;
	int intr;

	td = curthread;
	if (td->td_md.md_spinlock_count == 0) {
		intr = intr_disable();
		td->td_md.md_spinlock_count = 1;
		td->td_md.md_saved_intr = intr;
	} else
		td->td_md.md_spinlock_count++;
	critical_enter();
}

void
spinlock_exit(void)
{
	struct thread *td;
	int intr;

	td = curthread;
	critical_exit();
	intr = td->td_md.md_saved_intr;
	td->td_md.md_spinlock_count--;
	if (td->td_md.md_spinlock_count == 0)
		intr_restore(intr);
}

void
kdb_cpu_trap(int vector, int code __unused)
{

#ifdef XTRACE
	ia64_xtrace_stop();
#endif
	__asm __volatile("flushrs;;");

	/* Restart after the break instruction. */
	if (vector == IA64_VEC_BREAK &&
	    kdb_frame->tf_special.ifa == IA64_FIXED_BREAK)
		kdb_frame->tf_special.psr += IA64_PSR_RI_1;
}

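/*
 * map_vhpt(), map_pal_code() and map_gateway_page() all follow the
 * same pattern for pinning a translation: purge any overlapping TC
 * entries (ptr.d/ptr.i), disable PSR.ic and interrupts so no TLB
 * fault can be taken while the insertion is in progress, program
 * IFA/ITIR, insert the translation register (itr.d/itr.i) and then
 * restore the PSR and serialize.
 */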
void
map_vhpt(uintptr_t vhpt)
{
	pt_entry_t pte;
	uint64_t psr;

	pte = PTE_PRESENT | PTE_MA_WB | PTE_ACCESSED | PTE_DIRTY |
	    PTE_PL_KERN | PTE_AR_RW;
	pte |= vhpt & PTE_PPN_MASK;

	__asm __volatile("ptr.d %0,%1" :: "r"(vhpt),
	    "r"(pmap_vhpt_log2size << 2));

	__asm __volatile("mov   %0=psr" : "=r"(psr));
	__asm __volatile("rsm   psr.ic|psr.i");
	ia64_srlz_i();
	ia64_set_ifa(vhpt);
	ia64_set_itir(pmap_vhpt_log2size << 2);
	ia64_srlz_d();
	__asm __volatile("itr.d dtr[%0]=%1" :: "r"(3), "r"(pte));
	__asm __volatile("mov   psr.l=%0" :: "r" (psr));
	ia64_srlz_i();
}

void
map_pal_code(void)
{
	pt_entry_t pte;
	vm_offset_t va;
	vm_size_t sz;
	uint64_t psr;
	u_int shft;

	if (ia64_pal_size == 0)
		return;

	va = IA64_PHYS_TO_RR7(ia64_pal_base);

	sz = ia64_pal_size;
	shft = 0;
	while (sz > 1) {
		shft++;
		sz >>= 1;
	}

	pte = PTE_PRESENT | PTE_MA_WB | PTE_ACCESSED | PTE_DIRTY |
	    PTE_PL_KERN | PTE_AR_RWX;
	pte |= ia64_pal_base & PTE_PPN_MASK;

	__asm __volatile("ptr.d %0,%1; ptr.i %0,%1" :: "r"(va), "r"(shft<<2));

	__asm __volatile("mov	%0=psr" : "=r"(psr));
	__asm __volatile("rsm	psr.ic|psr.i");
	ia64_srlz_i();
	ia64_set_ifa(va);
	ia64_set_itir(shft << 2);
	ia64_srlz_d();
	__asm __volatile("itr.d	dtr[%0]=%1" :: "r"(4), "r"(pte));
	ia64_srlz_d();
	__asm __volatile("itr.i	itr[%0]=%1" :: "r"(1), "r"(pte));
	__asm __volatile("mov	psr.l=%0" :: "r" (psr));
	ia64_srlz_i();
}

void
map_gateway_page(void)
{
	pt_entry_t pte;
	uint64_t psr;

	pte = PTE_PRESENT | PTE_MA_WB | PTE_ACCESSED | PTE_DIRTY |
	    PTE_PL_KERN | PTE_AR_X_RX;
	pte |= ia64_tpa((uint64_t)ia64_gateway_page) & PTE_PPN_MASK;

	__asm __volatile("ptr.d %0,%1; ptr.i %0,%1" ::
	    "r"(VM_MAXUSER_ADDRESS), "r"(PAGE_SHIFT << 2));

	__asm __volatile("mov	%0=psr" : "=r"(psr));
	__asm __volatile("rsm	psr.ic|psr.i");
	ia64_srlz_i();
	ia64_set_ifa(VM_MAXUSER_ADDRESS);
	ia64_set_itir(PAGE_SHIFT << 2);
	ia64_srlz_d();
	__asm __volatile("itr.d	dtr[%0]=%1" :: "r"(5), "r"(pte));
	ia64_srlz_d();
	__asm __volatile("itr.i	itr[%0]=%1" :: "r"(2), "r"(pte));
	__asm __volatile("mov	psr.l=%0" :: "r" (psr));
	ia64_srlz_i();

	/* Expose the mapping to userland in ar.k5 */
	ia64_set_k5(VM_MAXUSER_ADDRESS);
}

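/*
 * Convert a base frequency (in Hz) and a PAL-style ratio (numerator
 * in the upper 32 bits, denominator in the lower 32 bits) into a
 * frequency in MHz, rounded to the nearest MHz.
 */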
static u_int
freq_ratio(u_long base, u_long ratio)
{
	u_long f;

	f = (base * (ratio >> 32)) / (ratio & 0xfffffffful);
	return ((f + 500000) / 1000000);
}

static void
calculate_frequencies(void)
{
	struct ia64_sal_result sal;
	struct ia64_pal_result pal;
	register_t ie;

	ie = intr_disable();
	sal = ia64_sal_entry(SAL_FREQ_BASE, 0, 0, 0, 0, 0, 0, 0);
	pal = ia64_call_pal_static(PAL_FREQ_RATIOS, 0, 0, 0);
	intr_restore(ie);

	if (sal.sal_status == 0 && pal.pal_status == 0) {
		if (bootverbose) {
			printf("Platform clock frequency %ld Hz\n",
			       sal.sal_result[0]);
			printf("Processor ratio %ld/%ld, Bus ratio %ld/%ld, "
			       "ITC ratio %ld/%ld\n",
			       pal.pal_result[0] >> 32,
			       pal.pal_result[0] & ((1L << 32) - 1),
			       pal.pal_result[1] >> 32,
			       pal.pal_result[1] & ((1L << 32) - 1),
			       pal.pal_result[2] >> 32,
			       pal.pal_result[2] & ((1L << 32) - 1));
		}
		cpu_freq = freq_ratio(sal.sal_result[0], pal.pal_result[0]);
		bus_freq = freq_ratio(sal.sal_result[0], pal.pal_result[1]);
		itc_freq = freq_ratio(sal.sal_result[0], pal.pal_result[2]);
	}
}

711
712struct ia64_init_return
713ia64_init(void)
714{
715	struct ia64_init_return ret;
716	struct efi_md *md;
717	pt_entry_t *pbvm_pgtbl_ent, *pbvm_pgtbl_lim;
718	char *p;
719	vm_size_t mdlen;
720	int metadata_missing;
721
722	/*
723	 * NO OUTPUT ALLOWED UNTIL FURTHER NOTICE.
724	 */
725
726	ia64_set_fpsr(IA64_FPSR_DEFAULT);
727
728	/*
729	 * Region 6 is direct mapped UC and region 7 is direct mapped
730	 * WC. The details of this is controlled by the Alt {I,D}TLB
731	 * handlers. Here we just make sure that they have the largest
732	 * possible page size to minimise TLB usage.
733	 */
734	ia64_set_rr(IA64_RR_BASE(6), (6 << 8) | (LOG2_ID_PAGE_SIZE << 2));
735	ia64_set_rr(IA64_RR_BASE(7), (7 << 8) | (LOG2_ID_PAGE_SIZE << 2));
736	ia64_srlz_d();
737
738	/* Initialize/setup physical memory datastructures */
739	ia64_physmem_init();
740
741	/*
742	 * Process the memory map. This gives us the PAL locations,
743	 * the I/O port base address, the available memory regions
744	 * for initializing the physical memory map.
745	 */
746	for (md = efi_md_first(); md != NULL; md = efi_md_next(md)) {
747		mdlen = md->md_pages * EFI_PAGE_SIZE;
748		switch (md->md_type) {
749		case EFI_MD_TYPE_IOPORT:
750			ia64_port_base = pmap_mapdev_priv(md->md_phys,
751			    mdlen, VM_MEMATTR_UNCACHEABLE);
752			break;
753		case EFI_MD_TYPE_PALCODE:
754			ia64_pal_base = md->md_phys;
755			ia64_pal_size = mdlen;
756			/*FALLTHROUGH*/
757		case EFI_MD_TYPE_BAD:
758		case EFI_MD_TYPE_FIRMWARE:
759		case EFI_MD_TYPE_RECLAIM:
760		case EFI_MD_TYPE_RT_CODE:
761		case EFI_MD_TYPE_RT_DATA:
762			/* Don't use these memory regions. */
763			ia64_physmem_track(md->md_phys, mdlen);
764			break;
765		case EFI_MD_TYPE_BS_CODE:
766		case EFI_MD_TYPE_BS_DATA:
767		case EFI_MD_TYPE_CODE:
768		case EFI_MD_TYPE_DATA:
769		case EFI_MD_TYPE_FREE:
770			/* These are ok to use. */
771			ia64_physmem_add(md->md_phys, mdlen);
772			break;
773		}
774	}
775
776	/*
777	 * Remove the PBVM and its page table from phys_avail. The loader
778	 * passes the physical address of the page table to us. The virtual
779	 * address of the page table is fixed.
780	 * Track and the PBVM limit for later use.
781	 */
782	ia64_physmem_delete(bootinfo->bi_pbvm_pgtbl, bootinfo->bi_pbvm_pgtblsz);
783	pbvm_pgtbl_ent = (void *)IA64_PBVM_PGTBL;
784	pbvm_pgtbl_lim = (void *)(IA64_PBVM_PGTBL + bootinfo->bi_pbvm_pgtblsz);
785	while (pbvm_pgtbl_ent < pbvm_pgtbl_lim) {
786		if ((*pbvm_pgtbl_ent & PTE_PRESENT) == 0)
787			break;
788		ia64_physmem_delete(*pbvm_pgtbl_ent & PTE_PPN_MASK,
789		    IA64_PBVM_PAGE_SIZE);
790		pbvm_pgtbl_ent++;
791	}
792
793	/* Finalize physical memory datastructures */
794	ia64_physmem_fini();

	metadata_missing = 0;
	if (bootinfo->bi_modulep)
		preload_metadata = (caddr_t)bootinfo->bi_modulep;
	else
		metadata_missing = 1;

	if (envmode == 0 && bootinfo->bi_envp)
		kern_envp = (caddr_t)bootinfo->bi_envp;
	else
		kern_envp = static_env;

	/*
	 * Look at arguments passed to us and compute boothowto.
	 */
	boothowto = bootinfo->bi_boothowto;

	if (boothowto & RB_VERBOSE)
		bootverbose = 1;

	/*
	 * Wire things up so we can call the firmware.
	 */
	map_pal_code();
	efi_boot_minimal(bootinfo->bi_systab);
	ia64_xiv_init();
	ia64_sal_init();
	calculate_frequencies();

	set_cputicker(ia64_get_itc, (u_long)itc_freq * 1000000, 0);

	/*
	 * Setup the PCPU data for the bootstrap processor. It is needed
	 * by printf(). Also, since printf() uses critical sections, we
	 * need to initialize at least pc_curthread.
	 */
	pcpup = &pcpu0;
	ia64_set_k4((u_int64_t)pcpup);
	pcpu_init(pcpup, 0, sizeof(pcpu0));
	dpcpu_init(ia64_physmem_alloc(DPCPU_SIZE, PAGE_SIZE), 0);
	cpu_pcpu_setup(pcpup, ~0U, ia64_get_lid());
	pcpup->pc_curthread = &thread0;

	/*
	 * Initialize the console before we print anything out.
	 */
	cninit();

	/* OUTPUT NOW ALLOWED */

	if (metadata_missing)
		printf("WARNING: loader(8) metadata is missing!\n");

	/* Get FPSWA interface */
	fpswa_iface = (bootinfo->bi_fpswa == 0) ? NULL :
	    (struct fpswa_iface *)IA64_PHYS_TO_RR7(bootinfo->bi_fpswa);

	/* Init basic tunables, including hz */
	init_param1();

	p = getenv("kernelname");
	if (p != NULL) {
		strlcpy(kernelname, p, sizeof(kernelname));
		freeenv(p);
	}

	init_param2(physmem);

	/*
	 * Initialize error message buffer (at end of core).
	 */
	msgbufp = ia64_physmem_alloc(msgbufsize, PAGE_SIZE);
	msgbufinit(msgbufp, msgbufsize);

	proc_linkup0(&proc0, &thread0);
	/*
	 * Init mapping for kernel stack for proc 0
	 */
	p = ia64_physmem_alloc(KSTACK_PAGES * PAGE_SIZE, PAGE_SIZE);
	thread0.td_kstack = (uintptr_t)p;
	thread0.td_kstack_pages = KSTACK_PAGES;

	mutex_init();

	/*
	 * Initialize the rest of proc 0's PCB.
	 *
	 * Set the kernel sp, reserving space for an (empty) trapframe,
	 * and make proc0's trapframe pointer point to it for sanity.
	 * Initialize proc0's backing store to start after the u area.
	 */
	cpu_thread_alloc(&thread0);
	thread0.td_frame->tf_flags = FRAME_SYSCALL;
	thread0.td_pcb->pcb_special.sp =
	    (u_int64_t)thread0.td_frame - 16;
	thread0.td_pcb->pcb_special.bspstore = thread0.td_kstack;

	/*
	 * Initialize the virtual memory system.
	 */
	pmap_bootstrap();

#ifdef XTRACE
	ia64_xtrace_init_bsp();
#endif

	/*
	 * Initialize debuggers, and break into them if appropriate.
	 */
#ifdef DDB
	ksym_start = bootinfo->bi_symtab;
	ksym_end = bootinfo->bi_esymtab;
#endif

	kdb_init();

#ifdef KDB
	if (boothowto & RB_KDB)
		kdb_enter(KDB_WHY_BOOTFLAGS,
		    "Boot flags requested debugger\n");
#endif

	ia64_set_tpr(0);
	ia64_srlz_d();

	ret.bspstore = thread0.td_pcb->pcb_special.bspstore;
	ret.sp = thread0.td_pcb->pcb_special.sp;
	return (ret);
}

uint64_t
ia64_get_hcdp(void)
{

	return (bootinfo->bi_hcdp);
}

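/*
 * Clear a buffer: byte stores up to the first u_long-aligned
 * boundary, an 8-word unrolled loop for the bulk, then single words
 * and trailing bytes for the remainder.
 */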
void
bzero(void *buf, size_t len)
{
	caddr_t p = buf;

	while (((vm_offset_t) p & (sizeof(u_long) - 1)) && len) {
		*p++ = 0;
		len--;
	}
	while (len >= sizeof(u_long) * 8) {
		*(u_long*) p = 0;
		*((u_long*) p + 1) = 0;
		*((u_long*) p + 2) = 0;
		*((u_long*) p + 3) = 0;
		len -= sizeof(u_long) * 8;
		*((u_long*) p + 4) = 0;
		*((u_long*) p + 5) = 0;
		*((u_long*) p + 6) = 0;
		*((u_long*) p + 7) = 0;
		p += sizeof(u_long) * 8;
	}
	while (len >= sizeof(u_long)) {
		*(u_long*) p = 0;
		len -= sizeof(u_long);
		p += sizeof(u_long);
	}
	while (len) {
		*p++ = 0;
		len--;
	}
}

u_int
ia64_itc_freq(void)
{

	return (itc_freq);
}

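/*
 * Busy-wait for at least n microseconds by polling the ITC, whose
 * frequency is in MHz (i.e. ticks per microsecond). The thread is
 * pinned so we keep reading the same CPU's ITC; the second term of
 * the loop condition keeps us spinning when "start + itc_freq * n"
 * wrapped around the 64-bit counter.
 */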
void
DELAY(int n)
{
	u_int64_t start, end, now;

	sched_pin();

	start = ia64_get_itc();
	end = start + itc_freq * n;
	/* printf("DELAY from 0x%lx to 0x%lx\n", start, end); */
	do {
		now = ia64_get_itc();
	} while (now < end || (now > start && end < start));

	sched_unpin();
}

/*
 * Send an interrupt (signal) to a process.
 */
void
sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
	struct proc *p;
	struct thread *td;
	struct trapframe *tf;
	struct sigacts *psp;
	struct sigframe sf, *sfp;
	u_int64_t sbs, sp;
	int oonstack;
	int sig;
	u_long code;

	td = curthread;
	p = td->td_proc;
	PROC_LOCK_ASSERT(p, MA_OWNED);
	sig = ksi->ksi_signo;
	code = ksi->ksi_code;
	psp = p->p_sigacts;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
	tf = td->td_frame;
	sp = tf->tf_special.sp;
	oonstack = sigonstack(sp);
	sbs = 0;

	/* save user context */
	bzero(&sf, sizeof(struct sigframe));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = td->td_sigstk;
	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;

	/*
	 * Allocate and validate space for the signal handler
	 * context. Note that if the stack is in P0 space, the
	 * call to grow() is a nop, and the useracc() check
	 * will fail if the process has not already allocated
	 * the space with a `brk'.
	 */
	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		sbs = (u_int64_t)td->td_sigstk.ss_sp;
		sbs = (sbs + 15) & ~15;
		sfp = (struct sigframe *)(sbs + td->td_sigstk.ss_size);
#if defined(COMPAT_43)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
	} else
		sfp = (struct sigframe *)sp;
	sfp = (struct sigframe *)((u_int64_t)(sfp - 1) & ~15);

	/* Fill in the siginfo structure for POSIX handlers. */
	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		sf.sf_si = ksi->ksi_info;
		sf.sf_si.si_signo = sig;
		/*
		 * XXX this shouldn't be here after code in trap.c
		 * is fixed
		 */
		sf.sf_si.si_addr = (void*)tf->tf_special.ifa;
		code = (u_int64_t)&sfp->sf_si;
	}

	mtx_unlock(&psp->ps_mtx);
	PROC_UNLOCK(p);

	get_mcontext(td, &sf.sf_uc.uc_mcontext, 0);

	/* Copy the frame out to userland. */
	if (copyout(&sf, sfp, sizeof(sf)) != 0) {
		/*
		 * Process has trashed its stack; give it an illegal
		 * instruction to halt it in its tracks.
		 */
		PROC_LOCK(p);
		sigexit(td, SIGILL);
		return;
	}

	if ((tf->tf_flags & FRAME_SYSCALL) == 0) {
		tf->tf_special.psr &= ~IA64_PSR_RI;
		tf->tf_special.iip = ia64_get_k5() +
		    ((uint64_t)break_sigtramp - (uint64_t)ia64_gateway_page);
	} else
		tf->tf_special.iip = ia64_get_k5() +
		    ((uint64_t)epc_sigtramp - (uint64_t)ia64_gateway_page);

	/*
	 * Setup the trapframe to return to the signal trampoline. We pass
	 * information to the trampoline in the following registers:
	 *
	 *	gp	new backing store or NULL
	 *	r8	signal number
	 *	r9	signal code or siginfo pointer
	 *	r10	signal handler (function descriptor)
	 */
	tf->tf_special.sp = (u_int64_t)sfp - 16;
	tf->tf_special.gp = sbs;
	tf->tf_special.bspstore = sf.sf_uc.uc_mcontext.mc_special.bspstore;
	tf->tf_special.ndirty = 0;
	tf->tf_special.rnat = sf.sf_uc.uc_mcontext.mc_special.rnat;
	tf->tf_scratch.gr8 = sig;
	tf->tf_scratch.gr9 = code;
	tf->tf_scratch.gr10 = (u_int64_t)catcher;

	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}

/*
 * System call to clean up state after a signal has been taken.
 * Reset the signal mask and stack state from the context left by
 * sendsig (above). Return to the previous pc and psl as specified
 * by that context. Check carefully to make sure that the user has
 * not modified the state to gain improper privileges.
 *
 * MPSAFE
 */
int
sys_sigreturn(struct thread *td,
	struct sigreturn_args /* {
		ucontext_t *sigcntxp;
	} */ *uap)
{
	ucontext_t uc;
	struct trapframe *tf;
	struct pcb *pcb;

	tf = td->td_frame;
	pcb = td->td_pcb;

	/*
	 * Fetch the entire context structure at once for speed.
	 * We don't use a normal argument to simplify RSE handling.
	 */
	if (copyin(uap->sigcntxp, (caddr_t)&uc, sizeof(uc)))
		return (EFAULT);

	set_mcontext(td, &uc.uc_mcontext);

#if defined(COMPAT_43)
	if (sigonstack(tf->tf_special.sp))
		td->td_sigstk.ss_flags |= SS_ONSTACK;
	else
		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
#endif
	kern_sigprocmask(td, SIG_SETMASK, &uc.uc_sigmask, NULL, 0);

	return (EJUSTRETURN);
}

#ifdef COMPAT_FREEBSD4
int
freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
{

	return sys_sigreturn(td, (struct sigreturn_args *)uap);
}
#endif

/*
 * Construct a PCB from a trapframe. This is called from kdb_trap() where
 * we want to start a backtrace from the function that caused us to enter
 * the debugger. We have the context in the trapframe, but base the trace
 * on the PCB. The PCB doesn't have to be perfect, as long as it contains
 * enough for a backtrace.
 */
void
makectx(struct trapframe *tf, struct pcb *pcb)
{

	pcb->pcb_special = tf->tf_special;
	pcb->pcb_special.__spare = ~0UL;	/* XXX see unwind.c */
	save_callee_saved(&pcb->pcb_preserved);
	save_callee_saved_fp(&pcb->pcb_preserved_fp);
}

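/*
 * Write the dirty stacked registers, which still live in the kernel
 * backing store, out to the thread's user backing store: with
 * copyout() for curthread (after forcing a flushrs if needed), or
 * via proc_rwmem() for other threads. On success, bspstore has been
 * advanced past the flushed registers and ndirty is zero.
 */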
int
ia64_flush_dirty(struct thread *td, struct _special *r)
{
	struct iovec iov;
	struct uio uio;
	uint64_t bspst, kstk, rnat;
	int error, locked;

	if (r->ndirty == 0)
		return (0);

	kstk = td->td_kstack + (r->bspstore & 0x1ffUL);
	if (td == curthread) {
		__asm __volatile("mov	ar.rsc=0;;");
		__asm __volatile("mov	%0=ar.bspstore" : "=r"(bspst));
		/* Make sure we have all the user registers written out. */
		if (bspst - kstk < r->ndirty) {
			__asm __volatile("flushrs;;");
			__asm __volatile("mov	%0=ar.bspstore" : "=r"(bspst));
		}
		__asm __volatile("mov	%0=ar.rnat;;" : "=r"(rnat));
		__asm __volatile("mov	ar.rsc=3");
		error = copyout((void*)kstk, (void*)r->bspstore, r->ndirty);
		kstk += r->ndirty;
		r->rnat = (bspst > kstk && (bspst & 0x1ffL) < (kstk & 0x1ffL))
		    ? *(uint64_t*)(kstk | 0x1f8L) : rnat;
	} else {
		locked = PROC_LOCKED(td->td_proc);
		if (!locked)
			PHOLD(td->td_proc);
		iov.iov_base = (void*)(uintptr_t)kstk;
		iov.iov_len = r->ndirty;
		uio.uio_iov = &iov;
		uio.uio_iovcnt = 1;
		uio.uio_offset = r->bspstore;
		uio.uio_resid = r->ndirty;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_rw = UIO_WRITE;
		uio.uio_td = td;
		error = proc_rwmem(td->td_proc, &uio);
		/*
		 * XXX proc_rwmem() doesn't currently return ENOSPC,
		 * so I think it can bogusly return 0. Neither do
		 * we allow short writes.
		 */
		if (uio.uio_resid != 0 && error == 0)
			error = ENOSPC;
		if (!locked)
			PRELE(td->td_proc);
	}

	r->bspstore += r->ndirty;
	r->ndirty = 0;
	return (error);
}

int
get_mcontext(struct thread *td, mcontext_t *mc, int flags)
{
	struct trapframe *tf;
	int error;

	tf = td->td_frame;
	bzero(mc, sizeof(*mc));
	mc->mc_special = tf->tf_special;
	error = ia64_flush_dirty(td, &mc->mc_special);
	if (tf->tf_flags & FRAME_SYSCALL) {
		mc->mc_flags |= _MC_FLAGS_SYSCALL_CONTEXT;
		mc->mc_scratch = tf->tf_scratch;
		if (flags & GET_MC_CLEAR_RET) {
			mc->mc_scratch.gr8 = 0;
			mc->mc_scratch.gr9 = 0;
			mc->mc_scratch.gr10 = 0;
			mc->mc_scratch.gr11 = 0;
		}
	} else {
		mc->mc_flags |= _MC_FLAGS_ASYNC_CONTEXT;
		mc->mc_scratch = tf->tf_scratch;
		mc->mc_scratch_fp = tf->tf_scratch_fp;
		/*
		 * XXX If the thread never used the high FP registers, we
		 * probably shouldn't waste time saving them.
		 */
		ia64_highfp_save(td);
		mc->mc_flags |= _MC_FLAGS_HIGHFP_VALID;
		mc->mc_high_fp = td->td_pcb->pcb_high_fp;
	}
	save_callee_saved(&mc->mc_preserved);
	save_callee_saved_fp(&mc->mc_preserved_fp);
	return (error);
}

int
set_mcontext(struct thread *td, const mcontext_t *mc)
{
	struct _special s;
	struct trapframe *tf;
	uint64_t psrmask;

	tf = td->td_frame;

	KASSERT((tf->tf_special.ndirty & ~PAGE_MASK) == 0,
	    ("Whoa there! We have more than 8KB of dirty registers!"));

	s = mc->mc_special;
	/*
	 * Only copy the user mask and the restart instruction bit from
	 * the new context.
	 */
	psrmask = IA64_PSR_BE | IA64_PSR_UP | IA64_PSR_AC | IA64_PSR_MFL |
	    IA64_PSR_MFH | IA64_PSR_RI;
	s.psr = (tf->tf_special.psr & ~psrmask) | (s.psr & psrmask);
	/* We don't have any dirty registers of the new context. */
	s.ndirty = 0;
	if (mc->mc_flags & _MC_FLAGS_ASYNC_CONTEXT) {
		/*
		 * We can get an async context passed to us while we
		 * entered the kernel through a syscall: sigreturn(2)
		 * takes contexts that could previously be the result of
		 * a trap or interrupt.
		 * Hence, we cannot assert that the trapframe is not
		 * a syscall frame, but we can assert that it's at
		 * least an expected syscall.
		 */
		if (tf->tf_flags & FRAME_SYSCALL) {
			KASSERT(tf->tf_scratch.gr15 == SYS_sigreturn, ("foo"));
			tf->tf_flags &= ~FRAME_SYSCALL;
		}
		tf->tf_scratch = mc->mc_scratch;
		tf->tf_scratch_fp = mc->mc_scratch_fp;
		if (mc->mc_flags & _MC_FLAGS_HIGHFP_VALID)
			td->td_pcb->pcb_high_fp = mc->mc_high_fp;
	} else {
		KASSERT((tf->tf_flags & FRAME_SYSCALL) != 0, ("foo"));
		if ((mc->mc_flags & _MC_FLAGS_SYSCALL_CONTEXT) == 0) {
			s.cfm = tf->tf_special.cfm;
			s.iip = tf->tf_special.iip;
			tf->tf_scratch.gr15 = 0;	/* Clear syscall nr. */
		} else
			tf->tf_scratch = mc->mc_scratch;
	}
	tf->tf_special = s;
	restore_callee_saved(&mc->mc_preserved);
	restore_callee_saved_fp(&mc->mc_preserved_fp);

	return (0);
}

/*
 * Clear registers on exec.
 */
void
exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
{
	struct trapframe *tf;
	uint64_t *ksttop, *kst;

	tf = td->td_frame;
	ksttop = (uint64_t*)(td->td_kstack + tf->tf_special.ndirty +
	    (tf->tf_special.bspstore & 0x1ffUL));

	/*
	 * We can ignore up to 8KB of dirty registers by masking off the
	 * lower 13 bits in exception_restore() or epc_syscall(). This
	 * should be enough for a couple of years, but if there are more
	 * than 8KB of dirty registers, we lose track of the bottom of
	 * the kernel stack. The solution is to copy the active part of
	 * the kernel stack down 1 page (or 2, but not more than that)
	 * so that we always have less than 8KB of dirty registers.
	 */
	KASSERT((tf->tf_special.ndirty & ~PAGE_MASK) == 0,
	    ("Whoa there! We have more than 8KB of dirty registers!"));

	bzero(&tf->tf_special, sizeof(tf->tf_special));
	if ((tf->tf_flags & FRAME_SYSCALL) == 0) {	/* break syscalls. */
		bzero(&tf->tf_scratch, sizeof(tf->tf_scratch));
		bzero(&tf->tf_scratch_fp, sizeof(tf->tf_scratch_fp));
		tf->tf_special.cfm = (1UL<<63) | (3UL<<7) | 3UL;
		tf->tf_special.bspstore = IA64_BACKINGSTORE;
		/*
		 * Copy the arguments onto the kernel register stack so that
		 * they get loaded by the loadrs instruction. Skip over the
		 * NaT collection points.
		 */
		kst = ksttop - 1;
		if (((uintptr_t)kst & 0x1ff) == 0x1f8)
			*kst-- = 0;
		*kst-- = 0;
		if (((uintptr_t)kst & 0x1ff) == 0x1f8)
			*kst-- = 0;
		*kst-- = imgp->ps_strings;
		if (((uintptr_t)kst & 0x1ff) == 0x1f8)
			*kst-- = 0;
		*kst = stack;
		tf->tf_special.ndirty = (ksttop - kst) << 3;
	} else {				/* epc syscalls (default). */
		tf->tf_special.cfm = (3UL<<62) | (3UL<<7) | 3UL;
		tf->tf_special.bspstore = IA64_BACKINGSTORE + 24;
		/*
		 * Write values for out0, out1 and out2 to the user's backing
		 * store and arrange for them to be restored into the user's
		 * initial register frame.
		 * Assumes that (bspstore & 0x1f8) < 0x1e0.
		 */
		suword((caddr_t)tf->tf_special.bspstore - 24, stack);
		suword((caddr_t)tf->tf_special.bspstore - 16, imgp->ps_strings);
		suword((caddr_t)tf->tf_special.bspstore -  8, 0);
	}

	tf->tf_special.iip = imgp->entry_addr;
	tf->tf_special.sp = (stack & ~15) - 16;
	tf->tf_special.rsc = 0xf;
	tf->tf_special.fpsr = IA64_FPSR_DEFAULT;
	tf->tf_special.psr = IA64_PSR_IC | IA64_PSR_I | IA64_PSR_IT |
	    IA64_PSR_DT | IA64_PSR_RT | IA64_PSR_DFH | IA64_PSR_BN |
	    IA64_PSR_CPL_USER;
}

int
ptrace_set_pc(struct thread *td, unsigned long addr)
{
	uint64_t slot;

	switch (addr & 0xFUL) {
	case 0:
		slot = IA64_PSR_RI_0;
		break;
	case 1:
		/* XXX we need to deal with MLX bundles here */
		slot = IA64_PSR_RI_1;
		break;
	case 2:
		slot = IA64_PSR_RI_2;
		break;
	default:
		return (EINVAL);
	}

	td->td_frame->tf_special.iip = addr & ~0x0FULL;
	td->td_frame->tf_special.psr =
	    (td->td_frame->tf_special.psr & ~IA64_PSR_RI) | slot;
	return (0);
}

int
ptrace_single_step(struct thread *td)
{
	struct trapframe *tf;

	/*
	 * There's no way to set single stepping when we're leaving the
	 * kernel through the EPC syscall path. The way we solve this is
	 * by enabling the lower-privilege trap so that we re-enter the
	 * kernel as soon as the privilege level changes. See trap.c for
	 * how we proceed from there.
	 */
	tf = td->td_frame;
	if (tf->tf_flags & FRAME_SYSCALL)
		tf->tf_special.psr |= IA64_PSR_LP;
	else
		tf->tf_special.psr |= IA64_PSR_SS;
	return (0);
}

int
ptrace_clear_single_step(struct thread *td)
{
	struct trapframe *tf;

	/*
	 * Clear any and all status bits we may use to implement single
	 * stepping.
	 */
	tf = td->td_frame;
	tf->tf_special.psr &= ~IA64_PSR_SS;
	tf->tf_special.psr &= ~IA64_PSR_LP;
	tf->tf_special.psr &= ~IA64_PSR_TB;
	return (0);
}

int
fill_regs(struct thread *td, struct reg *regs)
{
	struct trapframe *tf;

	tf = td->td_frame;
	regs->r_special = tf->tf_special;
	regs->r_scratch = tf->tf_scratch;
	save_callee_saved(&regs->r_preserved);
	return (0);
}

int
set_regs(struct thread *td, struct reg *regs)
{
	struct trapframe *tf;
	int error;

	tf = td->td_frame;
	error = ia64_flush_dirty(td, &tf->tf_special);
	if (!error) {
		tf->tf_special = regs->r_special;
		tf->tf_special.bspstore += tf->tf_special.ndirty;
		tf->tf_special.ndirty = 0;
		tf->tf_scratch = regs->r_scratch;
		restore_callee_saved(&regs->r_preserved);
	}
	return (error);
}

int
fill_dbregs(struct thread *td, struct dbreg *dbregs)
{

	return (ENOSYS);
}

int
set_dbregs(struct thread *td, struct dbreg *dbregs)
{

	return (ENOSYS);
}

int
fill_fpregs(struct thread *td, struct fpreg *fpregs)
{
	struct trapframe *frame = td->td_frame;
	struct pcb *pcb = td->td_pcb;

	/* Save the high FP registers. */
	ia64_highfp_save(td);

	fpregs->fpr_scratch = frame->tf_scratch_fp;
	save_callee_saved_fp(&fpregs->fpr_preserved);
	fpregs->fpr_high = pcb->pcb_high_fp;
	return (0);
}

int
set_fpregs(struct thread *td, struct fpreg *fpregs)
{
	struct trapframe *frame = td->td_frame;
	struct pcb *pcb = td->td_pcb;

	/* Throw away the high FP registers (should be redundant). */
	ia64_highfp_drop(td);

	frame->tf_scratch_fp = fpregs->fpr_scratch;
	restore_callee_saved_fp(&fpregs->fpr_preserved);
	pcb->pcb_high_fp = fpregs->fpr_high;
	return (0);
}

void
ia64_sync_icache(vm_offset_t va, vm_offset_t sz)
{
	vm_offset_t lim;

	if (!ia64_sync_icache_needed)
		return;

	lim = va + sz;
	while (va < lim) {
		ia64_fc_i(va);
		va += 32;	/* XXX */
	}

	ia64_sync_i();
	ia64_srlz_i();
}
1541