/*-
 * Copyright (c) 2003,2004 Marcel Moolenaar
 * Copyright (c) 2000,2001 Doug Rabson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/sys/ia64/ia64/machdep.c 278412 2015-02-08 22:17:20Z peter $");

#include "opt_compat.h"
#include "opt_ddb.h"
#include "opt_kstack_pages.h"
#include "opt_sched.h"
#include "opt_xtrace.h"

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/bus.h>
#include <sys/cons.h>
#include <sys/cpu.h>
#include <sys/efi.h>
#include <sys/eventhandler.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/msgbuf.h>
#include <sys/pcpu.h>
#include <sys/ptrace.h>
#include <sys/random.h>
#include <sys/reboot.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#include <sys/syscall.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/ucontext.h>
#include <sys/uio.h>
#include <sys/uuid.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <ddb/ddb.h>

#include <net/netisr.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>

#include <machine/bootinfo.h>
#include <machine/cpu.h>
#include <machine/elf.h>
#include <machine/fpu.h>
#include <machine/intr.h>
#include <machine/kdb.h>
#include <machine/mca.h>
#include <machine/md_var.h>
#include <machine/pal.h>
#include <machine/pcb.h>
#include <machine/reg.h>
#include <machine/sal.h>
#include <machine/sigframe.h>
#ifdef SMP
#include <machine/smp.h>
#endif
#include <machine/unwind.h>
#include <machine/vmparam.h>

/*
 * For atomicity reasons, we demand that pc_curthread is the first
 * field in the struct pcpu. It allows us to read the pointer with
 * a single atomic instruction:
 *	ld8 %curthread = [r13]
 * Otherwise we would first have to calculate the load address, store
 * it in a temporary register and then use that for the load:
 *	add %temp = %offsetof(struct pcpu), r13
 *	ld8 %curthread = [%temp]
 * A context switch in between the add and the ld8 could have the
 * thread migrate to a different core. In that case, %curthread
 * would be the thread running on the original core and not actually
 * the current thread.
 */
CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);

static SYSCTL_NODE(_hw, OID_AUTO, freq, CTLFLAG_RD, 0, "");
static SYSCTL_NODE(_machdep, OID_AUTO, cpu, CTLFLAG_RD, 0, "");

static u_int bus_freq;
SYSCTL_UINT(_hw_freq, OID_AUTO, bus, CTLFLAG_RD, &bus_freq, 0,
    "Bus clock frequency");

static u_int cpu_freq;
SYSCTL_UINT(_hw_freq, OID_AUTO, cpu, CTLFLAG_RD, &cpu_freq, 0,
    "CPU clock frequency");

static u_int itc_freq;
SYSCTL_UINT(_hw_freq, OID_AUTO, itc, CTLFLAG_RD, &itc_freq, 0,
    "ITC frequency");

int cold = 1;
int unmapped_buf_allowed = 0;

struct bootinfo *bootinfo;

struct pcpu pcpu0;

extern u_int64_t kernel_text[], _end[];

extern u_int64_t ia64_gateway_page[];
extern u_int64_t break_sigtramp[];
extern u_int64_t epc_sigtramp[];

struct fpswa_iface *fpswa_iface;

vm_size_t ia64_pal_size;
vm_paddr_t ia64_pal_base;
vm_offset_t ia64_port_base;

u_int64_t ia64_lapic_addr = PAL_PIB_DEFAULT_ADDR;

struct ia64_pib *ia64_pib;

static int ia64_sync_icache_needed;

char machine[] = MACHINE;
SYSCTL_STRING(_hw, HW_MACHINE, machine, CTLFLAG_RD, machine, 0, "");

static char cpu_model[64];
SYSCTL_STRING(_hw, HW_MODEL, model, CTLFLAG_RD, cpu_model, 0,
    "The CPU model name");

static char cpu_family[64];
SYSCTL_STRING(_hw, OID_AUTO, family, CTLFLAG_RD, cpu_family, 0,
    "The CPU family name");

#ifdef DDB
extern vm_offset_t ksym_start, ksym_end;
#endif

struct msgbuf *msgbufp = NULL;

/* Other subsystems (e.g., ACPI) can hook this later. */
void (*cpu_idle_hook)(sbintime_t) = NULL;

struct kva_md_info kmi;

static void
identifycpu(void)
{
	char vendor[17];
	char *family_name, *model_name;
	u_int64_t features, tmp;
	int number, revision, model, family, archrev;

	/*
	 * Assumes little-endian.
	 */
	*(u_int64_t *) &vendor[0] = ia64_get_cpuid(0);
	*(u_int64_t *) &vendor[8] = ia64_get_cpuid(1);
	vendor[16] = '\0';

	tmp = ia64_get_cpuid(3);
	number = (tmp >> 0) & 0xff;
	revision = (tmp >> 8) & 0xff;
	model = (tmp >> 16) & 0xff;
	family = (tmp >> 24) & 0xff;
	archrev = (tmp >> 32) & 0xff;

	family_name = model_name = "unknown";
	switch (family) {
	case 0x07:
		family_name = "Itanium";
		model_name = "Merced";
		break;
	case 0x1f:
		family_name = "Itanium 2";
		switch (model) {
		case 0x00:
			model_name = "McKinley";
			break;
		case 0x01:
			/*
			 * Deerfield is a low-voltage variant based on the
			 * Madison core. We need circumstantial evidence
			 * (i.e. the clock frequency) to identify those.
			 * Allow for roughly 1% error margin.
			 */
			if (cpu_freq > 990 && cpu_freq < 1010)
				model_name = "Deerfield";
			else
				model_name = "Madison";
			break;
		case 0x02:
			model_name = "Madison II";
			break;
		}
		break;
	case 0x20:
		ia64_sync_icache_needed = 1;

		family_name = "Itanium 2";
		switch (model) {
		case 0x00:
			model_name = "Montecito";
			break;
		case 0x01:
			model_name = "Montvale";
			break;
		}
		break;
	}
	snprintf(cpu_family, sizeof(cpu_family), "%s", family_name);
	snprintf(cpu_model, sizeof(cpu_model), "%s", model_name);

	features = ia64_get_cpuid(4);

	printf("CPU: %s (", model_name);
	if (cpu_freq)
		printf("%u MHz ", cpu_freq);
	printf("%s)\n", family_name);
	printf("  Origin = \"%s\"  Revision = %d\n", vendor, revision);
	printf("  Features = 0x%b\n", (u_int32_t) features,
	    "\020"
	    "\001LB"	/* long branch (brl) instruction. */
	    "\002SD"	/* Spontaneous deferral. */
	    "\003AO"	/* 16-byte atomic operations (ld, st, cmpxchg). */ );
}

static void
cpu_startup(void *dummy)
{
	char nodename[16];
	struct pcpu *pc;
	struct pcpu_stats *pcs;

	/*
	 * Good {morning,afternoon,evening,night}.
	 */
	identifycpu();

#ifdef PERFMON
	perfmon_init();
#endif
	printf("real memory  = %ld (%ld MB)\n", ptoa(realmem),
	    ptoa(realmem) / 1048576);

	vm_ksubmap_init(&kmi);

	printf("avail memory = %ld (%ld MB)\n", ptoa(cnt.v_free_count),
	    ptoa(cnt.v_free_count) / 1048576);

	if (fpswa_iface == NULL)
		printf("Warning: no FPSWA package supplied\n");
	else
		printf("FPSWA Revision = 0x%lx, Entry = %p\n",
		    (long)fpswa_iface->if_rev, (void *)fpswa_iface->if_fpswa);

	/*
	 * Set up buffers, so they can be used to read disk labels.
	 */
	bufinit();
	vm_pager_bufferinit();

	/*
	 * Traverse the MADT to discover IOSAPIC and Local SAPIC
	 * information.
	 */
	ia64_probe_sapics();
	ia64_pib = pmap_mapdev(ia64_lapic_addr, sizeof(*ia64_pib));

	ia64_mca_init();

	/*
	 * Create sysctl tree for per-CPU information.
	 */
	STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
		snprintf(nodename, sizeof(nodename), "%u", pc->pc_cpuid);
		sysctl_ctx_init(&pc->pc_md.sysctl_ctx);
		pc->pc_md.sysctl_tree = SYSCTL_ADD_NODE(&pc->pc_md.sysctl_ctx,
		    SYSCTL_STATIC_CHILDREN(_machdep_cpu), OID_AUTO, nodename,
		    CTLFLAG_RD, NULL, "");
		if (pc->pc_md.sysctl_tree == NULL)
			continue;

		pcs = &pc->pc_md.stats;

		SYSCTL_ADD_ULONG(&pc->pc_md.sysctl_ctx,
		    SYSCTL_CHILDREN(pc->pc_md.sysctl_tree), OID_AUTO,
		    "nasts", CTLFLAG_RD, &pcs->pcs_nasts,
		    "Number of IPI_AST interrupts");

		SYSCTL_ADD_ULONG(&pc->pc_md.sysctl_ctx,
		    SYSCTL_CHILDREN(pc->pc_md.sysctl_tree), OID_AUTO,
		    "nclks", CTLFLAG_RD, &pcs->pcs_nclks,
		    "Number of clock interrupts");

		SYSCTL_ADD_ULONG(&pc->pc_md.sysctl_ctx,
		    SYSCTL_CHILDREN(pc->pc_md.sysctl_tree), OID_AUTO,
		    "nextints", CTLFLAG_RD, &pcs->pcs_nextints,
		    "Number of ExtINT interrupts");

		SYSCTL_ADD_ULONG(&pc->pc_md.sysctl_ctx,
		    SYSCTL_CHILDREN(pc->pc_md.sysctl_tree), OID_AUTO,
		    "nhardclocks", CTLFLAG_RD, &pcs->pcs_nhardclocks,
		    "Number of IPI_HARDCLOCK interrupts");

		SYSCTL_ADD_ULONG(&pc->pc_md.sysctl_ctx,
		    SYSCTL_CHILDREN(pc->pc_md.sysctl_tree), OID_AUTO,
		    "nhighfps", CTLFLAG_RD, &pcs->pcs_nhighfps,
		    "Number of IPI_HIGH_FP interrupts");

		SYSCTL_ADD_ULONG(&pc->pc_md.sysctl_ctx,
		    SYSCTL_CHILDREN(pc->pc_md.sysctl_tree), OID_AUTO,
		    "nhwints", CTLFLAG_RD, &pcs->pcs_nhwints,
		    "Number of hardware (device) interrupts");

		SYSCTL_ADD_ULONG(&pc->pc_md.sysctl_ctx,
		    SYSCTL_CHILDREN(pc->pc_md.sysctl_tree), OID_AUTO,
		    "npreempts", CTLFLAG_RD, &pcs->pcs_npreempts,
		    "Number of IPI_PREEMPT interrupts");

		SYSCTL_ADD_ULONG(&pc->pc_md.sysctl_ctx,
		    SYSCTL_CHILDREN(pc->pc_md.sysctl_tree), OID_AUTO,
		    "nrdvs", CTLFLAG_RD, &pcs->pcs_nrdvs,
		    "Number of IPI_RENDEZVOUS interrupts");

		SYSCTL_ADD_ULONG(&pc->pc_md.sysctl_ctx,
		    SYSCTL_CHILDREN(pc->pc_md.sysctl_tree), OID_AUTO,
		    "nstops", CTLFLAG_RD, &pcs->pcs_nstops,
		    "Number of IPI_STOP interrupts");

		SYSCTL_ADD_ULONG(&pc->pc_md.sysctl_ctx,
		    SYSCTL_CHILDREN(pc->pc_md.sysctl_tree), OID_AUTO,
		    "nstrays", CTLFLAG_RD, &pcs->pcs_nstrays,
		    "Number of stray interrupts");
	}
}
SYSINIT(cpu_startup, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);

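/*
 * Write back the data cache lines covering [ptr, ptr + len) with the
 * fc instruction. The 32-byte stride assumes 32 bytes as the smallest
 * line size; CPUs with larger lines just see some lines flushed more
 * than once.
 */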
void
cpu_flush_dcache(void *ptr, size_t len)
{
	vm_offset_t lim, va;

	va = (uintptr_t)ptr & ~31;
	lim = (uintptr_t)ptr + len;
	while (va < lim) {
		ia64_fc(va);
		va += 32;
	}

	ia64_srlz_d();
}

/* Get current clock frequency for the given cpu id. */
int
cpu_est_clockrate(int cpu_id, uint64_t *rate)
{

	if (pcpu_find(cpu_id) == NULL || rate == NULL)
		return (EINVAL);
	*rate = (u_long)cpu_freq * 1000000ul;
	return (0);
}

void
cpu_halt()
{

	efi_reset_system();
}

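/*
 * Idle the CPU. When not busy-polling, bracket the idle period with
 * cpu_idleclock()/cpu_activeclock() so the event timer code can skip
 * ticks. Prefer cpu_idle_hook (installed by e.g. ACPI) when present;
 * otherwise halt via PAL_HALT_LIGHT until the next interrupt.
 */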
void
cpu_idle(int busy)
{
	register_t ie;
	sbintime_t sbt = -1;

	if (!busy) {
		critical_enter();
		sbt = cpu_idleclock();
	}

	ie = intr_disable();
	KASSERT(ie != 0, ("%s called with interrupts disabled\n", __func__));

	if (sched_runnable())
		ia64_enable_intr();
	else if (cpu_idle_hook != NULL) {
		(*cpu_idle_hook)(sbt);
		/* The hook must enable interrupts! */
	} else {
		ia64_call_pal_static(PAL_HALT_LIGHT, 0, 0, 0);
		ia64_enable_intr();
	}

	if (!busy) {
		cpu_activeclock();
		critical_exit();
	}
}

int
cpu_idle_wakeup(int cpu)
{

	return (0);
}

void
cpu_reset()
{

	efi_reset_system();
}

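/*
 * Switch to the new thread. Note the setjmp(3)-like protocol below:
 * savectx() returns 0 on this, the saving, pass and the block is only
 * executed on the way out of the old thread; when the old thread is
 * later resumed by a restorectx() of its PCB, control comes back here
 * with a non-zero return value and cpu_switch() simply returns.
 */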
void
cpu_switch(struct thread *old, struct thread *new, struct mtx *mtx)
{
	struct pcb *oldpcb, *newpcb;

	oldpcb = old->td_pcb;
#ifdef COMPAT_FREEBSD32
	ia32_savectx(oldpcb);
#endif
	if (pcpup->pc_fpcurthread == old)
		old->td_frame->tf_special.psr |= IA64_PSR_DFH;
	if (!savectx(oldpcb)) {
		newpcb = new->td_pcb;
		oldpcb->pcb_current_pmap =
		    pmap_switch(newpcb->pcb_current_pmap);

		ia64_mf();

		atomic_store_rel_ptr(&old->td_lock, mtx);

#if defined(SCHED_ULE) && defined(SMP)
		while (atomic_load_acq_ptr(&new->td_lock) == &blocked_lock)
			cpu_spinwait();
#endif

		pcpup->pc_curthread = new;

#ifdef COMPAT_FREEBSD32
		ia32_restorectx(newpcb);
#endif

		if (pcpup->pc_fpcurthread == new)
			new->td_frame->tf_special.psr &= ~IA64_PSR_DFH;
		restorectx(newpcb);
		/* We should not get here. */
		panic("cpu_switch: restorectx() returned");
		/* NOTREACHED */
	}
}

void
cpu_throw(struct thread *old __unused, struct thread *new)
{
	struct pcb *newpcb;

	newpcb = new->td_pcb;
	(void)pmap_switch(newpcb->pcb_current_pmap);

#if defined(SCHED_ULE) && defined(SMP)
	while (atomic_load_acq_ptr(&new->td_lock) == &blocked_lock)
		cpu_spinwait();
#endif

	pcpup->pc_curthread = new;

#ifdef COMPAT_FREEBSD32
	ia32_restorectx(newpcb);
#endif

	restorectx(newpcb);
	/* We should not get here. */
	panic("cpu_throw: restorectx() returned");
	/* NOTREACHED */
}

void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{

	/*
	 * Set pc_acpi_id to "uninitialized".
	 * See sys/dev/acpica/acpi_cpu.c
	 */
	pcpu->pc_acpi_id = 0xffffffff;
}

void
cpu_pcpu_setup(struct pcpu *pc, u_int acpi_id, u_int sapic_id)
{

	pc->pc_acpi_id = acpi_id;
	pc->pc_md.lid = IA64_LID_SET_SAPIC_ID(sapic_id);
}

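/*
 * The outermost spinlock a thread acquires disables interrupts and
 * saves the previous interrupt state; nesting is tracked with a
 * per-thread count so that the state is only restored when the last
 * spinlock is released.
 */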
void
spinlock_enter(void)
{
	struct thread *td;
	int intr;

	td = curthread;
	if (td->td_md.md_spinlock_count == 0) {
		intr = intr_disable();
		td->td_md.md_spinlock_count = 1;
		td->td_md.md_saved_intr = intr;
	} else
		td->td_md.md_spinlock_count++;
	critical_enter();
}

void
spinlock_exit(void)
{
	struct thread *td;
	int intr;

	td = curthread;
	critical_exit();
	intr = td->td_md.md_saved_intr;
	td->td_md.md_spinlock_count--;
	if (td->td_md.md_spinlock_count == 0)
		intr_restore(intr);
}

void
kdb_cpu_trap(int vector, int code __unused)
{

#ifdef XTRACE
	ia64_xtrace_stop();
#endif
	__asm __volatile("flushrs;;");

	/* Restart after the break instruction. */
	if (vector == IA64_VEC_BREAK &&
	    kdb_frame->tf_special.ifa == IA64_FIXED_BREAK)
		kdb_frame->tf_special.psr += IA64_PSR_RI_1;
}

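/*
 * Pin the VHPT with a data translation register (dtr[3]). The
 * insertion is done with PSR.ic and PSR.i cleared so that neither a
 * TLB miss nor an interrupt can be taken while the translation is in
 * a half-initialized state.
 */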
void
map_vhpt(uintptr_t vhpt)
{
	pt_entry_t pte;
	uint64_t psr;

	pte = PTE_PRESENT | PTE_MA_WB | PTE_ACCESSED | PTE_DIRTY |
	    PTE_PL_KERN | PTE_AR_RW;
	pte |= vhpt & PTE_PPN_MASK;

	__asm __volatile("ptr.d %0,%1" :: "r"(vhpt),
	    "r"(pmap_vhpt_log2size << 2));

	__asm __volatile("mov   %0=psr" : "=r"(psr));
	__asm __volatile("rsm   psr.ic|psr.i");
	ia64_srlz_i();
	ia64_set_ifa(vhpt);
	ia64_set_itir(pmap_vhpt_log2size << 2);
	ia64_srlz_d();
	__asm __volatile("itr.d dtr[%0]=%1" :: "r"(3), "r"(pte));
	__asm __volatile("mov   psr.l=%0" :: "r" (psr));
	ia64_srlz_i();
}

void
map_pal_code(void)
{
	pt_entry_t pte;
	vm_offset_t va;
	vm_size_t sz;
	uint64_t psr;
	u_int shft;

	if (ia64_pal_size == 0)
		return;

	va = IA64_PHYS_TO_RR7(ia64_pal_base);

	sz = ia64_pal_size;
	shft = 0;
	while (sz > 1) {
		shft++;
		sz >>= 1;
	}

	pte = PTE_PRESENT | PTE_MA_WB | PTE_ACCESSED | PTE_DIRTY |
	    PTE_PL_KERN | PTE_AR_RWX;
	pte |= ia64_pal_base & PTE_PPN_MASK;

	__asm __volatile("ptr.d %0,%1; ptr.i %0,%1" :: "r"(va), "r"(shft<<2));

	__asm __volatile("mov	%0=psr" : "=r"(psr));
	__asm __volatile("rsm	psr.ic|psr.i");
	ia64_srlz_i();
	ia64_set_ifa(va);
	ia64_set_itir(shft << 2);
	ia64_srlz_d();
	__asm __volatile("itr.d	dtr[%0]=%1" :: "r"(4), "r"(pte));
	ia64_srlz_d();
	__asm __volatile("itr.i	itr[%0]=%1" :: "r"(1), "r"(pte));
	__asm __volatile("mov	psr.l=%0" :: "r" (psr));
	ia64_srlz_i();
}

void
map_gateway_page(void)
{
	pt_entry_t pte;
	uint64_t psr;

	pte = PTE_PRESENT | PTE_MA_WB | PTE_ACCESSED | PTE_DIRTY |
	    PTE_PL_KERN | PTE_AR_X_RX;
	pte |= ia64_tpa((uint64_t)ia64_gateway_page) & PTE_PPN_MASK;

	__asm __volatile("ptr.d %0,%1; ptr.i %0,%1" ::
	    "r"(VM_MAXUSER_ADDRESS), "r"(PAGE_SHIFT << 2));

	__asm __volatile("mov	%0=psr" : "=r"(psr));
	__asm __volatile("rsm	psr.ic|psr.i");
	ia64_srlz_i();
	ia64_set_ifa(VM_MAXUSER_ADDRESS);
	ia64_set_itir(PAGE_SHIFT << 2);
	ia64_srlz_d();
	__asm __volatile("itr.d	dtr[%0]=%1" :: "r"(5), "r"(pte));
	ia64_srlz_d();
	__asm __volatile("itr.i	itr[%0]=%1" :: "r"(2), "r"(pte));
	__asm __volatile("mov	psr.l=%0" :: "r" (psr));
	ia64_srlz_i();

	/* Expose the mapping to userland in ar.k5 */
	ia64_set_k5(VM_MAXUSER_ADDRESS);
}

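/*
 * Convert a firmware frequency ratio to MHz: the numerator lives in
 * the upper 32 bits of 'ratio' and the denominator in the lower 32
 * bits. The result is rounded to the nearest MHz.
 */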
static u_int
freq_ratio(u_long base, u_long ratio)
{
	u_long f;

	f = (base * (ratio >> 32)) / (ratio & 0xfffffffful);
	return ((f + 500000) / 1000000);
}

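/*
 * Obtain the CPU, bus and ITC frequencies from the firmware:
 * SAL_FREQ_BASE returns the platform clock in Hz and PAL_FREQ_RATIOS
 * the processor, bus and ITC ratios relative to that base clock.
 */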
static void
calculate_frequencies(void)
{
	struct ia64_sal_result sal;
	struct ia64_pal_result pal;
	register_t ie;

	ie = intr_disable();
	sal = ia64_sal_entry(SAL_FREQ_BASE, 0, 0, 0, 0, 0, 0, 0);
	pal = ia64_call_pal_static(PAL_FREQ_RATIOS, 0, 0, 0);
	intr_restore(ie);

	if (sal.sal_status == 0 && pal.pal_status == 0) {
		if (bootverbose) {
			printf("Platform clock frequency %ld Hz\n",
			       sal.sal_result[0]);
			printf("Processor ratio %ld/%ld, Bus ratio %ld/%ld, "
			       "ITC ratio %ld/%ld\n",
			       pal.pal_result[0] >> 32,
			       pal.pal_result[0] & ((1L << 32) - 1),
			       pal.pal_result[1] >> 32,
			       pal.pal_result[1] & ((1L << 32) - 1),
			       pal.pal_result[2] >> 32,
			       pal.pal_result[2] & ((1L << 32) - 1));
		}
		cpu_freq = freq_ratio(sal.sal_result[0], pal.pal_result[0]);
		bus_freq = freq_ratio(sal.sal_result[0], pal.pal_result[1]);
		itc_freq = freq_ratio(sal.sal_result[0], pal.pal_result[2]);
	}
}

struct ia64_init_return
ia64_init(void)
{
	struct ia64_init_return ret;
	struct efi_md *md;
	pt_entry_t *pbvm_pgtbl_ent, *pbvm_pgtbl_lim;
	char *p;
	vm_size_t mdlen;
	int metadata_missing;

	/*
	 * NO OUTPUT ALLOWED UNTIL FURTHER NOTICE.
	 */

	ia64_set_fpsr(IA64_FPSR_DEFAULT);

	/*
	 * Region 6 is direct mapped UC and region 7 is direct mapped
	 * WB. The details of this are controlled by the Alt {I,D}TLB
	 * handlers. Here we just make sure that they have the largest
	 * possible page size to minimise TLB usage.
	 */
	ia64_set_rr(IA64_RR_BASE(6), (6 << 8) | (LOG2_ID_PAGE_SIZE << 2));
	ia64_set_rr(IA64_RR_BASE(7), (7 << 8) | (LOG2_ID_PAGE_SIZE << 2));
	ia64_srlz_d();

	/* Initialize/setup physical memory data structures */
	ia64_physmem_init();

	/*
	 * Process the memory map. This gives us the PAL locations,
	 * the I/O port base address, and the available memory regions
	 * for initializing the physical memory map.
	 */
	for (md = efi_md_first(); md != NULL; md = efi_md_next(md)) {
		mdlen = md->md_pages * EFI_PAGE_SIZE;
		switch (md->md_type) {
		case EFI_MD_TYPE_IOPORT:
			ia64_port_base = pmap_mapdev_priv(md->md_phys,
			    mdlen, VM_MEMATTR_UNCACHEABLE);
			break;
		case EFI_MD_TYPE_PALCODE:
			ia64_pal_base = md->md_phys;
			ia64_pal_size = mdlen;
			/*FALLTHROUGH*/
		case EFI_MD_TYPE_BAD:
		case EFI_MD_TYPE_FIRMWARE:
		case EFI_MD_TYPE_RECLAIM:
		case EFI_MD_TYPE_RT_CODE:
		case EFI_MD_TYPE_RT_DATA:
			/* Don't use these memory regions. */
			ia64_physmem_track(md->md_phys, mdlen);
			break;
		case EFI_MD_TYPE_BS_CODE:
		case EFI_MD_TYPE_BS_DATA:
		case EFI_MD_TYPE_CODE:
		case EFI_MD_TYPE_DATA:
		case EFI_MD_TYPE_FREE:
			/* These are ok to use. */
			ia64_physmem_add(md->md_phys, mdlen);
			break;
		}
	}

	/*
	 * Remove the PBVM and its page table from phys_avail. The loader
	 * passes the physical address of the page table to us. The virtual
	 * address of the page table is fixed.
	 * Also track the PBVM limit for later use.
	 */
	ia64_physmem_delete(bootinfo->bi_pbvm_pgtbl, bootinfo->bi_pbvm_pgtblsz);
	pbvm_pgtbl_ent = (void *)IA64_PBVM_PGTBL;
	pbvm_pgtbl_lim = (void *)(IA64_PBVM_PGTBL + bootinfo->bi_pbvm_pgtblsz);
	while (pbvm_pgtbl_ent < pbvm_pgtbl_lim) {
		if ((*pbvm_pgtbl_ent & PTE_PRESENT) == 0)
			break;
		ia64_physmem_delete(*pbvm_pgtbl_ent & PTE_PPN_MASK,
		    IA64_PBVM_PAGE_SIZE);
		pbvm_pgtbl_ent++;
	}

	/* Finalize physical memory data structures */
	ia64_physmem_fini();

	metadata_missing = 0;
	if (bootinfo->bi_modulep)
		preload_metadata = (caddr_t)bootinfo->bi_modulep;
	else
		metadata_missing = 1;

	if (envmode == 0 && bootinfo->bi_envp)
		kern_envp = (caddr_t)bootinfo->bi_envp;
	else
		kern_envp = static_env;

	/*
	 * Look at arguments passed to us and compute boothowto.
	 */
	boothowto = bootinfo->bi_boothowto;

	if (boothowto & RB_VERBOSE)
		bootverbose = 1;

	/*
	 * Wire things up so we can call the firmware.
	 */
	map_pal_code();
	efi_boot_minimal(bootinfo->bi_systab);
	ia64_xiv_init();
	ia64_sal_init();
	calculate_frequencies();

	set_cputicker(ia64_get_itc, (u_long)itc_freq * 1000000, 0);

	/*
	 * Set up the PCPU data for the bootstrap processor. It is needed
	 * by printf(). Also, since printf() has critical sections, we
	 * need to initialize at least pc_curthread.
	 */
	pcpup = &pcpu0;
	ia64_set_k4((u_int64_t)pcpup);
	pcpu_init(pcpup, 0, sizeof(pcpu0));
	dpcpu_init(ia64_physmem_alloc(DPCPU_SIZE, PAGE_SIZE), 0);
	cpu_pcpu_setup(pcpup, ~0U, ia64_get_lid());
	pcpup->pc_curthread = &thread0;

	/*
	 * Initialize the console before we print anything out.
	 */
	cninit();

	/* OUTPUT NOW ALLOWED */

	if (metadata_missing)
		printf("WARNING: loader(8) metadata is missing!\n");

	/* Get FPSWA interface */
	fpswa_iface = (bootinfo->bi_fpswa == 0) ? NULL :
	    (struct fpswa_iface *)IA64_PHYS_TO_RR7(bootinfo->bi_fpswa);

	/* Init basic tunables, including hz */
	init_param1();

	p = getenv("kernelname");
	if (p != NULL) {
		strlcpy(kernelname, p, sizeof(kernelname));
		freeenv(p);
	}

	init_param2(physmem);

	/*
	 * Initialize error message buffer (at end of core).
	 */
	msgbufp = ia64_physmem_alloc(msgbufsize, PAGE_SIZE);
	msgbufinit(msgbufp, msgbufsize);

	proc_linkup0(&proc0, &thread0);
	/*
	 * Init mapping for kernel stack for proc 0
	 */
	p = ia64_physmem_alloc(KSTACK_PAGES * PAGE_SIZE, PAGE_SIZE);
	thread0.td_kstack = (uintptr_t)p;
	thread0.td_kstack_pages = KSTACK_PAGES;

	mutex_init();

	/*
	 * Initialize the rest of proc 0's PCB.
	 *
	 * Set the kernel sp, reserving space for an (empty) trapframe,
	 * and make proc0's trapframe pointer point to it for sanity.
	 * Initialise proc0's backing store to start after u area.
	 */
	cpu_thread_alloc(&thread0);
	thread0.td_frame->tf_flags = FRAME_SYSCALL;
	thread0.td_pcb->pcb_special.sp =
	    (u_int64_t)thread0.td_frame - 16;
	thread0.td_pcb->pcb_special.bspstore = thread0.td_kstack;

	/*
	 * Initialize the virtual memory system.
	 */
	pmap_bootstrap();

#ifdef XTRACE
	ia64_xtrace_init_bsp();
#endif

	/*
	 * Initialize debuggers, and break into them if appropriate.
	 */
#ifdef DDB
	ksym_start = bootinfo->bi_symtab;
	ksym_end = bootinfo->bi_esymtab;
#endif

	kdb_init();

#ifdef KDB
	if (boothowto & RB_KDB)
		kdb_enter(KDB_WHY_BOOTFLAGS,
		    "Boot flags requested debugger\n");
#endif

	ia64_set_tpr(0);
	ia64_srlz_d();

	ret.bspstore = thread0.td_pcb->pcb_special.bspstore;
	ret.sp = thread0.td_pcb->pcb_special.sp;
	return (ret);
}

uint64_t
ia64_get_hcdp(void)
{

	return (bootinfo->bi_hcdp);
}

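/*
 * MD bzero: zero byte-wise up to an 8-byte boundary, clear 8 words
 * (64 bytes) per iteration of the unrolled main loop, then finish
 * with word- and byte-sized tail loops.
 */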
void
bzero(void *buf, size_t len)
{
	caddr_t p = buf;

	while (((vm_offset_t) p & (sizeof(u_long) - 1)) && len) {
		*p++ = 0;
		len--;
	}
	while (len >= sizeof(u_long) * 8) {
		*(u_long*) p = 0;
		*((u_long*) p + 1) = 0;
		*((u_long*) p + 2) = 0;
		*((u_long*) p + 3) = 0;
		len -= sizeof(u_long) * 8;
		*((u_long*) p + 4) = 0;
		*((u_long*) p + 5) = 0;
		*((u_long*) p + 6) = 0;
		*((u_long*) p + 7) = 0;
		p += sizeof(u_long) * 8;
	}
	while (len >= sizeof(u_long)) {
		*(u_long*) p = 0;
		len -= sizeof(u_long);
		p += sizeof(u_long);
	}
	while (len) {
		*p++ = 0;
		len--;
	}
}

u_int
ia64_itc_freq(void)
{

	return (itc_freq);
}

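/*
 * Busy-wait for at least 'n' microseconds, using the ITC (itc_freq is
 * in MHz, so itc_freq * n is the tick count to wait). The thread is
 * pinned so that we keep reading the same CPU's ITC; the loop
 * condition also copes with the end value wrapping around.
 */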
void
DELAY(int n)
{
	u_int64_t start, end, now;

	sched_pin();

	start = ia64_get_itc();
	end = start + itc_freq * n;
	/* printf("DELAY from 0x%lx to 0x%lx\n", start, end); */
	do {
		now = ia64_get_itc();
	} while (now < end || (now > start && end < start));

	sched_unpin();
}

/*
 * Send an interrupt (signal) to a process.
 */
void
sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
	struct proc *p;
	struct thread *td;
	struct trapframe *tf;
	struct sigacts *psp;
	struct sigframe sf, *sfp;
	u_int64_t sbs, sp;
	int oonstack;
	int sig;
	u_long code;

	td = curthread;
	p = td->td_proc;
	PROC_LOCK_ASSERT(p, MA_OWNED);
	sig = ksi->ksi_signo;
	code = ksi->ksi_code;
	psp = p->p_sigacts;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
	tf = td->td_frame;
	sp = tf->tf_special.sp;
	oonstack = sigonstack(sp);
	sbs = 0;

	/* save user context */
	bzero(&sf, sizeof(struct sigframe));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = td->td_sigstk;
	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;

	/*
	 * Allocate and validate space for the signal handler
	 * context. Note that if the stack is in P0 space, the
	 * call to grow() is a nop, and the useracc() check
	 * will fail if the process has not already allocated
	 * the space with a `brk'.
	 */
	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		sbs = (u_int64_t)td->td_sigstk.ss_sp;
		sbs = (sbs + 15) & ~15;
		sfp = (struct sigframe *)(sbs + td->td_sigstk.ss_size);
#if defined(COMPAT_43)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
	} else
		sfp = (struct sigframe *)sp;
	sfp = (struct sigframe *)((u_int64_t)(sfp - 1) & ~15);

	/* Fill in the siginfo structure for POSIX handlers. */
	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		sf.sf_si = ksi->ksi_info;
		sf.sf_si.si_signo = sig;
		/*
		 * XXX this shouldn't be here after code in trap.c
		 * is fixed
		 */
		sf.sf_si.si_addr = (void*)tf->tf_special.ifa;
		code = (u_int64_t)&sfp->sf_si;
	}

	mtx_unlock(&psp->ps_mtx);
	PROC_UNLOCK(p);

	get_mcontext(td, &sf.sf_uc.uc_mcontext, 0);

	/* Copy the frame out to userland. */
	if (copyout(&sf, sfp, sizeof(sf)) != 0) {
		/*
		 * Process has trashed its stack; give it an illegal
		 * instruction to halt it in its tracks.
		 */
		PROC_LOCK(p);
		sigexit(td, SIGILL);
		return;
	}

	if ((tf->tf_flags & FRAME_SYSCALL) == 0) {
		tf->tf_special.psr &= ~IA64_PSR_RI;
		tf->tf_special.iip = ia64_get_k5() +
		    ((uint64_t)break_sigtramp - (uint64_t)ia64_gateway_page);
	} else
		tf->tf_special.iip = ia64_get_k5() +
		    ((uint64_t)epc_sigtramp - (uint64_t)ia64_gateway_page);

	/*
	 * Set up the trapframe to return to the signal trampoline. We pass
	 * information to the trampoline in the following registers:
	 *
	 *	gp	new backing store or NULL
	 *	r8	signal number
	 *	r9	signal code or siginfo pointer
	 *	r10	signal handler (function descriptor)
	 */
	tf->tf_special.sp = (u_int64_t)sfp - 16;
	tf->tf_special.gp = sbs;
	tf->tf_special.bspstore = sf.sf_uc.uc_mcontext.mc_special.bspstore;
	tf->tf_special.ndirty = 0;
	tf->tf_special.rnat = sf.sf_uc.uc_mcontext.mc_special.rnat;
	tf->tf_scratch.gr8 = sig;
	tf->tf_scratch.gr9 = code;
	tf->tf_scratch.gr10 = (u_int64_t)catcher;

	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}

/*
 * System call to clean up state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig. Check carefully to
 * make sure that the user has not modified the
 * state to gain improper privileges.
 *
 * MPSAFE
 */
int
sys_sigreturn(struct thread *td,
	struct sigreturn_args /* {
		ucontext_t *sigcntxp;
	} */ *uap)
{
	ucontext_t uc;
	struct trapframe *tf;
	struct pcb *pcb;

	tf = td->td_frame;
	pcb = td->td_pcb;

	/*
	 * Fetch the entire context structure at once for speed.
	 * We don't use a normal argument to simplify RSE handling.
	 */
	if (copyin(uap->sigcntxp, (caddr_t)&uc, sizeof(uc)))
		return (EFAULT);

	set_mcontext(td, &uc.uc_mcontext);

#if defined(COMPAT_43)
	if (sigonstack(tf->tf_special.sp))
		td->td_sigstk.ss_flags |= SS_ONSTACK;
	else
		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
#endif
	kern_sigprocmask(td, SIG_SETMASK, &uc.uc_sigmask, NULL, 0);

	return (EJUSTRETURN);
}

#ifdef COMPAT_FREEBSD4
int
freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
{

	return sys_sigreturn(td, (struct sigreturn_args *)uap);
}
#endif

/*
 * Construct a PCB from a trapframe. This is called from kdb_trap() where
 * we want to start a backtrace from the function that caused us to enter
 * the debugger. We have the context in the trapframe, but base the trace
 * on the PCB. The PCB doesn't have to be perfect, as long as it contains
 * enough for a backtrace.
 */
void
makectx(struct trapframe *tf, struct pcb *pcb)
{

	pcb->pcb_special = tf->tf_special;
	pcb->pcb_special.__spare = ~0UL;	/* XXX see unwind.c */
	save_callee_saved(&pcb->pcb_preserved);
	save_callee_saved_fp(&pcb->pcb_preserved_fp);
}

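/*
 * Write the dirty stacked registers in the kernel RSE backing store
 * out to the thread's user backing store. For curthread this is done
 * in place (forcing a flushrs first if not everything has been
 * spilled yet); for other threads the registers are pushed out with
 * proc_rwmem(). On success, bspstore is advanced past the flushed
 * registers and ndirty is reset to 0.
 */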
int
ia64_flush_dirty(struct thread *td, struct _special *r)
{
	struct iovec iov;
	struct uio uio;
	uint64_t bspst, kstk, rnat;
	int error, locked;

	if (r->ndirty == 0)
		return (0);

	kstk = td->td_kstack + (r->bspstore & 0x1ffUL);
	if (td == curthread) {
		__asm __volatile("mov	ar.rsc=0;;");
		__asm __volatile("mov	%0=ar.bspstore" : "=r"(bspst));
		/* Make sure we have all the user registers written out. */
		if (bspst - kstk < r->ndirty) {
			__asm __volatile("flushrs;;");
			__asm __volatile("mov	%0=ar.bspstore" : "=r"(bspst));
		}
		__asm __volatile("mov	%0=ar.rnat;;" : "=r"(rnat));
		__asm __volatile("mov	ar.rsc=3");
		error = copyout((void*)kstk, (void*)r->bspstore, r->ndirty);
		kstk += r->ndirty;
		r->rnat = (bspst > kstk && (bspst & 0x1ffL) < (kstk & 0x1ffL))
		    ? *(uint64_t*)(kstk | 0x1f8L) : rnat;
	} else {
		locked = PROC_LOCKED(td->td_proc);
		if (!locked)
			PHOLD(td->td_proc);
		iov.iov_base = (void*)(uintptr_t)kstk;
		iov.iov_len = r->ndirty;
		uio.uio_iov = &iov;
		uio.uio_iovcnt = 1;
		uio.uio_offset = r->bspstore;
		uio.uio_resid = r->ndirty;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_rw = UIO_WRITE;
		uio.uio_td = td;
		error = proc_rwmem(td->td_proc, &uio);
		/*
		 * XXX proc_rwmem() doesn't currently return ENOSPC,
		 * so I think it can bogusly return 0. Neither do
		 * we allow short writes.
		 */
		if (uio.uio_resid != 0 && error == 0)
			error = ENOSPC;
		if (!locked)
			PRELE(td->td_proc);
	}

	r->bspstore += r->ndirty;
	r->ndirty = 0;
	return (error);
}

int
get_mcontext(struct thread *td, mcontext_t *mc, int flags)
{
	struct trapframe *tf;
	int error;

	tf = td->td_frame;
	bzero(mc, sizeof(*mc));
	mc->mc_special = tf->tf_special;
	error = ia64_flush_dirty(td, &mc->mc_special);
	if (tf->tf_flags & FRAME_SYSCALL) {
		mc->mc_flags |= _MC_FLAGS_SYSCALL_CONTEXT;
		mc->mc_scratch = tf->tf_scratch;
		if (flags & GET_MC_CLEAR_RET) {
			mc->mc_scratch.gr8 = 0;
			mc->mc_scratch.gr9 = 0;
			mc->mc_scratch.gr10 = 0;
			mc->mc_scratch.gr11 = 0;
		}
	} else {
		mc->mc_flags |= _MC_FLAGS_ASYNC_CONTEXT;
		mc->mc_scratch = tf->tf_scratch;
		mc->mc_scratch_fp = tf->tf_scratch_fp;
		/*
		 * XXX If the thread never used the high FP registers, we
		 * probably shouldn't waste time saving them.
		 */
		ia64_highfp_save(td);
		mc->mc_flags |= _MC_FLAGS_HIGHFP_VALID;
		mc->mc_high_fp = td->td_pcb->pcb_high_fp;
	}
	save_callee_saved(&mc->mc_preserved);
	save_callee_saved_fp(&mc->mc_preserved_fp);
	return (error);
}

int
set_mcontext(struct thread *td, mcontext_t *mc)
{
	struct _special s;
	struct trapframe *tf;
	uint64_t psrmask;

	tf = td->td_frame;

	KASSERT((tf->tf_special.ndirty & ~PAGE_MASK) == 0,
	    ("Whoa there! We have more than 8KB of dirty registers!"));

	s = mc->mc_special;
	/*
	 * Only copy the user mask and the restart instruction bit from
	 * the new context.
	 */
	psrmask = IA64_PSR_BE | IA64_PSR_UP | IA64_PSR_AC | IA64_PSR_MFL |
	    IA64_PSR_MFH | IA64_PSR_RI;
	s.psr = (tf->tf_special.psr & ~psrmask) | (s.psr & psrmask);
	/* We don't have any dirty registers of the new context. */
	s.ndirty = 0;
	if (mc->mc_flags & _MC_FLAGS_ASYNC_CONTEXT) {
		/*
		 * We can get an async context passed to us while we
		 * entered the kernel through a syscall: sigreturn(2)
		 * takes contexts that could previously be the result of
		 * a trap or interrupt.
		 * Hence, we cannot assert that the trapframe is not
		 * a syscall frame, but we can assert that it's at
		 * least an expected syscall.
		 */
		if (tf->tf_flags & FRAME_SYSCALL) {
			KASSERT(tf->tf_scratch.gr15 == SYS_sigreturn, ("foo"));
			tf->tf_flags &= ~FRAME_SYSCALL;
		}
		tf->tf_scratch = mc->mc_scratch;
		tf->tf_scratch_fp = mc->mc_scratch_fp;
		if (mc->mc_flags & _MC_FLAGS_HIGHFP_VALID)
			td->td_pcb->pcb_high_fp = mc->mc_high_fp;
	} else {
		KASSERT((tf->tf_flags & FRAME_SYSCALL) != 0, ("foo"));
		if ((mc->mc_flags & _MC_FLAGS_SYSCALL_CONTEXT) == 0) {
			s.cfm = tf->tf_special.cfm;
			s.iip = tf->tf_special.iip;
			tf->tf_scratch.gr15 = 0;	/* Clear syscall nr. */
		} else
			tf->tf_scratch = mc->mc_scratch;
	}
	tf->tf_special = s;
	restore_callee_saved(&mc->mc_preserved);
	restore_callee_saved_fp(&mc->mc_preserved_fp);

	return (0);
}

/*
 * Clear registers on exec.
 */
void
exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
{
	struct trapframe *tf;
	uint64_t *ksttop, *kst;

	tf = td->td_frame;
	ksttop = (uint64_t*)(td->td_kstack + tf->tf_special.ndirty +
	    (tf->tf_special.bspstore & 0x1ffUL));

	/*
	 * We can ignore up to 8KB of dirty registers by masking off the
	 * lower 13 bits in exception_restore() or epc_syscall(). This
	 * should be enough for a couple of years, but if there are more
	 * than 8KB of dirty registers, we lose track of the bottom of
	 * the kernel stack. The solution is to copy the active part of
	 * the kernel stack down 1 page (or 2, but not more than that)
	 * so that we always have less than 8KB of dirty registers.
	 */
	KASSERT((tf->tf_special.ndirty & ~PAGE_MASK) == 0,
	    ("Whoa there! We have more than 8KB of dirty registers!"));

	bzero(&tf->tf_special, sizeof(tf->tf_special));
	if ((tf->tf_flags & FRAME_SYSCALL) == 0) {	/* break syscalls. */
		bzero(&tf->tf_scratch, sizeof(tf->tf_scratch));
		bzero(&tf->tf_scratch_fp, sizeof(tf->tf_scratch_fp));
		tf->tf_special.cfm = (1UL<<63) | (3UL<<7) | 3UL;
		tf->tf_special.bspstore = IA64_BACKINGSTORE;
		/*
		 * Copy the arguments onto the kernel register stack so that
		 * they get loaded by the loadrs instruction. Skip over the
		 * NaT collection points.
		 */
		kst = ksttop - 1;
		if (((uintptr_t)kst & 0x1ff) == 0x1f8)
			*kst-- = 0;
		*kst-- = 0;
		if (((uintptr_t)kst & 0x1ff) == 0x1f8)
			*kst-- = 0;
		*kst-- = imgp->ps_strings;
		if (((uintptr_t)kst & 0x1ff) == 0x1f8)
			*kst-- = 0;
		*kst = stack;
		tf->tf_special.ndirty = (ksttop - kst) << 3;
	} else {				/* epc syscalls (default). */
		tf->tf_special.cfm = (3UL<<62) | (3UL<<7) | 3UL;
		tf->tf_special.bspstore = IA64_BACKINGSTORE + 24;
		/*
		 * Write values for out0, out1 and out2 to the user's backing
		 * store and arrange for them to be restored into the user's
		 * initial register frame.
		 * Assumes that (bspstore & 0x1f8) < 0x1e0.
		 */
		suword((caddr_t)tf->tf_special.bspstore - 24, stack);
		suword((caddr_t)tf->tf_special.bspstore - 16, imgp->ps_strings);
		suword((caddr_t)tf->tf_special.bspstore -  8, 0);
	}

	tf->tf_special.iip = imgp->entry_addr;
	tf->tf_special.sp = (stack & ~15) - 16;
	tf->tf_special.rsc = 0xf;
	tf->tf_special.fpsr = IA64_FPSR_DEFAULT;
	tf->tf_special.psr = IA64_PSR_IC | IA64_PSR_I | IA64_PSR_IT |
	    IA64_PSR_DT | IA64_PSR_RT | IA64_PSR_DFH | IA64_PSR_BN |
	    IA64_PSR_CPL_USER;
}

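/*
 * Set the PC for ptrace(2). On ia64 the low 4 bits of the address
 * select one of the three instruction slots in a 16-byte bundle: the
 * bundle address goes into IIP and the slot into the PSR.ri field.
 */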
int
ptrace_set_pc(struct thread *td, unsigned long addr)
{
	uint64_t slot;

	switch (addr & 0xFUL) {
	case 0:
		slot = IA64_PSR_RI_0;
		break;
	case 1:
		/* XXX we need to deal with MLX bundles here */
		slot = IA64_PSR_RI_1;
		break;
	case 2:
		slot = IA64_PSR_RI_2;
		break;
	default:
		return (EINVAL);
	}

	td->td_frame->tf_special.iip = addr & ~0x0FULL;
	td->td_frame->tf_special.psr =
	    (td->td_frame->tf_special.psr & ~IA64_PSR_RI) | slot;
	return (0);
}

int
ptrace_single_step(struct thread *td)
{
	struct trapframe *tf;

	/*
	 * There's no way to set single stepping when we're leaving the
	 * kernel through the EPC syscall path. The way we solve this is
	 * by enabling the lower-privilege trap so that we re-enter the
	 * kernel as soon as the privilege level changes. See trap.c for
	 * how we proceed from there.
	 */
	tf = td->td_frame;
	if (tf->tf_flags & FRAME_SYSCALL)
		tf->tf_special.psr |= IA64_PSR_LP;
	else
		tf->tf_special.psr |= IA64_PSR_SS;
	return (0);
}

int
ptrace_clear_single_step(struct thread *td)
{
	struct trapframe *tf;

	/*
	 * Clear any and all status bits we may use to implement single
	 * stepping.
	 */
	tf = td->td_frame;
	tf->tf_special.psr &= ~IA64_PSR_SS;
	tf->tf_special.psr &= ~IA64_PSR_LP;
	tf->tf_special.psr &= ~IA64_PSR_TB;
	return (0);
}

int
fill_regs(struct thread *td, struct reg *regs)
{
	struct trapframe *tf;

	tf = td->td_frame;
	regs->r_special = tf->tf_special;
	regs->r_scratch = tf->tf_scratch;
	save_callee_saved(&regs->r_preserved);
	return (0);
}

int
set_regs(struct thread *td, struct reg *regs)
{
	struct trapframe *tf;
	int error;

	tf = td->td_frame;
	error = ia64_flush_dirty(td, &tf->tf_special);
	if (!error) {
		tf->tf_special = regs->r_special;
		tf->tf_special.bspstore += tf->tf_special.ndirty;
		tf->tf_special.ndirty = 0;
		tf->tf_scratch = regs->r_scratch;
		restore_callee_saved(&regs->r_preserved);
	}
	return (error);
}

int
fill_dbregs(struct thread *td, struct dbreg *dbregs)
{

	return (ENOSYS);
}

int
set_dbregs(struct thread *td, struct dbreg *dbregs)
{

	return (ENOSYS);
}

int
fill_fpregs(struct thread *td, struct fpreg *fpregs)
{
	struct trapframe *frame = td->td_frame;
	struct pcb *pcb = td->td_pcb;

	/* Save the high FP registers. */
	ia64_highfp_save(td);

	fpregs->fpr_scratch = frame->tf_scratch_fp;
	save_callee_saved_fp(&fpregs->fpr_preserved);
	fpregs->fpr_high = pcb->pcb_high_fp;
	return (0);
}

int
set_fpregs(struct thread *td, struct fpreg *fpregs)
{
	struct trapframe *frame = td->td_frame;
	struct pcb *pcb = td->td_pcb;

	/* Throw away the high FP registers (should be redundant). */
	ia64_highfp_drop(td);

	frame->tf_scratch_fp = fpregs->fpr_scratch;
	restore_callee_saved_fp(&fpregs->fpr_preserved);
	pcb->pcb_high_fp = fpregs->fpr_high;
	return (0);
}

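/*
 * Make the instruction cache coherent with recently written code by
 * issuing fc.i over the range at a 32-byte stride (see the XXX: the
 * stride assumes the smallest line size). This is a no-op unless
 * identifycpu() flagged the CPU as needing it (family 0x20, i.e.
 * Montecito/Montvale).
 */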
void
ia64_sync_icache(vm_offset_t va, vm_offset_t sz)
{
	vm_offset_t lim;

	if (!ia64_sync_icache_needed)
		return;

	lim = va + sz;
	while (va < lim) {
		ia64_fc_i(va);
		va += 32;	/* XXX */
	}

	ia64_sync_i();
	ia64_srlz_i();
}