dtrace_subr.c revision 315012
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 * 22 * $FreeBSD: stable/10/sys/cddl/dev/dtrace/i386/dtrace_subr.c 315012 2017-03-10 18:52:37Z markj $ 23 * 24 */ 25/* 26 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 27 * Use is subject to license terms. 28 */ 29 30/* 31 * Copyright (c) 2011, Joyent, Inc. All rights reserved. 32 */ 33 34#include <sys/param.h> 35#include <sys/systm.h> 36#include <sys/types.h> 37#include <sys/cpuset.h> 38#include <sys/kernel.h> 39#include <sys/malloc.h> 40#include <sys/kmem.h> 41#include <sys/smp.h> 42#include <sys/dtrace_impl.h> 43#include <sys/dtrace_bsd.h> 44#include <machine/clock.h> 45#include <machine/cpufunc.h> 46#include <machine/frame.h> 47#include <machine/psl.h> 48#include <vm/pmap.h> 49 50extern uintptr_t kernelbase; 51extern uintptr_t dtrace_in_probe_addr; 52extern int dtrace_in_probe; 53 54extern void dtrace_getnanotime(struct timespec *tsp); 55 56int dtrace_invop(uintptr_t, uintptr_t *, uintptr_t); 57 58typedef struct dtrace_invop_hdlr { 59 int (*dtih_func)(uintptr_t, uintptr_t *, uintptr_t); 60 struct dtrace_invop_hdlr *dtih_next; 61} dtrace_invop_hdlr_t; 62 63dtrace_invop_hdlr_t *dtrace_invop_hdlr; 64 65int 66dtrace_invop(uintptr_t addr, uintptr_t *stack, uintptr_t eax) 67{ 68 dtrace_invop_hdlr_t *hdlr; 69 int rval; 70 71 for (hdlr = dtrace_invop_hdlr; hdlr != NULL; hdlr = hdlr->dtih_next) 72 if ((rval = hdlr->dtih_func(addr, stack, eax)) != 0) 73 return (rval); 74 75 return (0); 76} 77 78void 79dtrace_invop_add(int (*func)(uintptr_t, uintptr_t *, uintptr_t)) 80{ 81 dtrace_invop_hdlr_t *hdlr; 82 83 hdlr = kmem_alloc(sizeof (dtrace_invop_hdlr_t), KM_SLEEP); 84 hdlr->dtih_func = func; 85 hdlr->dtih_next = dtrace_invop_hdlr; 86 dtrace_invop_hdlr = hdlr; 87} 88 89void 90dtrace_invop_remove(int (*func)(uintptr_t, uintptr_t *, uintptr_t)) 91{ 92 dtrace_invop_hdlr_t *hdlr = dtrace_invop_hdlr, *prev = NULL; 93 94 for (;;) { 95 if (hdlr == NULL) 96 panic("attempt to remove non-existent invop handler"); 97 98 if (hdlr->dtih_func == func) 99 break; 100 101 prev = hdlr; 102 hdlr = hdlr->dtih_next; 103 } 104 105 if (prev == NULL) { 106 ASSERT(dtrace_invop_hdlr == hdlr); 107 dtrace_invop_hdlr = hdlr->dtih_next; 108 } else { 109 ASSERT(dtrace_invop_hdlr != hdlr); 110 prev->dtih_next = hdlr->dtih_next; 111 } 112 113 kmem_free(hdlr, 0); 114} 115 116void 117dtrace_toxic_ranges(void (*func)(uintptr_t base, uintptr_t limit)) 118{ 119 (*func)(0, kernelbase); 120} 121 122void 123dtrace_xcall(processorid_t cpu, dtrace_xcall_t func, void *arg) 124{ 125 cpuset_t cpus; 126 127 if (cpu == DTRACE_CPUALL) 128 cpus = all_cpus; 129 else 130 CPU_SETOF(cpu, &cpus); 131 132 smp_rendezvous_cpus(cpus, smp_no_rendevous_barrier, func, 133 smp_no_rendevous_barrier, arg); 134} 135 136static void 137dtrace_sync_func(void) 138{ 139} 140 141void 142dtrace_sync(void) 143{ 144 dtrace_xcall(DTRACE_CPUALL, (dtrace_xcall_t)dtrace_sync_func, NULL); 145} 146 147#ifdef notyet 148int (*dtrace_fasttrap_probe_ptr)(struct regs *); 149int (*dtrace_pid_probe_ptr)(struct regs *); 150int (*dtrace_return_probe_ptr)(struct regs *); 151 152void 153dtrace_user_probe(struct regs *rp, caddr_t addr, processorid_t cpuid) 154{ 155 krwlock_t *rwp; 156 proc_t *p = curproc; 157 extern void trap(struct regs *, caddr_t, processorid_t); 158 159 if (USERMODE(rp->r_cs) || (rp->r_ps & PS_VM)) { 160 if (curthread->t_cred != p->p_cred) { 161 cred_t *oldcred = curthread->t_cred; 162 /* 163 * DTrace accesses t_cred in probe context. t_cred 164 * must always be either NULL, or point to a valid, 165 * allocated cred structure. 166 */ 167 curthread->t_cred = crgetcred(); 168 crfree(oldcred); 169 } 170 } 171 172 if (rp->r_trapno == T_DTRACE_RET) { 173 uint8_t step = curthread->t_dtrace_step; 174 uint8_t ret = curthread->t_dtrace_ret; 175 uintptr_t npc = curthread->t_dtrace_npc; 176 177 if (curthread->t_dtrace_ast) { 178 aston(curthread); 179 curthread->t_sig_check = 1; 180 } 181 182 /* 183 * Clear all user tracing flags. 184 */ 185 curthread->t_dtrace_ft = 0; 186 187 /* 188 * If we weren't expecting to take a return probe trap, kill 189 * the process as though it had just executed an unassigned 190 * trap instruction. 191 */ 192 if (step == 0) { 193 tsignal(curthread, SIGILL); 194 return; 195 } 196 197 /* 198 * If we hit this trap unrelated to a return probe, we're 199 * just here to reset the AST flag since we deferred a signal 200 * until after we logically single-stepped the instruction we 201 * copied out. 202 */ 203 if (ret == 0) { 204 rp->r_pc = npc; 205 return; 206 } 207 208 /* 209 * We need to wait until after we've called the 210 * dtrace_return_probe_ptr function pointer to set %pc. 211 */ 212 rwp = &CPU->cpu_ft_lock; 213 rw_enter(rwp, RW_READER); 214 if (dtrace_return_probe_ptr != NULL) 215 (void) (*dtrace_return_probe_ptr)(rp); 216 rw_exit(rwp); 217 rp->r_pc = npc; 218 219 } else if (rp->r_trapno == T_DTRACE_PROBE) { 220 rwp = &CPU->cpu_ft_lock; 221 rw_enter(rwp, RW_READER); 222 if (dtrace_fasttrap_probe_ptr != NULL) 223 (void) (*dtrace_fasttrap_probe_ptr)(rp); 224 rw_exit(rwp); 225 226 } else if (rp->r_trapno == T_BPTFLT) { 227 uint8_t instr; 228 rwp = &CPU->cpu_ft_lock; 229 230 /* 231 * The DTrace fasttrap provider uses the breakpoint trap 232 * (int 3). We let DTrace take the first crack at handling 233 * this trap; if it's not a probe that DTrace knowns about, 234 * we call into the trap() routine to handle it like a 235 * breakpoint placed by a conventional debugger. 236 */ 237 rw_enter(rwp, RW_READER); 238 if (dtrace_pid_probe_ptr != NULL && 239 (*dtrace_pid_probe_ptr)(rp) == 0) { 240 rw_exit(rwp); 241 return; 242 } 243 rw_exit(rwp); 244 245 /* 246 * If the instruction that caused the breakpoint trap doesn't 247 * look like an int 3 anymore, it may be that this tracepoint 248 * was removed just after the user thread executed it. In 249 * that case, return to user land to retry the instuction. 250 */ 251 if (fuword8((void *)(rp->r_pc - 1), &instr) == 0 && 252 instr != FASTTRAP_INSTR) { 253 rp->r_pc--; 254 return; 255 } 256 257 trap(rp, addr, cpuid); 258 259 } else { 260 trap(rp, addr, cpuid); 261 } 262} 263 264void 265dtrace_safe_synchronous_signal(void) 266{ 267 kthread_t *t = curthread; 268 struct regs *rp = lwptoregs(ttolwp(t)); 269 size_t isz = t->t_dtrace_npc - t->t_dtrace_pc; 270 271 ASSERT(t->t_dtrace_on); 272 273 /* 274 * If we're not in the range of scratch addresses, we're not actually 275 * tracing user instructions so turn off the flags. If the instruction 276 * we copied out caused a synchonous trap, reset the pc back to its 277 * original value and turn off the flags. 278 */ 279 if (rp->r_pc < t->t_dtrace_scrpc || 280 rp->r_pc > t->t_dtrace_astpc + isz) { 281 t->t_dtrace_ft = 0; 282 } else if (rp->r_pc == t->t_dtrace_scrpc || 283 rp->r_pc == t->t_dtrace_astpc) { 284 rp->r_pc = t->t_dtrace_pc; 285 t->t_dtrace_ft = 0; 286 } 287} 288 289int 290dtrace_safe_defer_signal(void) 291{ 292 kthread_t *t = curthread; 293 struct regs *rp = lwptoregs(ttolwp(t)); 294 size_t isz = t->t_dtrace_npc - t->t_dtrace_pc; 295 296 ASSERT(t->t_dtrace_on); 297 298 /* 299 * If we're not in the range of scratch addresses, we're not actually 300 * tracing user instructions so turn off the flags. 301 */ 302 if (rp->r_pc < t->t_dtrace_scrpc || 303 rp->r_pc > t->t_dtrace_astpc + isz) { 304 t->t_dtrace_ft = 0; 305 return (0); 306 } 307 308 /* 309 * If we have executed the original instruction, but we have performed 310 * neither the jmp back to t->t_dtrace_npc nor the clean up of any 311 * registers used to emulate %rip-relative instructions in 64-bit mode, 312 * we'll save ourselves some effort by doing that here and taking the 313 * signal right away. We detect this condition by seeing if the program 314 * counter is the range [scrpc + isz, astpc). 315 */ 316 if (rp->r_pc >= t->t_dtrace_scrpc + isz && 317 rp->r_pc < t->t_dtrace_astpc) { 318#ifdef __amd64 319 /* 320 * If there is a scratch register and we're on the 321 * instruction immediately after the modified instruction, 322 * restore the value of that scratch register. 323 */ 324 if (t->t_dtrace_reg != 0 && 325 rp->r_pc == t->t_dtrace_scrpc + isz) { 326 switch (t->t_dtrace_reg) { 327 case REG_RAX: 328 rp->r_rax = t->t_dtrace_regv; 329 break; 330 case REG_RCX: 331 rp->r_rcx = t->t_dtrace_regv; 332 break; 333 case REG_R8: 334 rp->r_r8 = t->t_dtrace_regv; 335 break; 336 case REG_R9: 337 rp->r_r9 = t->t_dtrace_regv; 338 break; 339 } 340 } 341#endif 342 rp->r_pc = t->t_dtrace_npc; 343 t->t_dtrace_ft = 0; 344 return (0); 345 } 346 347 /* 348 * Otherwise, make sure we'll return to the kernel after executing 349 * the copied out instruction and defer the signal. 350 */ 351 if (!t->t_dtrace_step) { 352 ASSERT(rp->r_pc < t->t_dtrace_astpc); 353 rp->r_pc += t->t_dtrace_astpc - t->t_dtrace_scrpc; 354 t->t_dtrace_step = 1; 355 } 356 357 t->t_dtrace_ast = 1; 358 359 return (1); 360} 361#endif 362 363static int64_t tgt_cpu_tsc; 364static int64_t hst_cpu_tsc; 365static int64_t tsc_skew[MAXCPU]; 366static uint64_t nsec_scale; 367 368/* See below for the explanation of this macro. */ 369#define SCALE_SHIFT 28 370 371static void 372dtrace_gethrtime_init_cpu(void *arg) 373{ 374 uintptr_t cpu = (uintptr_t) arg; 375 376 if (cpu == curcpu) 377 tgt_cpu_tsc = rdtsc(); 378 else 379 hst_cpu_tsc = rdtsc(); 380} 381 382static void 383dtrace_gethrtime_init(void *arg) 384{ 385 cpuset_t map; 386 struct pcpu *pc; 387 uint64_t tsc_f; 388 int i; 389 390 /* 391 * Get TSC frequency known at this moment. 392 * This should be constant if TSC is invariant. 393 * Otherwise tick->time conversion will be inaccurate, but 394 * will preserve monotonic property of TSC. 395 */ 396 tsc_f = atomic_load_acq_64(&tsc_freq); 397 398 /* 399 * The following line checks that nsec_scale calculated below 400 * doesn't overflow 32-bit unsigned integer, so that it can multiply 401 * another 32-bit integer without overflowing 64-bit. 402 * Thus minimum supported TSC frequency is 62.5MHz. 403 */ 404 KASSERT(tsc_f > (NANOSEC >> (32 - SCALE_SHIFT)), ("TSC frequency is too low")); 405 406 /* 407 * We scale up NANOSEC/tsc_f ratio to preserve as much precision 408 * as possible. 409 * 2^28 factor was chosen quite arbitrarily from practical 410 * considerations: 411 * - it supports TSC frequencies as low as 62.5MHz (see above); 412 * - it provides quite good precision (e < 0.01%) up to THz 413 * (terahertz) values; 414 */ 415 nsec_scale = ((uint64_t)NANOSEC << SCALE_SHIFT) / tsc_f; 416 417 /* The current CPU is the reference one. */ 418 sched_pin(); 419 tsc_skew[curcpu] = 0; 420 CPU_FOREACH(i) { 421 if (i == curcpu) 422 continue; 423 424 pc = pcpu_find(i); 425 CPU_SETOF(PCPU_GET(cpuid), &map); 426 CPU_SET(pc->pc_cpuid, &map); 427 428 smp_rendezvous_cpus(map, NULL, 429 dtrace_gethrtime_init_cpu, 430 smp_no_rendevous_barrier, (void *)(uintptr_t) i); 431 432 tsc_skew[i] = tgt_cpu_tsc - hst_cpu_tsc; 433 } 434 sched_unpin(); 435} 436 437SYSINIT(dtrace_gethrtime_init, SI_SUB_SMP, SI_ORDER_ANY, dtrace_gethrtime_init, NULL); 438 439/* 440 * DTrace needs a high resolution time function which can 441 * be called from a probe context and guaranteed not to have 442 * instrumented with probes itself. 443 * 444 * Returns nanoseconds since boot. 445 */ 446uint64_t 447dtrace_gethrtime() 448{ 449 uint64_t tsc; 450 uint32_t lo; 451 uint32_t hi; 452 453 /* 454 * We split TSC value into lower and higher 32-bit halves and separately 455 * scale them with nsec_scale, then we scale them down by 2^28 456 * (see nsec_scale calculations) taking into account 32-bit shift of 457 * the higher half and finally add. 458 */ 459 tsc = rdtsc() - tsc_skew[curcpu]; 460 lo = tsc; 461 hi = tsc >> 32; 462 return (((lo * nsec_scale) >> SCALE_SHIFT) + 463 ((hi * nsec_scale) << (32 - SCALE_SHIFT))); 464} 465 466uint64_t 467dtrace_gethrestime(void) 468{ 469 struct timespec current_time; 470 471 dtrace_getnanotime(¤t_time); 472 473 return (current_time.tv_sec * 1000000000ULL + current_time.tv_nsec); 474} 475 476/* Function to handle DTrace traps during probes. See i386/i386/trap.c */ 477int 478dtrace_trap(struct trapframe *frame, u_int type) 479{ 480 uint16_t nofault; 481 482 /* 483 * A trap can occur while DTrace executes a probe. Before 484 * executing the probe, DTrace blocks re-scheduling and sets 485 * a flag in it's per-cpu flags to indicate that it doesn't 486 * want to fault. On returning from the probe, the no-fault 487 * flag is cleared and finally re-scheduling is enabled. 488 * 489 * Check if DTrace has enabled 'no-fault' mode: 490 * 491 */ 492 sched_pin(); 493 nofault = cpu_core[curcpu].cpuc_dtrace_flags & CPU_DTRACE_NOFAULT; 494 sched_unpin(); 495 if (nofault) { 496 KASSERT((read_eflags() & PSL_I) == 0, ("interrupts enabled")); 497 498 /* 499 * There are only a couple of trap types that are expected. 500 * All the rest will be handled in the usual way. 501 */ 502 switch (type) { 503 /* General protection fault. */ 504 case T_PROTFLT: 505 /* Flag an illegal operation. */ 506 cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP; 507 508 /* 509 * Offset the instruction pointer to the instruction 510 * following the one causing the fault. 511 */ 512 frame->tf_eip += dtrace_instr_size((u_char *) frame->tf_eip); 513 return (1); 514 /* Page fault. */ 515 case T_PAGEFLT: 516 /* Flag a bad address. */ 517 cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_BADADDR; 518 cpu_core[curcpu].cpuc_dtrace_illval = rcr2(); 519 520 /* 521 * Offset the instruction pointer to the instruction 522 * following the one causing the fault. 523 */ 524 frame->tf_eip += dtrace_instr_size((u_char *) frame->tf_eip); 525 return (1); 526 default: 527 /* Handle all other traps in the usual way. */ 528 break; 529 } 530 } 531 532 /* Handle the trap in the usual way. */ 533 return (0); 534} 535