/*
 * linux/drivers/char/kernprof.c
 *
 * Implementation of profiling devices.  We reserve minor number 0 for a
 * control interface.  ioctl()s on this device control various profiling
 * settings.
 *
 * Copyright (C) SGI 1999, 2000, 2001
 *
 * Written by Dimitris Michailidis (dimitris@engr.sgi.com)
 * Modified by John Hawkes (hawkes@engr.sgi.com)
 * Contributions from Niels Christiansen (nchr@us.ibm.com)
 */

#include <linux/config.h>
#include <linux/module.h>
#include <linux/kernprof.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/major.h>
#include <linux/proc_fs.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/smp.h>
#include <linux/devfs_fs_kernel.h>
#include <linux/compiler.h>

#include <asm/uaccess.h>
#include <asm/kernprof.h>

#define PROF_CNTRL_MINOR 0

int prof_enabled = 0; /* any profiling active */
int prof_domain = PROF_DOMAIN_TIME, prof_mode = PROF_MODE_PC_SAMPLING;
int prof_pid = 0;
int perfctr_event = 0;
unsigned int prof_shift, PC_resolution = DFL_PC_RES;
unsigned int perfctr_freq = 1000;
unsigned long unload_timeout = 0;

prof_hook_p *prof_intr_hook = &prof_timer_hook;
prof_hook_p prof_perfctr_aux_hook = NULL;

/* This buffer holds PC samples */
PC_sample_count_t *PC_sample_buf = NULL;
size_t PC_buf_sz;

/* Switch for /proc files created */
int proc_created = 0;

int proc_handle;

/*
 * These variables deal with the call graph.  The call graph records arcs
 * linking the location of each function call to the address of the called
 * function.  It is maintained as a hash table indexed by a call site's
 * location.  The bucket associated with each hash table entry records the
 * targets of the calls.
 */
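/*
 * cg_from_base holds one hash table of 16-bit arc indices per CPU; each
 * index selects an entry in that CPU's slice of cg_to_base, and further
 * targets of the same call site are chained through the 'link' field.
 */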
unsigned short *cg_from_base = NULL;
struct cg_arc_dest *cg_to_base = NULL;
size_t cg_from_sz, cg_to_sz;
int cg_arc_overflow; /* set when no new arcs can be added to the call graph */
int n_buckets = 0;

size_t mem_needed;   /* space needed for the call graph and the PC samples */

/* And these hold backtrace samples */
struct trace_ring_buf {
	unsigned long *data;
	int start;
	int end;
	int active;
};

struct trace_ring_buf trace_bufs[NR_CPUS];

prof_mem_map_t memory_map;

unsigned char cpu_prof_enabled[NR_CPUS];
unsigned long cpu_prof_enable_map = ~0UL;

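/*
 * Per-CPU statistics, padded out to a full cache line so that concurrent
 * updates from different CPUs do not share cache lines.
 */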
#define DEBUG_RECUR_COUNT_MAX 4
static union {
	struct percpu_data {
		unsigned long lost_ones;
		unsigned long total_mcount;
		unsigned long debug_recurse_count[DEBUG_RECUR_COUNT_MAX];
		unsigned int  amhere;
	} d;
	char __pad [SMP_CACHE_BYTES];
} kernprof_cpu_data [NR_CPUS] __cacheline_aligned;

MODULE_AUTHOR("Dimitris Michailidis");
MODULE_DESCRIPTION("Kernel profile driver");

MODULE_PARM(PC_resolution, "i");
MODULE_PARM_DESC(PC_resolution, "resolution of PC samples "
		                "(rounded down to a power of 2)");

/* round x up to a multiple of n.  n must be a power of 2 */
static inline size_t roundup(size_t x, int n)
{
	return (x + n - 1) & ~(n - 1);
}

/* The next few definitions deal with procfs */
static ssize_t read_prof_buf(char *prof_buf, size_t prof_buf_sz,
			     char *user_buf, size_t count, loff_t *ppos)
{
	if (!prof_buf)
		return -EIO;
	if (*ppos >= prof_buf_sz)
		return 0;
	if (count > prof_buf_sz - *ppos)
		count = prof_buf_sz - *ppos;
	if (copy_to_user(user_buf, prof_buf + *ppos, count))
		return -EFAULT;
	*ppos += count;
	return count;
}

static ssize_t read_PC_samples(struct file *file, char *user_buf,
			       size_t count, loff_t *ppos)
{
	return read_prof_buf((char *)PC_sample_buf, PC_buf_sz, user_buf,
			     count, ppos);
}

static struct file_operations proc_PC_sample_operations = {
	read: read_PC_samples,
};

static ssize_t read_call_graph(struct file *file, char *user_buf,
			       size_t count, loff_t *ppos)
{
	return read_prof_buf((char *)cg_from_base,
			     (cg_from_sz + cg_to_sz) * smp_num_cpus,
			     user_buf, count, ppos);
}

static struct file_operations proc_call_graph_operations = {
	read: read_call_graph,
};

static void expand_enable_map(void)
{
	int i;

	for (i = 0; i < NR_CPUS; ++i)
		cpu_prof_enabled[i] = (cpu_prof_enable_map & (1L << i)) != 0;
}

static void prof_reset(void)
{
	int i;

	if (PC_sample_buf)
		memset(PC_sample_buf, 0, mem_needed);
	cg_arc_overflow = 0;
	prof_pid = 0;
	for (i = 0; i < smp_num_cpus; i++) {
#ifdef CONFIG_LIMIT_RECURS
		int c;
		for (c = 0; c < DEBUG_RECUR_COUNT_MAX; c++) {
			kernprof_cpu_data[i].d.debug_recurse_count[c] = 0L;
		}
#endif
		kernprof_cpu_data[i].d.total_mcount = 0L;
		kernprof_cpu_data[i].d.lost_ones    = 0L;
		trace_bufs[i].start = 0;
		trace_bufs[i].end   = PROF_BACKTRACE_BUFSIZE - 1;
	}
}

/* Deallocate profiling buffers */
static void prof_free_mem(void)
{
	int i;

	/* vfree() and kfree() handle NULL pointers */
	vfree(PC_sample_buf);
	PC_sample_buf = NULL;
	for (i = 0; i < smp_num_cpus; ++i) {
		kfree(trace_bufs[cpu_logical_map(i)].data);
		trace_bufs[cpu_logical_map(i)].data = NULL;
	}
}

/*
 * Allocate memory for the various profiling buffers. We are lazy and only do
 * this if we really try to use the profiling facilities.
 */
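/*
 * The single vmalloc()ed region is laid out as:
 *   [ PC sample counters | per-CPU cg_from hash tables | per-CPU cg_to arcs ]
 */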
static int prof_alloc_mem(void)
{
	char *p;
	int i;

	if ((p = vmalloc(mem_needed)) == NULL)
		return -ENOMEM;
	PC_sample_buf = (PC_sample_count_t *) p;
	memory_map.nr_cpus = smp_num_cpus;
	if (supports_call_graph)
	{
		cg_from_base = (unsigned short *) (p + PC_buf_sz);
		cg_to_base = (struct cg_arc_dest *) (p + PC_buf_sz + cg_from_sz * smp_num_cpus);
		memory_map.cg_from_size = cg_from_sz;
		memory_map.cg_to_size = cg_to_sz;
		memory_map.cg_to_offset = cg_from_sz * smp_num_cpus;
	}
	else
	{
		memory_map.cg_from_size = 0L;
		memory_map.cg_to_size = 0L;
		memory_map.cg_to_offset = 0L;
	}
	if (prof_have_frameptr)  /* allocate ring buffers for present CPUs */
		for (i = 0; i < smp_num_cpus; ++i) {
			int cpu = cpu_logical_map(i);

			trace_bufs[cpu].data = (unsigned long *)kmalloc(
				PROF_BACKTRACE_BUFSIZE * sizeof(unsigned long),
				GFP_KERNEL);
		}
	prof_reset();
	return 0;
}

/* Record a PC sample.  Called from interrupt handlers.  SMP safe. */
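/*
 * Each sample increments PC_sample_buf[(pc - _stext) >> prof_shift]; PCs
 * outside the static kernel text are folded into the USER, FIRMWARE,
 * MODULE, or UNKNOWN_KERNEL pseudo-buckets.
 */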
static void PC_sample(struct pt_regs *regs)
{
	unsigned long pc;

	if (!cpu_prof_enabled[smp_processor_id()]) return;
	if (prof_pid && (!current || current->pid != prof_pid)) return;

	pc = instruction_pointer(regs);
	if (user_mode(regs))
		pc = FUNCTIONPC(USER);
	else if (in_firmware(regs))
		pc = FUNCTIONPC(FIRMWARE);
	else if (pc >= memory_map.module_start && pc < memory_map.module_end)
		pc = FUNCTIONPC(MODULE);
	else if (pc_out_of_range(pc))
		pc = FUNCTIONPC(UNKNOWN_KERNEL);

	pc -= (unsigned long) &_stext;
	atomic_inc((atomic_t *) &PC_sample_buf[pc >> prof_shift]);
}

/* Record PC samples when woken up, called from schedule()
 * blocked --> time spent sleeping on a wait queue
 * stalled --> time spent runnable yet not running
 */
static void PC_wakeup_sample(unsigned long frompc, unsigned long blocked,
			     unsigned long stalled)
{
	if (!cpu_prof_enabled[smp_processor_id()]) return;
	if (prof_pid && (!current || current->pid != prof_pid)) return;

	if (blocked == 0)
		goto stalled;

	frompc = FUNCTIONPC(SLEEPING) - (unsigned long) &_stext;
	atomic_add(blocked * (get_prof_freq() / HZ),
		   (atomic_t *) &PC_sample_buf[frompc >> prof_shift]);

 stalled:
	if (!stalled)
		return;

	frompc = FUNCTIONPC(STALLED) - (unsigned long) &_stext;
	atomic_add(stalled * (get_prof_freq() / HZ),
		   (atomic_t *) &PC_sample_buf[frompc >> prof_shift]);
}

/* Maintain function call counts. Called by mcount().  SMP safe. */
void record_fn_call(unsigned long not_used, unsigned long pc)
{
	if (prof_pid && (!current || current->pid != prof_pid)) return;
	if (pc_out_of_range(pc))
	{
		if (pc >= memory_map.module_start && pc < memory_map.module_end)
			pc = FUNCTIONPC(MODULE);
		else
			pc = FUNCTIONPC(UNKNOWN_KERNEL);
	}
	pc -= (unsigned long) &_stext;
	atomic_inc((atomic_t *) &PC_sample_buf[pc >> prof_shift]);
}

/* Record an arc traversal in the call graph.  Called by mcount().  SMP safe */
void cg_record_arc(unsigned long frompc, unsigned long selfpc)
{
#ifndef __HAVE_ARCH_CMPXCHG16
	static spinlock_t cg_record_lock = SPIN_LOCK_UNLOCKED;
	unsigned long flags;
#endif
	int toindex;
	int fromindex;
	int cpu;
	unsigned short *q;
	struct cg_arc_dest *p;
	unsigned short *cg_from;
	struct cg_arc_dest *cg_to;
#ifdef CONFIG_LIMIT_RECURS
	uint *ishere;
#endif /* CONFIG_LIMIT_RECURS */

	cpu = smp_processor_id();
	if (!cpu_prof_enabled[cpu])
		return;
	kernprof_cpu_data[cpu].d.total_mcount++;
#ifdef CONFIG_LIMIT_RECURS
	ishere = &kernprof_cpu_data[cpu].d.amhere;
	toindex = atomic_add_return(1, (atomic_t *)ishere) - 2;
	if (unlikely(toindex >= 0)) {
		/* Ongoing decrements (see below) should keep index in range */
		if (toindex >= DEBUG_RECUR_COUNT_MAX)
			BUG();
		kernprof_cpu_data[cpu].d.debug_recurse_count[toindex]++;
		/* If we're at the highest recursion count, then bail out! */
		if (toindex == DEBUG_RECUR_COUNT_MAX-1) {
			atomic_dec((atomic_t *)ishere);
			return;
		}
	}
#endif /* CONFIG_LIMIT_RECURS */
	cg_from = (u_short *)(((char *)cg_from_base) + cg_from_sz * cpu);
	cg_to = &cg_to_base[CG_MAX_ARCS * cpu];
	if (pc_out_of_range(frompc))
	{
		if (frompc >= memory_map.module_start && frompc < memory_map.module_end)
			fromindex = (FUNCTIONPC(MODULE) - (unsigned long)&_stext) >> prof_shift;
		else
			fromindex = (FUNCTIONPC(UNKNOWN_KERNEL) - (unsigned long)&_stext) >> prof_shift;
	}
	else
		fromindex = (frompc - (unsigned long) &_stext) >> prof_shift;
	q = &cg_from[fromindex];

	/* Easy case: the arc is already in the call graph */
	for (toindex = *q; toindex != 0; ) {
		p = &cg_to[toindex];
		if (p->address == selfpc) {
			atomic_inc(&p->count);
#ifdef CONFIG_LIMIT_RECURS
			atomic_dec((atomic_t *)ishere);
#endif /* CONFIG_LIMIT_RECURS */
			return;
		}
		toindex = p->link;
	}

	/*
	 * No luck.  We need to add a new arc.  Since cg_to[0] is unused,
	 * we use cg_to[0].count to keep track of the next available arc.
	 */
	if (cg_arc_overflow)
	{
		kernprof_cpu_data[cpu].d.lost_ones++;
#ifdef CONFIG_LIMIT_RECURS
		atomic_dec((atomic_t *)ishere);
#endif /* CONFIG_LIMIT_RECURS */
		return;
	}
	toindex = atomic_add_return(1, &cg_to->count);
	if (toindex >= CG_MAX_ARCS) {
		/*
		 * We have run out of space for arcs.  We'll keep incrementing
		 * the existing ones but we won't try to add any more.
		 */
		kernprof_cpu_data[cpu].d.lost_ones++;
		cg_arc_overflow = 1;
		atomic_set(&cg_to->count, CG_MAX_ARCS - 1);
#ifdef CONFIG_LIMIT_RECURS
		atomic_dec((atomic_t *)ishere);
#endif /* CONFIG_LIMIT_RECURS */
		return;
	}

	/*
	 * We have a secured slot for a new arc and all we need to do is
	 * initialize it and add it to a hash bucket.  We use compare&swap, if
	 * possible, to avoid any spinlocks whatsoever.
	 */
	p = &cg_to[toindex];
	p->address = selfpc;
	atomic_set(&p->count, 1);
#ifdef __HAVE_ARCH_CMPXCHG16
	do {
		p->link = *q;
	} while (cmpxchg(q, p->link, toindex) != p->link);
#else
	spin_lock_irqsave(&cg_record_lock, flags);
	p->link = *q;
	*q = toindex;
	spin_unlock_irqrestore(&cg_record_lock, flags);
#endif
#ifdef CONFIG_LIMIT_RECURS
	atomic_dec((atomic_t *)ishere);
#endif /* CONFIG_LIMIT_RECURS */
	return;
}

/*
 * Record an arc traversal in the call graph, and walk up the stack to
 * find and record all the call graph arcs.  Called by schedule() (and
 * potentially others).  SMP safe.
 */
void backtrace_cg_record_arc(unsigned long frompc, unsigned long selfpc)
{
	int backtrace_count = PROF_BACKTRACE_MAX_LEN;	/* for safety */
	frame_info_t frame;
	unsigned long caller_pc, callee_pc;

	if (prof_pid && (!current || current->pid != prof_pid))
		return;

	/* If we can't build a fake frame, record what info we have and leave */
	if (!build_fake_frame(&frame)) {
#ifndef CONFIG_IA64
		caller_pc = frompc;
		callee_pc = (selfpc) ? selfpc
				: (unsigned long)__builtin_return_address(0);
		cg_record_arc(caller_pc, callee_pc);
#endif
		return;
	}

	/* Walk back to who called us */
	if (!get_next_frame(&frame)) {
		return;
	}
	callee_pc = frame_get_pc(&frame);
	if (pc_out_of_range(callee_pc)) {
		return;
	}

	/* Now walk back to who called our caller, giving us the 1st cg arc */
	if (!get_next_frame(&frame)) {
		printk("  computed callee_pc:0x%lx\n", callee_pc & 0xffffffffL);
		printk("  caller-supplied caller:0x%lx callee:0x%lx\n",
			frompc & 0xffffffffL, selfpc & 0xffffffffL);
		BUG();	/* debug */
		return;
	}
	caller_pc = frame_get_pc(&frame);
	if (pc_out_of_range(caller_pc)) {
		return;
	}
	/* Now record this cg arc and keep walking back the stack for more */
	while (backtrace_count--) {
		cg_record_arc(caller_pc, callee_pc);
		callee_pc = caller_pc;
		if (!get_next_frame(&frame))
			break;		/* quit! */
		caller_pc = frame_get_pc(&frame);
		if (pc_out_of_range(caller_pc))
			break;		/* quit! */
		backtrace_count--;
	}
}

#define PROF_TRACE_MASK (PROF_BACKTRACE_BUFSIZE - 1)

/* circularly increment i to point to the next entry in a trace ring buffer */
#define CIRC_INC(i)     (((i) + 1) & PROF_TRACE_MASK)

/*
 * In backtrace mode, add a sample to the per-processor trace bufs.
 *
 * If frame is NULL, there is no backtrace. Just record a length 1
 * backtrace at alt_pc.
 *
 * If frame is non-NULL, use it to perform a backtrace, generating a
 * list of PCs to add onto the trace bufs.
 *
 * If frame is non-NULL, and alt_pc is non-NULL, same as before, except
 * force alt_pc to be at the head of the backtrace, and pretend that the
 * first function on the frame called alt_pc.
 */
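/*
 * Each record in the ring buffer occupies one header slot followed by the
 * sampled PCs: the header packs the repeat count into the upper half of the
 * word and the number of PC entries into the lower half.
 */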

static void do_backtrace_sample(frame_info_t *frame, unsigned long alt_pc,
				unsigned long count)
{
	int free_slots, j, n_entries;
	struct trace_ring_buf *p;

	p = &trace_bufs[smp_processor_id()];
	if (!p->active ||
	    ((free_slots = ((p->end - p->start) & PROF_TRACE_MASK)) < 3))
		goto out;
	j = CIRC_INC(p->start);
	n_entries = 1;

	if (!frame) {
		p->data[j] = alt_pc;
		goto end_trace;
	}

	/* We set aside one slot for the trace length */
	if (--free_slots > PROF_BACKTRACE_MAX_LEN)
		free_slots = PROF_BACKTRACE_MAX_LEN;

	n_entries = 0;
	if (alt_pc) {
		p->data[j] = alt_pc;
		if (++n_entries == free_slots)
			goto end_trace;
		j = CIRC_INC(j);
	}
	while (1) {
		p->data[j] = frame_get_pc(frame);
		if (pc_out_of_range(p->data[j])) {
			if (p->data[j] >= memory_map.module_start &&
			    p->data[j] < memory_map.module_end)
				p->data[j] = FUNCTIONPC(MODULE);
			else
				p->data[j] = FUNCTIONPC(UNKNOWN_KERNEL);
		}
		if (++n_entries == free_slots || !get_next_frame(frame))
			break;
		j = CIRC_INC(j);
	}
end_trace:
	/* count goes in upper half of data value. 0 is interpreted as a 1 */
	p->data[p->start] = (count << ((sizeof count) * 4)) | n_entries;
	p->start = CIRC_INC(j);
out:	return;
}

/* Record a stack backtrace.  Called from interrupt handlers. No MP issues. */
static void backtrace_sample(struct pt_regs *regs)
{
	frame_info_t frame;
	u_long pc;

	if (!cpu_prof_enabled[smp_processor_id()])
		return;
	if (prof_pid && (!current || current->pid != prof_pid))
		return;

	/* Check for corner cases, otherwise generate frame from regs */

	if (user_mode(regs)) {
		pc = FUNCTIONPC(USER);
		do_backtrace_sample(NULL, pc, 0);
	} else if (in_firmware(regs)) {
		pc = FUNCTIONPC(FIRMWARE);
		do_backtrace_sample(NULL, pc, 0);
	} else if (pc_out_of_range(instruction_pointer(regs))) {
		if (instruction_pointer(regs) >= memory_map.module_start &&
		    instruction_pointer(regs) < memory_map.module_end) {
			pc = FUNCTIONPC(MODULE);
			do_backtrace_sample(NULL, pc, 0);
		} else {
			pc = FUNCTIONPC(UNKNOWN_KERNEL);
			do_backtrace_sample(NULL, pc, 0);
		}
	} else {
		/* We have a pc value within the static kernel text area */
		get_top_frame(regs, &frame);
		pc = instruction_pointer(regs);
		do_backtrace_sample(&frame, 0, 0);
	}

	pc -= (u_long) &_stext;
	atomic_inc((atomic_t *) &PC_sample_buf[pc >> prof_shift]);
}

static void backtrace_wakeup_sample(unsigned long frompc, unsigned long blocked,
				    unsigned long stalled)
{
	frame_info_t frame;
	u_long pc;

	if (!cpu_prof_enabled[smp_processor_id()])
		return;

	if (prof_pid == 0)
		printk(KERN_ERR "kernprof error: backtrace_wakeup_sample but prof_pid == 0\n");

	if (!current || current->pid != prof_pid)
		return;

	if (!build_fake_frame(&frame))
		return;

	if (!get_next_frame(&frame))
		return;

	if (blocked) {
		pc = FUNCTIONPC(SLEEPING);
		do_backtrace_sample(&frame, pc,
				    blocked * (get_prof_freq() / HZ));

		pc -= (u_long) &_stext;
		atomic_add(blocked * (get_prof_freq() / HZ),
			   (atomic_t *) &PC_sample_buf[pc >> prof_shift]);
	}

	if (stalled) {
		pc = FUNCTIONPC(STALLED);
		do_backtrace_sample(NULL, pc,
				    stalled * (get_prof_freq() / HZ));
		pc -= (u_long) &_stext;
		atomic_add(stalled * (get_prof_freq() / HZ),
			   (atomic_t *) &PC_sample_buf[pc >> prof_shift]);
	}
}

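/*
 * Reads from a per-CPU trace device return only whole ring-buffer entries;
 * when the valid region wraps around the end of the buffer the copy is done
 * in two pieces.
 */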
static ssize_t trace_read(struct file *file, char *buf,
			  size_t count, loff_t *ppos)
{
	struct trace_ring_buf *p;
	size_t avail, entries_to_write;

	p = &trace_bufs[minor(file->f_dentry->d_inode->i_rdev) - 1];
	avail = (PROF_BACKTRACE_BUFSIZE - 1) + p->start - p->end;
	avail &= PROF_TRACE_MASK;

	entries_to_write = count / sizeof(*p->data);
	if (entries_to_write > avail)
		entries_to_write = avail;
	if (entries_to_write == 0)
		return 0;
	count = entries_to_write * sizeof(*p->data);
	if (p->end + entries_to_write < PROF_BACKTRACE_BUFSIZE) {
		if (copy_to_user(buf, (void *)&p->data[p->end + 1], count))
			return -EFAULT;
		p->end += entries_to_write;
	} else {
		size_t first_part;

		avail = (PROF_BACKTRACE_BUFSIZE - 1) - p->end;
		first_part = avail * sizeof(*p->data);

		if (avail &&
		    copy_to_user(buf, (void *)&p->data[p->end + 1],
				 first_part))
			return -EFAULT;
		if (copy_to_user(buf + first_part, (void *)&p->data[0],
				 count - first_part))
			return -EFAULT;
		p->end = entries_to_write - avail - 1;
	}
	return count;
}

static int trace_release(struct inode *inode, struct file *filp)
{
	trace_bufs[minor(inode->i_rdev) - 1].active = 0;
	return 0;
}

static struct file_operations prof_trace_fops = {
	owner: THIS_MODULE,
	read: trace_read,
	release: trace_release,
};

/*
 * The perf counter interrupt handler calls this function which then calls the
 * appropriate sampling function.  We do this because we may need to reload the
 * perf counter after it overflows.
 */
void perfctr_aux_intr_handler(struct pt_regs *regs)
{
	prof_perfctr_aux_hook(regs);
	perfctr_reload(perfctr_freq);
}

/* Start the performance monitoring counters */
static void perfctr_commence(void *dummy)
{
	__perfctr_commence(perfctr_freq, perfctr_event);
}

/* Stop the performance monitoring counters */
static void perfctr_stop(void *dummy)
{
	__perfctr_stop();
}

/* Open a profiling device */
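/* Minor 0 is the control device; minors 1..NR_CPUS select the per-CPU
 * backtrace ring buffers. */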
static int prof_open(struct inode *inode, struct file *filp)
{
	int minor = minor(inode->i_rdev);

	if (minor != PROF_CNTRL_MINOR) {
		--minor;
		if (minor >= NR_CPUS || trace_bufs[minor].data == NULL)
			return -ENODEV;

		filp->f_op = &prof_trace_fops;
		trace_bufs[minor].start = 0;
		trace_bufs[minor].end = PROF_BACKTRACE_BUFSIZE - 1;
		trace_bufs[minor].active = 1;
	}

	return 0;
}

static void prof_stop(void)
{
	if (prof_mode & PROF_MODE_CALL_GRAPH) {
		/* Aggregate per-cpu counts into all-cpu counts to display */
		unsigned long total_mcount = 0L;
		unsigned long lost_ones = 0L;
		int i;
#ifdef CONFIG_LIMIT_RECURS
		int ii;
		unsigned long recur_counts[DEBUG_RECUR_COUNT_MAX];
		for (i = 0; i < DEBUG_RECUR_COUNT_MAX; i++)
			recur_counts[i] = 0L;
#endif
		for (i = 0; i < smp_num_cpus; i++) {
			total_mcount += kernprof_cpu_data[i].d.total_mcount;
			lost_ones    += kernprof_cpu_data[i].d.lost_ones;
#ifdef CONFIG_LIMIT_RECURS
			for (ii = 0; ii < DEBUG_RECUR_COUNT_MAX; ii++)
				recur_counts[ii] += kernprof_cpu_data[i].d.debug_recurse_count[ii];
#endif
		}
#ifdef CONFIG_LIMIT_RECURS
		if (lost_ones || recur_counts[DEBUG_RECUR_COUNT_MAX-1]) {
#else
		if (lost_ones) {
#endif
			printk("Total mcount invocations: %12lu\n",
				total_mcount);
			printk("Lost to table overflow:   %12lu\n",
				lost_ones);
#ifdef CONFIG_LIMIT_RECURS
			printk("Lost to recursive invoc:  %12lu\n",
				recur_counts[DEBUG_RECUR_COUNT_MAX-1]);
			printk("Recursion depth:counts: ");
			for (ii = 0; ii < DEBUG_RECUR_COUNT_MAX-1; ii++)
				printk(" %d:%lu ", ii+1, recur_counts[ii]);
			printk("\n");
#endif /* CONFIG_LIMIT_RECURS */
		}
	}
	if (prof_perfctr_hook) {
		smp_call_function(perfctr_stop, NULL, 1, 0);
		perfctr_stop(NULL);
	}
	prof_timer_hook = prof_perfctr_hook = NULL;
	mcount_hook = NULL;
	prof_scheduler_hook = NULL;
	prof_wakeup_hook = NULL;
	if (prof_enabled) {
		unload_timeout = jiffies + HZ;
		prof_enabled = 0;
		MOD_DEC_USE_COUNT;
	}
}

extern struct module *module_list;
extern struct module *static_module_list;

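/*
 * Scan the list of running modules for their "_S.text_L" section symbols and
 * report the lowest and highest module text addresses found, so that module
 * PCs can be recognized and folded into the MODULE bucket.
 */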
int prof_get_module_map(prof_mem_map_t *map)
{
   struct module        *mod;
   struct module_symbol *s;
   char                 *t;
   u_long                low = (u_long)-1L;
   u_long                high = 0L;
   u_long                end;
   int                   i;

   for (mod = module_list; mod != static_module_list; mod = mod->next)
   {
      if (mod->flags & MOD_RUNNING)
      {
         for (i = 0, s = mod->syms; i < mod->nsyms; i++, s++)
         {
            if ((t = strstr(s->name, "_S.text_L")))
            {
               if (s->value < low)
                  low = s->value;
               end = mod->size + s->value;
               if (end > high)
                  high = end;
            }
         }
      }
   }
   if (high)
   {
      map->module_start = low;
      map->module_end = high;
      map->module_buckets = 0;
      return(0);
   }
   return(-1);
}

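/*
 * Create /proc/profile and its PC_samples and call_graph entries, and size
 * the sample and call-graph buffers.  If the bucket count has changed since
 * the last call, the old entries and buffers are torn down and rebuilt.
 */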
int create_proc_files(void)
{
   struct proc_dir_entry *ent;
   prof_mem_map_t m_map;

   if (prof_get_module_map(&m_map))
   {
      m_map.module_start = m_map.module_end = 0L;
      m_map.module_buckets = 0;
   }
   if (n_buckets != memory_map.kernel_buckets + m_map.module_buckets)
   {
      if (proc_created)
      {
         remove_proc_entry("profile/PC_samples", 0);
         if (supports_call_graph)
            remove_proc_entry("profile/call_graph", 0);
         remove_proc_entry("profile", 0);
         prof_free_mem();
         proc_created = 0;
      }
      memory_map.module_buckets = 0;
      memory_map.module_start = m_map.module_start;
      memory_map.module_end = m_map.module_end;
      n_buckets = memory_map.kernel_buckets;
   }

   if (proc_created)
      return(0);

   PC_buf_sz = n_buckets * sizeof(PC_sample_count_t);

   if (!proc_mkdir("profile", 0))
   {
      printk(KERN_ERR "kernprof: unable to create /proc entries\n");
      return -ENODEV;
   }
   if ((ent = create_proc_entry("profile/PC_samples", 0, 0)) != NULL)
   {
      ent->size = PC_buf_sz;
      ent->proc_fops = &proc_PC_sample_operations;
   }
   else
      printk(KERN_ERR "kernprof: unable to create /proc entry for PC_samples\n");

   if (supports_call_graph)
   {
      /*
       * Calculate size of call graph structures.  The round-ups
       * ensure that pointers to these structures are properly
       * aligned.
       */
      cg_from_sz = n_buckets * sizeof(short);
      cg_to_sz = CG_MAX_ARCS * sizeof(struct cg_arc_dest);

      PC_buf_sz = roundup(PC_buf_sz, sizeof(unsigned long));
      cg_from_sz = roundup(cg_from_sz, sizeof(unsigned long));
      mem_needed = PC_buf_sz + cg_from_sz * smp_num_cpus + cg_to_sz * smp_num_cpus;

      if ((ent = create_proc_entry("profile/call_graph", 0, 0)))
      {
         ent->size = cg_to_sz * smp_num_cpus + cg_from_sz * smp_num_cpus;
         ent->proc_fops = &proc_call_graph_operations;
      }
      else
         printk(KERN_ERR "kernprof: unable to create /proc entry for call_graph\n");
   }
   else
      mem_needed = PC_buf_sz;

   proc_created = 1;
   return(0);
}

/*
 * ioctl handler for the kernprof control device.
 */
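/*
 * A rough user-space sketch of driving this interface (not part of the
 * driver; assumes the control device node is accessible, e.g. as
 * /dev/profile):
 *
 *	int fd = open("/dev/profile", O_RDONLY);
 *	ioctl(fd, PROF_SET_MODE, PROF_MODE_PC_SAMPLING);
 *	ioctl(fd, PROF_START, 0);
 *	... run the workload ...
 *	ioctl(fd, PROF_STOP, 0);
 *	close(fd);
 *
 * The accumulated samples can then be read from /proc/profile/PC_samples
 * (and /proc/profile/call_graph in the call-graph modes).
 */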
int prof_ctl_ioctl(struct inode *inode, struct file *filp,
		   unsigned int command, unsigned long arg)
{
	int err = 0;

	switch (command) {
	case PROF_START:
		if (prof_enabled)
			return 0;
		if (create_proc_files())
		{
			err = -EINVAL;
			return err;
		}
		if (PC_sample_buf == NULL && (err = prof_alloc_mem()))
			return err;
		MOD_INC_USE_COUNT;
		prof_enabled = 1;
		if (prof_mode & PROF_MODE_CALL_GRAPH)
		{
			mcount_hook = cg_record_arc;
		}
		else if (prof_mode & PROF_MODE_CALL_COUNT)
		{
			mcount_hook = record_fn_call;
		}
		else if (prof_mode & PROF_MODE_SCHEDULER_CALL_GRAPH)
			prof_scheduler_hook = backtrace_cg_record_arc;
		if (prof_mode & PROF_MODE_PC_SAMPLING) {
			*prof_intr_hook = PC_sample;
			if (prof_pid)
				prof_wakeup_hook = PC_wakeup_sample;
		} else if (prof_mode & PROF_MODE_BACKTRACE) {
			*prof_intr_hook = backtrace_sample;
			if (prof_pid)
				prof_wakeup_hook = backtrace_wakeup_sample;
		}
		if (prof_domain == PROF_DOMAIN_PERFCTR) {
			if (!(prof_mode & PROF_MODE_PC_SAMPLING) &&
			    !(prof_mode & PROF_MODE_BACKTRACE))
			{
				err = -EINVAL;
				return err;
			}
			prof_perfctr_hook = perfctr_aux_intr_handler;
			smp_call_function(perfctr_commence, NULL, 1, 0);
			perfctr_commence(NULL);
		}
		break;
	case PROF_STOP:
		prof_stop();
		break;
	case PROF_RESET:
		prof_stop();         /* resetting also stops profiling */
		prof_reset();
		break;
	case PROF_SET_SAMPLE_FREQ:
		if (prof_domain == PROF_DOMAIN_TIME)
			err = setup_profiling_timer(arg);
		else if (prof_domain == PROF_DOMAIN_PERFCTR) {
			if (valid_perfctr_freq(arg))
				perfctr_freq = arg;
			else
				err = -EINVAL;
		} else
			err = -EINVAL;
		break;
	case PROF_GET_SAMPLE_FREQ:
		if (prof_domain == PROF_DOMAIN_TIME) {
			unsigned int freq = get_prof_freq();
			err = copy_to_user((void *)arg, &freq, sizeof freq) ?
				-EFAULT : 0;
		} else
			err = copy_to_user((void *)arg, &perfctr_freq,
					   sizeof perfctr_freq) ? -EFAULT : 0;
		break;
	case PROF_GET_PC_RES:
		err = copy_to_user((void *)arg, &PC_resolution,
				   sizeof PC_resolution) ? -EFAULT : 0;
		break;
	case PROF_GET_ON_OFF_STATE:
		err = copy_to_user((void *)arg, &prof_enabled,
				   sizeof prof_enabled) ? -EFAULT : 0;
		break;
	case PROF_SET_DOMAIN:
		if (arg != prof_domain)  /* changing domains stops profiling */
			prof_stop();
		if (arg == PROF_DOMAIN_TIME) {
			prof_domain = arg;
			prof_intr_hook = &prof_timer_hook;
		} else if (arg == PROF_DOMAIN_PERFCTR && have_perfctr()) {
			prof_domain = arg;
			prof_intr_hook = &prof_perfctr_aux_hook;
		} else
			err = -EINVAL;
		break;
	case PROF_GET_DOMAIN:
		err = copy_to_user((void *)arg, &prof_domain,
				   sizeof prof_domain) ? -EFAULT : 0;
		break;
	case PROF_SET_MODE:
		if (arg != prof_mode) /* changing modes also stops profiling */
			prof_stop();
		if (arg == PROF_MODE_PC_SAMPLING)
			prof_mode = arg;
		else if (arg == PROF_MODE_BACKTRACE && prof_have_frameptr)
			prof_mode = arg;
		else if (arg == PROF_MODE_CALL_COUNT && prof_have_mcount)
			prof_mode = arg;
		else if (supports_call_graph &&
			  (arg == PROF_MODE_SCHEDULER_CALL_GRAPH ||
			   arg == PROF_MODE_CALL_GRAPH ||
			   arg == (PROF_MODE_CALL_GRAPH|PROF_MODE_PC_SAMPLING)))
			prof_mode = arg;
		else
			err = -EINVAL;
		break;
	case PROF_GET_MODE:
		err = copy_to_user((void *)arg, &prof_mode, sizeof prof_mode) ?
			-EFAULT : 0;
		break;
	case PROF_SET_PID:
		if (prof_enabled) /* don't change PID while profiling */
			err = -EINVAL;
		else {
			prof_reset();
			prof_pid = arg;
		}
		break;
	case PROF_GET_PID:
		err = copy_to_user((void *)arg, &prof_pid, sizeof prof_pid) ?
			-EFAULT : 0;
		break;
	case PROF_SET_PERFCTR_EVENT:
		if (have_perfctr() && valid_perfctr_event(arg))
			perfctr_event = arg;
		else
			err = -EINVAL;
		break;
	case PROF_GET_PERFCTR_EVENT:
		if (have_perfctr())
			err = copy_to_user((void *)arg, &perfctr_event,
					   sizeof perfctr_event) ? -EFAULT : 0;
		else
			err = -EINVAL;
		break;
	case PROF_SET_ENABLE_MAP:
		if (get_user(cpu_prof_enable_map, (u_long *)arg))
			err = -EFAULT;
		else {
			cpu_prof_enable_map &= cpu_online_map;
			expand_enable_map();
		}
		break;
	case PROF_GET_ENABLE_MAP:
		err = copy_to_user((void *)arg, &cpu_prof_enable_map,
				   sizeof cpu_prof_enable_map) ? -EFAULT : 0;
		break;
	case PROF_GET_MAPPING:
		err = copy_to_user((void *)arg, &memory_map,
				   sizeof memory_map) ? -EFAULT : 0;
		break;
	default:
		err = -EINVAL;
	}

	return err;
}

static struct file_operations prof_ctl_fops = {
	owner: THIS_MODULE,
	ioctl: prof_ctl_ioctl,
	open: prof_open,
};

#ifndef MODULE
static int __init kernprof_setup(char *str)
{
	int res;

	if (get_option(&str, &res)) PC_resolution = res;
	return 1;
}

__setup("kernprof=", kernprof_setup);
#else
static int can_unload(void)
{
	int ret = atomic_read(&__this_module.uc.usecount);

	/*
	 * It is conceivable that we may try to delete this module just as
	 * an interrupt handler is trying to write into a profile buffer.
	 * Since unloading the module frees the buffers, that would be
	 * unfortunate.  To avoid such races this module may not be unloaded
	 * within one second after profiling is turned off.
	 */
	if (time_before(jiffies, unload_timeout))
		ret = 1;

	return ret;
}
#endif

int __init kernprof_init(void)
{
	size_t text_size = (unsigned long) &_etext - (unsigned long) &_stext;
	int ret;

	/* Round PC_resolution down to a power of 2 and compute its log */
	if (PC_resolution == 0)
		PC_resolution = DFL_PC_RES;
	while ((PC_resolution & (PC_resolution - 1)) != 0)
		PC_resolution &= PC_resolution - 1;
	for (prof_shift = 0; (1 << prof_shift) < PC_resolution; prof_shift++);

	/* Calculate size of PC-sample buffer. */
	memory_map.kernel_buckets = n_buckets = text_size >> prof_shift;
	memory_map.kernel_start = (u_long)&_stext;
	memory_map.kernel_end = (u_long)&_etext;

#ifdef MODULE
	__this_module.can_unload = can_unload;
#endif
	memset(trace_bufs, 0, sizeof trace_bufs);

	cpu_prof_enable_map = cpu_online_map;
	expand_enable_map();

	ret = devfs_register_chrdev(KERNPROF_MAJOR, "profile", &prof_ctl_fops);
	if (ret < 0)
		return ret;
	proc_handle = devfs_register(NULL, "profile",
				     DEVFS_FL_NONE, KERNPROF_MAJOR, 0,
				     S_IFCHR | S_IRUSR | S_IWUSR | S_IRGRP,
				     &prof_ctl_fops, NULL);
	return 0;
}

/* This must be static for some reason */
static void __exit kernprof_exit(void)
{
	devfs_unregister(proc_handle);
	devfs_unregister_chrdev(KERNPROF_MAJOR, "profile");
	remove_proc_entry("profile/PC_samples", 0);
	if (supports_call_graph)
		remove_proc_entry("profile/call_graph", 0);
	remove_proc_entry("profile", 0);
	prof_free_mem();
}

module_init(kernprof_init);
module_exit(kernprof_exit);