// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2023 Red Hat Inc, Daniel Bristot de Oliveira <bristot@kernel.org>
 */

#include <stdlib.h>
#include <errno.h>
#include "utils.h"
#include "osnoise.h"
#include "timerlat.h"
#include <unistd.h>

/*
 * Per-cpu state machine: each CPU alternates between waiting for the
 * timerlat IRQ event and waiting for the timerlat thread event.
 */
enum timelat_state {
	TIMERLAT_INIT = 0,
	TIMERLAT_WAITING_IRQ,
	TIMERLAT_WAITING_THREAD,
};

/*
 * Used to fill spaces in the output, via "%.*s" with a computed width.
 * NOTE(review): the literal was collapsed by whitespace mangling in the
 * extracted source; it must be at least as long as the largest width
 * printed from it (40 below), so it is restored to a long run of spaces.
 */
static const char *spaces = "                                                         ";

/* Maximum length (including NUL) stored for a task comm */
#define MAX_COMM	24

/*
 * Per-cpu data statistics and data.
 */
struct timerlat_aa_data {
	/* Current CPU state (enum timelat_state) */
	int curr_state;

	/* timerlat IRQ latency */
	unsigned long long tlat_irq_seqnum;
	unsigned long long tlat_irq_latency;
	unsigned long long tlat_irq_timstamp;

	/* timerlat Thread latency */
	unsigned long long tlat_thread_seqnum;
	unsigned long long tlat_thread_latency;
	unsigned long long tlat_thread_timstamp;

	/*
	 * Information about the thread running when the IRQ
	 * arrived.
	 *
	 * This can be blocking or interference, depending on the
	 * priority of the thread. Assuming timerlat is the highest
	 * prio, it is blocking. If timerlat has a lower prio, it is
	 * interference.
	 * note: "unsigned long long" because they are fetch using tep_get_field_val();
	 */
	unsigned long long run_thread_pid;
	char run_thread_comm[MAX_COMM];
	unsigned long long thread_blocking_duration;
	unsigned long long max_exit_idle_latency;

	/* Information about the timerlat timer irq */
	unsigned long long timer_irq_start_time;
	unsigned long long timer_irq_start_delay;
	unsigned long long timer_irq_duration;
	unsigned long long timer_exit_from_idle;

	/*
	 * Information about the last IRQ before the timerlat irq
	 * arrived.
	 *
	 * If now - timestamp is <= latency, it might have influenced
	 * in the timerlat irq latency. Otherwise, ignore it.
	 */
	unsigned long long prev_irq_duration;
	unsigned long long prev_irq_timstamp;

	/*
	 * Interference sum, accumulated between the timerlat IRQ and
	 * the timerlat thread events of one activation.
	 */
	unsigned long long thread_nmi_sum;
	unsigned long long thread_irq_sum;
	unsigned long long thread_softirq_sum;
	unsigned long long thread_thread_sum;

	/*
	 * Interference task information: parsed, human-readable lines
	 * are accumulated in trace_seq buffers instead of raw records.
	 */
	struct trace_seq *prev_irqs_seq;
	struct trace_seq *nmi_seq;
	struct trace_seq *irqs_seq;
	struct trace_seq *softirqs_seq;
	struct trace_seq *threads_seq;
	struct trace_seq *stack_seq;

	/*
	 * Current thread on this CPU, kept up to date by the
	 * sched_switch handler.
	 */
	char current_comm[MAX_COMM];
	unsigned long long current_pid;

	/*
	 * Is the system running a kworker? Work struct pointer and the
	 * function it executes (both raw addresses from the trace).
	 */
	unsigned long long kworker;
	unsigned long long kworker_func;
};

/*
 * The analysis context and system wide view
 */
struct timerlat_aa_context {
	int nr_cpus;
	int dump_tasks;

	/* per CPU data, array of nr_cpus entries */
	struct timerlat_aa_data *taa_data;

	/*
	 * required to translate function names and register
	 * events.
	 */
	struct osnoise_tool *tool;
};

/*
 * The data is stored as a local variable, but accessed via a helper function.
 *
 * It could be stored inside the trace context. But every access would
 * require container_of() + a series of pointers. Do we need it? Not sure.
 *
 * For now keep it simple. If needed, store it in the tool, add the *context
 * as a parameter in timerlat_aa_get_ctx() and do the magic there.
 */
static struct timerlat_aa_context *__timerlat_aa_ctx;

/*
 * timerlat_aa_get_ctx - Return the system-wide auto-analysis context
 */
static struct timerlat_aa_context *timerlat_aa_get_ctx(void)
{
	return __timerlat_aa_ctx;
}

/*
 * timerlat_aa_get_data - Get the per-cpu data from the timerlat context
 */
static struct timerlat_aa_data
*timerlat_aa_get_data(struct timerlat_aa_context *taa_ctx, int cpu)
{
	return &taa_ctx->taa_data[cpu];
}

/*
 * timerlat_aa_irq_latency - Handles timerlat IRQ event
 *
 * Resets the per-activation state and records the IRQ latency values.
 * Also tracks the maximum "exit from idle" latency when the IRQ hit the
 * idle task. Always returns 0.
 */
static int timerlat_aa_irq_latency(struct timerlat_aa_data *taa_data,
				   struct trace_seq *s, struct tep_record *record,
				   struct tep_event *event)
{
	/*
	 * For interference, we start now looking for things that can delay
	 * the thread.
	 */
	taa_data->curr_state = TIMERLAT_WAITING_THREAD;
	taa_data->tlat_irq_timstamp = record->ts;

	/*
	 * Zero values.
	 */
	taa_data->thread_nmi_sum = 0;
	taa_data->thread_irq_sum = 0;
	taa_data->thread_softirq_sum = 0;
	taa_data->thread_thread_sum = 0;
	taa_data->thread_blocking_duration = 0;
	taa_data->timer_irq_start_time = 0;
	taa_data->timer_irq_duration = 0;
	taa_data->timer_exit_from_idle = 0;

	/*
	 * Zero interference tasks.
	 */
	trace_seq_reset(taa_data->nmi_seq);
	trace_seq_reset(taa_data->irqs_seq);
	trace_seq_reset(taa_data->softirqs_seq);
	trace_seq_reset(taa_data->threads_seq);

	/* IRQ latency values */
	tep_get_field_val(s, event, "timer_latency", record, &taa_data->tlat_irq_latency, 1);
	tep_get_field_val(s, event, "seqnum", record, &taa_data->tlat_irq_seqnum, 1);

	/* The thread that can cause blocking */
	tep_get_common_field_val(s, event, "common_pid", record, &taa_data->run_thread_pid, 1);

	/*
	 * Get exit from idle case.
	 *
	 * If it is not idle thread (pid 0):
	 */
	if (taa_data->run_thread_pid)
		return 0;

	/*
	 * if the latency is shorter than the known exit from idle:
	 */
	if (taa_data->tlat_irq_latency < taa_data->max_exit_idle_latency)
		return 0;

	/*
	 * To be safe, ignore the cases in which an IRQ/NMI could have
	 * interfered with the timerlat IRQ.
	 */
	if (taa_data->tlat_irq_timstamp - taa_data->tlat_irq_latency
	    < taa_data->prev_irq_timstamp + taa_data->prev_irq_duration)
		return 0;

	taa_data->max_exit_idle_latency = taa_data->tlat_irq_latency;

	return 0;
}

/*
 * timerlat_aa_thread_latency - Handles timerlat thread event
 *
 * Records the thread latency values and flips the state machine back
 * to waiting for the next timerlat IRQ. Always returns 0.
 */
static int timerlat_aa_thread_latency(struct timerlat_aa_data *taa_data,
				      struct trace_seq *s, struct tep_record *record,
				      struct tep_event *event)
{
	/*
	 * For interference, we start now looking for things that can delay
	 * the IRQ of the next cycle.
	 */
	taa_data->curr_state = TIMERLAT_WAITING_IRQ;
	taa_data->tlat_thread_timstamp = record->ts;

	/* Thread latency values */
	tep_get_field_val(s, event, "timer_latency", record, &taa_data->tlat_thread_latency, 1);
	tep_get_field_val(s, event, "seqnum", record, &taa_data->tlat_thread_seqnum, 1);

	return 0;
}

/*
 * timerlat_aa_handler - Handle timerlat events
 *
 * This function is called to handle timerlat events recording statistics.
 * The "context" field of the event selects the IRQ (0) or thread handler.
 *
 * Returns 0 on success, -1 otherwise.
 */
static int timerlat_aa_handler(struct trace_seq *s, struct tep_record *record,
			       struct tep_event *event, void *context)
{
	struct timerlat_aa_context *taa_ctx = timerlat_aa_get_ctx();
	struct timerlat_aa_data *taa_data = timerlat_aa_get_data(taa_ctx, record->cpu);
	unsigned long long thread;

	if (!taa_data)
		return -1;

	tep_get_field_val(s, event, "context", record, &thread, 1);
	if (!thread)
		return timerlat_aa_irq_latency(taa_data, s, record, event);
	else
		return timerlat_aa_thread_latency(taa_data, s, record, event);
}

/*
 * timerlat_aa_nmi_handler - Handles NMI noise
 *
 * It is used to collect information about interferences from NMI. It is
 * hooked to the osnoise:nmi_noise event.
263150044Sle */ 264150044Slestatic int timerlat_aa_nmi_handler(struct trace_seq *s, struct tep_record *record, 265150044Sle struct tep_event *event, void *context) 266150044Sle{ 267150044Sle struct timerlat_aa_context *taa_ctx = timerlat_aa_get_ctx(); 268150044Sle struct timerlat_aa_data *taa_data = timerlat_aa_get_data(taa_ctx, record->cpu); 269150044Sle unsigned long long duration; 270130391Sle unsigned long long start; 271150044Sle 272150044Sle tep_get_field_val(s, event, "duration", record, &duration, 1); 273150044Sle tep_get_field_val(s, event, "start", record, &start, 1); 274150044Sle 275150044Sle if (taa_data->curr_state == TIMERLAT_WAITING_IRQ) { 276150044Sle taa_data->prev_irq_duration = duration; 277150044Sle taa_data->prev_irq_timstamp = start; 278150044Sle 279150044Sle trace_seq_reset(taa_data->prev_irqs_seq); 280150044Sle trace_seq_printf(taa_data->prev_irqs_seq, " %24s %.*s %9.2f us\n", 281150044Sle "nmi", 282150044Sle 24, spaces, 283150044Sle ns_to_usf(duration)); 284150044Sle return 0; 285150044Sle } 286150044Sle 287150044Sle taa_data->thread_nmi_sum += duration; 288150044Sle trace_seq_printf(taa_data->nmi_seq, " %24s %.*s %9.2f us\n", 289150044Sle "nmi", 290150044Sle 24, spaces, ns_to_usf(duration)); 291130391Sle 292130391Sle return 0; 293150044Sle} 294150044Sle 295130391Sle/* 296150044Sle * timerlat_aa_irq_handler - Handles IRQ noise 297150044Sle * 298150044Sle * It is used to collect information about interferences from IRQ. It is 299150044Sle * hooked to the osnoise:irq_noise event. 300150044Sle * 301150044Sle * It is a little bit more complex than the other because it measures: 302150044Sle * - The IRQs that can delay the timer IRQ before it happened. 303150044Sle * - The Timerlat IRQ handler 304150044Sle * - The IRQs that happened between the timerlat IRQ and the timerlat thread 305150044Sle * (IRQ interference). 
306130391Sle */ 307130391Slestatic int timerlat_aa_irq_handler(struct trace_seq *s, struct tep_record *record, 308130391Sle struct tep_event *event, void *context) 309130391Sle{ 310130391Sle struct timerlat_aa_context *taa_ctx = timerlat_aa_get_ctx(); 311130391Sle struct timerlat_aa_data *taa_data = timerlat_aa_get_data(taa_ctx, record->cpu); 312130391Sle unsigned long long expected_start; 313130391Sle unsigned long long duration; 314130391Sle unsigned long long vector; 315130391Sle unsigned long long start; 316130391Sle char *desc; 317130391Sle int val; 318130391Sle 319130391Sle tep_get_field_val(s, event, "duration", record, &duration, 1); 320130391Sle tep_get_field_val(s, event, "start", record, &start, 1); 321130391Sle tep_get_field_val(s, event, "vector", record, &vector, 1); 322130391Sle desc = tep_get_field_raw(s, event, "desc", record, &val, 1); 323130391Sle 324130391Sle /* 325130391Sle * Before the timerlat IRQ. 326130391Sle */ 327130391Sle if (taa_data->curr_state == TIMERLAT_WAITING_IRQ) { 328130391Sle taa_data->prev_irq_duration = duration; 329130391Sle taa_data->prev_irq_timstamp = start; 330152616Sle 331152616Sle trace_seq_reset(taa_data->prev_irqs_seq); 332152616Sle trace_seq_printf(taa_data->prev_irqs_seq, " %24s:%-3llu %.*s %9.2f us\n", 333152616Sle desc, vector, 334152616Sle 15, spaces, 335130391Sle ns_to_usf(duration)); 336152616Sle return 0; 337130391Sle } 338152616Sle 339130391Sle /* 340152616Sle * The timerlat IRQ: taa_data->timer_irq_start_time is zeroed at 341130391Sle * the timerlat irq handler. 
342152616Sle */ 343130391Sle if (!taa_data->timer_irq_start_time) { 344130391Sle expected_start = taa_data->tlat_irq_timstamp - taa_data->tlat_irq_latency; 345130391Sle 346130391Sle taa_data->timer_irq_start_time = start; 347130391Sle taa_data->timer_irq_duration = duration; 348130391Sle 349130391Sle /* 350130391Sle * We are dealing with two different clock sources: the 351152616Sle * external clock source that timerlat uses as a reference 352152616Sle * and the clock used by the tracer. There are also two 353157052Sle * moments: the time reading the clock and the timer in 354157052Sle * which the event is placed in the buffer (the trace 355152616Sle * event timestamp). If the processor is slow or there 356130391Sle * is some hardware noise, the difference between the 357130391Sle * timestamp and the external clock read can be longer 358130391Sle * than the IRQ handler delay, resulting in a negative 359130391Sle * time. If so, set IRQ start delay as 0. In the end, 360152616Sle * it is less relevant than the noise. 361130391Sle */ 362130391Sle if (expected_start < taa_data->timer_irq_start_time) 363130391Sle taa_data->timer_irq_start_delay = taa_data->timer_irq_start_time - expected_start; 364152616Sle else 365130391Sle taa_data->timer_irq_start_delay = 0; 366130391Sle 367130391Sle /* 368130391Sle * not exit from idle. 369130391Sle */ 370130391Sle if (taa_data->run_thread_pid) 371130391Sle return 0; 372138112Sle 373138112Sle if (expected_start > taa_data->prev_irq_timstamp + taa_data->prev_irq_duration) 374138112Sle taa_data->timer_exit_from_idle = taa_data->timer_irq_start_delay; 375138112Sle 376138112Sle return 0; 377138112Sle } 378138112Sle 379138112Sle /* 380138112Sle * IRQ interference. 
381138112Sle */ 382138112Sle taa_data->thread_irq_sum += duration; 383138112Sle trace_seq_printf(taa_data->irqs_seq, " %24s:%-3llu %.*s %9.2f us\n", 384138112Sle desc, vector, 385138112Sle 24, spaces, 386138112Sle ns_to_usf(duration)); 387138112Sle 388138112Sle return 0; 389138112Sle} 390138112Sle 391138112Slestatic char *softirq_name[] = { "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", 392138112Sle "IRQ_POLL", "TASKLET", "SCHED", "HRTIMER", "RCU" }; 393138112Sle 394138112Sle 395138112Sle/* 396138112Sle * timerlat_aa_softirq_handler - Handles Softirq noise 397138112Sle * 398138112Sle * It is used to collect information about interferences from Softirq. It is 399138112Sle * hooked to the osnoise:softirq_noise event. 400138112Sle * 401138112Sle * It is only printed in the non-rt kernel, as softirqs become thread on RT. 402138112Sle */ 403138112Slestatic int timerlat_aa_softirq_handler(struct trace_seq *s, struct tep_record *record, 404138112Sle struct tep_event *event, void *context) 405138112Sle{ 406138112Sle struct timerlat_aa_context *taa_ctx = timerlat_aa_get_ctx(); 407138112Sle struct timerlat_aa_data *taa_data = timerlat_aa_get_data(taa_ctx, record->cpu); 408138112Sle unsigned long long duration; 409138112Sle unsigned long long vector; 410138112Sle unsigned long long start; 411138112Sle 412138112Sle if (taa_data->curr_state == TIMERLAT_WAITING_IRQ) 413138112Sle return 0; 414138112Sle 415138112Sle tep_get_field_val(s, event, "duration", record, &duration, 1); 416138112Sle tep_get_field_val(s, event, "start", record, &start, 1); 417138112Sle tep_get_field_val(s, event, "vector", record, &vector, 1); 418138112Sle 419138112Sle taa_data->thread_softirq_sum += duration; 420138112Sle 421138112Sle trace_seq_printf(taa_data->softirqs_seq, " %24s:%-3llu %.*s %9.2f us\n", 422138112Sle softirq_name[vector], vector, 423138112Sle 24, spaces, 424138112Sle ns_to_usf(duration)); 425138112Sle return 0; 426138112Sle} 427130391Sle 428130391Sle/* 429130391Sle * 
timerlat_aa_softirq_handler - Handles thread noise 430130391Sle * 431130391Sle * It is used to collect information about interferences from threads. It is 432130391Sle * hooked to the osnoise:thread_noise event. 433130391Sle * 434130391Sle * Note: if you see thread noise, your timerlat thread was not the highest prio one. 435130391Sle */ 436130391Slestatic int timerlat_aa_thread_handler(struct trace_seq *s, struct tep_record *record, 437130391Sle struct tep_event *event, void *context) 438130391Sle{ 439130391Sle struct timerlat_aa_context *taa_ctx = timerlat_aa_get_ctx(); 440130391Sle struct timerlat_aa_data *taa_data = timerlat_aa_get_data(taa_ctx, record->cpu); 441130391Sle unsigned long long duration; 442130391Sle unsigned long long start; 443130391Sle unsigned long long pid; 444130391Sle const char *comm; 445130391Sle int val; 446130391Sle 447130391Sle if (taa_data->curr_state == TIMERLAT_WAITING_IRQ) 448130391Sle return 0; 449130391Sle 450130391Sle tep_get_field_val(s, event, "duration", record, &duration, 1); 451130391Sle tep_get_field_val(s, event, "start", record, &start, 1); 452130391Sle 453130391Sle tep_get_common_field_val(s, event, "common_pid", record, &pid, 1); 454130391Sle comm = tep_get_field_raw(s, event, "comm", record, &val, 1); 455130391Sle 456130391Sle if (pid == taa_data->run_thread_pid && !taa_data->thread_blocking_duration) { 457130391Sle taa_data->thread_blocking_duration = duration; 458130391Sle 459130391Sle if (comm) 460130391Sle strncpy(taa_data->run_thread_comm, comm, MAX_COMM); 461130391Sle else 462130391Sle sprintf(taa_data->run_thread_comm, "<...>"); 463130391Sle 464130391Sle } else { 465130391Sle taa_data->thread_thread_sum += duration; 466130391Sle 467130391Sle trace_seq_printf(taa_data->threads_seq, " %24s:%-12llu %.*s %9.2f us\n", 468130391Sle comm, pid, 469130391Sle 15, spaces, 470130391Sle ns_to_usf(duration)); 471130391Sle } 472130391Sle 473130391Sle return 0; 474130391Sle} 475130391Sle 476130391Sle/* 477130391Sle * 
timerlat_aa_stack_handler - Handles timerlat IRQ stack trace 478130391Sle * 479130391Sle * Saves and parse the stack trace generated by the timerlat IRQ. 480130391Sle */ 481130391Slestatic int timerlat_aa_stack_handler(struct trace_seq *s, struct tep_record *record, 482130391Sle struct tep_event *event, void *context) 483130391Sle{ 484130391Sle struct timerlat_aa_context *taa_ctx = timerlat_aa_get_ctx(); 485130391Sle struct timerlat_aa_data *taa_data = timerlat_aa_get_data(taa_ctx, record->cpu); 486130391Sle unsigned long *caller; 487130391Sle const char *function; 488130391Sle int val, i; 489130391Sle 490130391Sle trace_seq_reset(taa_data->stack_seq); 491152616Sle 492130391Sle trace_seq_printf(taa_data->stack_seq, " Blocking thread stack trace\n"); 493152616Sle caller = tep_get_field_raw(s, event, "caller", record, &val, 1); 494152616Sle if (caller) { 495152616Sle for (i = 0; ; i++) { 496152616Sle function = tep_find_function(taa_ctx->tool->trace.tep, caller[i]); 497152616Sle if (!function) 498152616Sle break; 499152616Sle trace_seq_printf(taa_data->stack_seq, " %.*s -> %s\n", 500152616Sle 14, spaces, function); 501152616Sle } 502152616Sle } 503152616Sle return 0; 504152616Sle} 505152616Sle 506152616Sle/* 507152616Sle * timerlat_aa_sched_switch_handler - Tracks the current thread running on the CPU 508152616Sle * 509152616Sle * Handles the sched:sched_switch event to trace the current thread running on the 510152616Sle * CPU. It is used to display the threads running on the other CPUs when the trace 511152616Sle * stops. 
512152616Sle */ 513152616Slestatic int timerlat_aa_sched_switch_handler(struct trace_seq *s, struct tep_record *record, 514152616Sle struct tep_event *event, void *context) 515152616Sle{ 516152616Sle struct timerlat_aa_context *taa_ctx = timerlat_aa_get_ctx(); 517152616Sle struct timerlat_aa_data *taa_data = timerlat_aa_get_data(taa_ctx, record->cpu); 518152616Sle const char *comm; 519152616Sle int val; 520152616Sle 521152616Sle tep_get_field_val(s, event, "next_pid", record, &taa_data->current_pid, 1); 522152616Sle comm = tep_get_field_raw(s, event, "next_comm", record, &val, 1); 523152616Sle 524152616Sle strncpy(taa_data->current_comm, comm, MAX_COMM); 525152616Sle 526152616Sle /* 527152616Sle * If this was a kworker, clean the last kworkers that ran. 528152616Sle */ 529152616Sle taa_data->kworker = 0; 530152616Sle taa_data->kworker_func = 0; 531152616Sle 532152616Sle return 0; 533152616Sle} 534152616Sle 535152616Sle/* 536152616Sle * timerlat_aa_kworker_start_handler - Tracks a kworker running on the CPU 537152616Sle * 538152616Sle * Handles workqueue:workqueue_execute_start event, keeping track of 539152631Sle * the job that a kworker could be doing in the CPU. 540152616Sle * 541152616Sle * We already catch problems of hardware related latencies caused by work queues 542152616Sle * running driver code that causes hardware stall. For example, with DRM drivers. 
 */
static int timerlat_aa_kworker_start_handler(struct trace_seq *s, struct tep_record *record,
					     struct tep_event *event, void *context)
{
	struct timerlat_aa_context *taa_ctx = timerlat_aa_get_ctx();
	struct timerlat_aa_data *taa_data = timerlat_aa_get_data(taa_ctx, record->cpu);

	/* raw addresses of the work struct and the function it runs */
	tep_get_field_val(s, event, "work", record, &taa_data->kworker, 1);
	tep_get_field_val(s, event, "function", record, &taa_data->kworker_func, 1);
	return 0;
}

/*
 * timerlat_thread_analysis - Prints the analysis of a CPU that hit a stop tracing
 *
 * This is the core of the analysis: it prints, for one CPU, each component
 * of the observed latency (IRQ delay, handler duration, blocking thread,
 * NMI/IRQ/softirq/thread interference) and its share of the total.
 */
static void timerlat_thread_analysis(struct timerlat_aa_data *taa_data, int cpu,
				     int irq_thresh, int thread_thresh)
{
	long long exp_irq_ts;
	int total;
	int irq;

	/*
	 * IRQ latency or Thread latency? The event with the highest seqnum
	 * is the one that triggered the stop.
	 */
	if (taa_data->tlat_irq_seqnum > taa_data->tlat_thread_seqnum) {
		irq = 1;
		total = taa_data->tlat_irq_latency;
	} else {
		irq = 0;
		total = taa_data->tlat_thread_latency;
	}

	/*
	 * Expected IRQ arrival time using the trace clock as the base.
	 *
	 * TODO: Add a list of previous IRQ, and then run the list backwards.
	 */
	exp_irq_ts = taa_data->timer_irq_start_time - taa_data->timer_irq_start_delay;
	if (exp_irq_ts < taa_data->prev_irq_timstamp + taa_data->prev_irq_duration) {
		if (taa_data->prev_irq_timstamp < taa_data->timer_irq_start_time)
			printf(" Previous IRQ interference: %.*s up to %9.2f us\n",
			       16, spaces,
			       ns_to_usf(taa_data->prev_irq_duration));
	}

	/*
	 * The delay that the IRQ suffered before starting.
	 */
	printf(" IRQ handler delay: %.*s %16s %9.2f us (%.2f %%)\n", 16, spaces,
	       (ns_to_usf(taa_data->timer_exit_from_idle) > 10) ? "(exit from idle)" : "",
	       ns_to_usf(taa_data->timer_irq_start_delay),
	       ns_to_per(total, taa_data->timer_irq_start_delay));

	/*
	 * Timerlat IRQ.
	 */
	printf(" IRQ latency: %.*s %9.2f us\n", 40, spaces,
	       ns_to_usf(taa_data->tlat_irq_latency));

	if (irq) {
		/*
		 * If the trace stopped due to IRQ, the other events will not happen
		 * because... the trace stopped :-).
		 *
		 * That is all folks, the stack trace was printed before the stop,
		 * so it will be displayed, it is the key.
		 */
		printf(" Blocking thread:\n");
		printf(" %.*s %24s:%-9llu\n", 6, spaces, taa_data->run_thread_comm,
		       taa_data->run_thread_pid);
	} else {
		/*
		 * The duration of the IRQ handler that handled the timerlat IRQ.
		 */
		printf(" Timerlat IRQ duration: %.*s %9.2f us (%.2f %%)\n",
		       30, spaces,
		       ns_to_usf(taa_data->timer_irq_duration),
		       ns_to_per(total, taa_data->timer_irq_duration));

		/*
		 * The amount of time that the current thread postponed the scheduler.
		 *
		 * Recalling that it is net from NMI/IRQ/Softirq interference, so there
		 * is no need to compute values here.
		 */
		printf(" Blocking thread: %.*s %9.2f us (%.2f %%)\n", 36, spaces,
		       ns_to_usf(taa_data->thread_blocking_duration),
		       ns_to_per(total, taa_data->thread_blocking_duration));

		printf(" %.*s %24s:%-9llu %.*s %9.2f us\n", 6, spaces,
		       taa_data->run_thread_comm, taa_data->run_thread_pid,
		       12, spaces, ns_to_usf(taa_data->thread_blocking_duration));
	}

	/*
	 * Print the stack trace!
	 */
	trace_seq_do_printf(taa_data->stack_seq);

	/*
	 * NMIs can happen during the IRQ, so they are always possible.
	 */
	if (taa_data->thread_nmi_sum)
		printf(" NMI interference %.*s %9.2f us (%.2f %%)\n", 36, spaces,
		       ns_to_usf(taa_data->thread_nmi_sum),
		       ns_to_per(total, taa_data->thread_nmi_sum));

	/*
	 * If it is an IRQ latency, the other factors can be skipped.
	 */
	if (irq)
		goto print_total;

	/*
	 * Prints the interference caused by IRQs to the thread latency.
	 */
	if (taa_data->thread_irq_sum) {
		printf(" IRQ interference %.*s %9.2f us (%.2f %%)\n", 36, spaces,
		       ns_to_usf(taa_data->thread_irq_sum),
		       ns_to_per(total, taa_data->thread_irq_sum));

		trace_seq_do_printf(taa_data->irqs_seq);
	}

	/*
	 * Prints the interference caused by Softirqs to the thread latency.
	 */
	if (taa_data->thread_softirq_sum) {
		printf(" Softirq interference %.*s %9.2f us (%.2f %%)\n", 32, spaces,
		       ns_to_usf(taa_data->thread_softirq_sum),
		       ns_to_per(total, taa_data->thread_softirq_sum));

		trace_seq_do_printf(taa_data->softirqs_seq);
	}

	/*
	 * Prints the interference caused by other threads to the thread latency.
	 *
	 * If this happens, your timerlat is not the highest prio. OK, migration
	 * thread can happen. But otherwise, you are not measuring the "scheduling
	 * latency" only, and here is the difference from scheduling latency and
	 * timer handling latency.
	 */
	if (taa_data->thread_thread_sum) {
		printf(" Thread interference %.*s %9.2f us (%.2f %%)\n", 33, spaces,
		       ns_to_usf(taa_data->thread_thread_sum),
		       ns_to_per(total, taa_data->thread_thread_sum));

		trace_seq_do_printf(taa_data->threads_seq);
	}

	/*
	 * Done.
	 */
print_total:
	printf("------------------------------------------------------------------------\n");
	printf(" %s latency: %.*s %9.2f us (100%%)\n", irq ? " IRQ" : "Thread",
	       37, spaces, ns_to_usf(total));
}

/*
 * timerlat_auto_analysis_collect_trace - Replay the trace buffer through
 * the registered event handlers. Returns 1 on success, 0 on error.
 */
static int timerlat_auto_analysis_collect_trace(struct timerlat_aa_context *taa_ctx)
{
	struct trace_instance *trace = &taa_ctx->tool->trace;
	int retval;

	retval = tracefs_iterate_raw_events(trace->tep,
					    trace->inst,
					    NULL,
					    0,
					    collect_registered_events,
					    trace);
	if (retval < 0) {
		err_msg("Error iterating on events\n");
		return 0;
	}

	return 1;
}

/**
 * timerlat_auto_analysis - Analyze the collected data
 */
void timerlat_auto_analysis(int irq_thresh, int thread_thresh)
{
	struct timerlat_aa_context *taa_ctx = timerlat_aa_get_ctx();
	unsigned long long max_exit_from_idle = 0;
	struct timerlat_aa_data *taa_data;
	/*
	 * NOTE(review): only read when max_exit_from_idle != 0, which
	 * guarantees it was assigned in the loop below — but compilers
	 * cannot see that and may warn about it being uninitialized.
	 */
	int max_exit_from_idle_cpu;
	struct tep_handle *tep;
	int cpu;

	timerlat_auto_analysis_collect_trace(taa_ctx);

	/* bring stop tracing to the ns scale */
	irq_thresh = irq_thresh * 1000;
	thread_thresh = thread_thresh * 1000;

	for (cpu = 0; cpu < taa_ctx->nr_cpus; cpu++) {
		taa_data = timerlat_aa_get_data(taa_ctx, cpu);

		if (irq_thresh && taa_data->tlat_irq_latency >= irq_thresh) {
			printf("## CPU %d hit stop tracing, analyzing it ##\n", cpu);
			timerlat_thread_analysis(taa_data, cpu, irq_thresh, thread_thresh);
		} else if (thread_thresh && (taa_data->tlat_thread_latency) >= thread_thresh) {
			printf("## CPU %d hit stop tracing, analyzing it ##\n", cpu);
			timerlat_thread_analysis(taa_data, cpu, irq_thresh, thread_thresh);
		}

		if (taa_data->max_exit_idle_latency > max_exit_from_idle) {
			max_exit_from_idle = taa_data->max_exit_idle_latency;
			max_exit_from_idle_cpu = cpu;
		}

	}

	if (max_exit_from_idle) {
		printf("\n");
		printf("Max timerlat IRQ latency from idle: %.2f us in cpu %d\n",
		       ns_to_usf(max_exit_from_idle), max_exit_from_idle_cpu);
	}
	if (!taa_ctx->dump_tasks)
		return;

	printf("\n");
	printf("Printing CPU tasks:\n");
	for (cpu = 0; cpu < taa_ctx->nr_cpus; cpu++) {
		taa_data = timerlat_aa_get_data(taa_ctx, cpu);
		tep = taa_ctx->tool->trace.tep;

		printf(" [%.3d] %24s:%llu", cpu, taa_data->current_comm, taa_data->current_pid);

		if (taa_data->kworker_func)
			printf(" kworker:%s:%s",
			       tep_find_function(tep, taa_data->kworker) ? : "<...>",
			       tep_find_function(tep, taa_data->kworker_func));
		printf("\n");
	}

}

/*
 * timerlat_aa_destroy_seqs - Destroy seq files used to store parsed data
 */
static void timerlat_aa_destroy_seqs(struct timerlat_aa_context *taa_ctx)
{
	struct timerlat_aa_data *taa_data;
	int i;

	if (!taa_ctx->taa_data)
		return;

	for (i = 0; i < taa_ctx->nr_cpus; i++) {
		taa_data = timerlat_aa_get_data(taa_ctx, i);

		if (taa_data->prev_irqs_seq) {
			trace_seq_destroy(taa_data->prev_irqs_seq);
			free(taa_data->prev_irqs_seq);
		}

		if (taa_data->nmi_seq) {
			trace_seq_destroy(taa_data->nmi_seq);
			free(taa_data->nmi_seq);
		}

		if (taa_data->irqs_seq) {
			trace_seq_destroy(taa_data->irqs_seq);
			free(taa_data->irqs_seq);
		}

		if (taa_data->softirqs_seq) {
			trace_seq_destroy(taa_data->softirqs_seq);
			free(taa_data->softirqs_seq);
		}

		if (taa_data->threads_seq) {
			trace_seq_destroy(taa_data->threads_seq);
			free(taa_data->threads_seq);
		}

		if (taa_data->stack_seq) {
			trace_seq_destroy(taa_data->stack_seq);
			free(taa_data->stack_seq);
		}
	}
}

/*
 * timerlat_aa_init_seqs - Init seq files used to store parsed information
 *
 * Instead of keeping data structures to store raw data, use seq files to
 * store parsed data.
 *
 * Allocates and initialize seq files.
839130391Sle * 840130391Sle * Returns 0 on success, -1 otherwise. 841130391Sle */ 842130391Slestatic int timerlat_aa_init_seqs(struct timerlat_aa_context *taa_ctx) 843130391Sle{ 844130391Sle struct timerlat_aa_data *taa_data; 845130391Sle int i; 846130391Sle 847130391Sle for (i = 0; i < taa_ctx->nr_cpus; i++) { 848130391Sle 849130391Sle taa_data = timerlat_aa_get_data(taa_ctx, i); 850130391Sle 851130391Sle taa_data->prev_irqs_seq = calloc(1, sizeof(*taa_data->prev_irqs_seq)); 852130391Sle if (!taa_data->prev_irqs_seq) 853130391Sle goto out_err; 854130391Sle 855130391Sle trace_seq_init(taa_data->prev_irqs_seq); 856130391Sle 857130391Sle taa_data->nmi_seq = calloc(1, sizeof(*taa_data->nmi_seq)); 858130391Sle if (!taa_data->nmi_seq) 859130391Sle goto out_err; 860130391Sle 861130391Sle trace_seq_init(taa_data->nmi_seq); 862130391Sle 863130391Sle taa_data->irqs_seq = calloc(1, sizeof(*taa_data->irqs_seq)); 864150044Sle if (!taa_data->irqs_seq) 865130391Sle goto out_err; 866130391Sle 867130391Sle trace_seq_init(taa_data->irqs_seq); 868130391Sle 869130391Sle taa_data->softirqs_seq = calloc(1, sizeof(*taa_data->softirqs_seq)); 870130391Sle if (!taa_data->softirqs_seq) 871130391Sle goto out_err; 872130391Sle 873130391Sle trace_seq_init(taa_data->softirqs_seq); 874130391Sle 875130391Sle taa_data->threads_seq = calloc(1, sizeof(*taa_data->threads_seq)); 876130391Sle if (!taa_data->threads_seq) 877130391Sle goto out_err; 878130391Sle 879130391Sle trace_seq_init(taa_data->threads_seq); 880152616Sle 881152616Sle taa_data->stack_seq = calloc(1, sizeof(*taa_data->stack_seq)); 882152616Sle if (!taa_data->stack_seq) 883152616Sle goto out_err; 884130391Sle 885130391Sle trace_seq_init(taa_data->stack_seq); 886152616Sle } 887152616Sle 888157052Sle return 0; 889157052Sle 890130391Sleout_err: 891130391Sle timerlat_aa_destroy_seqs(taa_ctx); 892130391Sle return -1; 893130391Sle} 894138112Sle 895138112Sle/* 896130391Sle * timerlat_aa_unregister_events - Unregister events used in the 
auto-analysis
 */
static void timerlat_aa_unregister_events(struct osnoise_tool *tool, int dump_tasks)
{

	/* Drop the main timerlat tracepoint handler first. */
	tep_unregister_event_handler(tool->trace.tep, -1, "ftrace", "timerlat",
				     timerlat_aa_handler, tool);

	/* Stop recording all osnoise events before removing their handlers. */
	tracefs_event_disable(tool->trace.inst, "osnoise", NULL);

	tep_unregister_event_handler(tool->trace.tep, -1, "osnoise", "nmi_noise",
				     timerlat_aa_nmi_handler, tool);

	tep_unregister_event_handler(tool->trace.tep, -1, "osnoise", "irq_noise",
				     timerlat_aa_irq_handler, tool);

	tep_unregister_event_handler(tool->trace.tep, -1, "osnoise", "softirq_noise",
				     timerlat_aa_softirq_handler, tool);

	tep_unregister_event_handler(tool->trace.tep, -1, "osnoise", "thread_noise",
				     timerlat_aa_thread_handler, tool);

	tep_unregister_event_handler(tool->trace.tep, -1, "ftrace", "kernel_stack",
				     timerlat_aa_stack_handler, tool);
	/* The task-dump events below are only registered with dump_tasks set. */
	if (!dump_tasks)
		return;

	tracefs_event_disable(tool->trace.inst, "sched", "sched_switch");
	tep_unregister_event_handler(tool->trace.tep, -1, "sched", "sched_switch",
				     timerlat_aa_sched_switch_handler, tool);

	tracefs_event_disable(tool->trace.inst, "workqueue", "workqueue_execute_start");
	tep_unregister_event_handler(tool->trace.tep, -1, "workqueue", "workqueue_execute_start",
				     timerlat_aa_kworker_start_handler, tool);
}

/*
 * timerlat_aa_register_events - Register events used in the auto-analysis
 *
 * Returns 0 on success, -1 otherwise.
936130391Sle */ 937130391Slestatic int timerlat_aa_register_events(struct osnoise_tool *tool, int dump_tasks) 938130391Sle{ 939130391Sle int retval; 940130391Sle 941130391Sle tep_register_event_handler(tool->trace.tep, -1, "ftrace", "timerlat", 942130391Sle timerlat_aa_handler, tool); 943130391Sle 944130391Sle 945130391Sle /* 946130391Sle * register auto-analysis handlers. 947130391Sle */ 948 retval = tracefs_event_enable(tool->trace.inst, "osnoise", NULL); 949 if (retval < 0 && !errno) { 950 err_msg("Could not find osnoise events\n"); 951 goto out_err; 952 } 953 954 tep_register_event_handler(tool->trace.tep, -1, "osnoise", "nmi_noise", 955 timerlat_aa_nmi_handler, tool); 956 957 tep_register_event_handler(tool->trace.tep, -1, "osnoise", "irq_noise", 958 timerlat_aa_irq_handler, tool); 959 960 tep_register_event_handler(tool->trace.tep, -1, "osnoise", "softirq_noise", 961 timerlat_aa_softirq_handler, tool); 962 963 tep_register_event_handler(tool->trace.tep, -1, "osnoise", "thread_noise", 964 timerlat_aa_thread_handler, tool); 965 966 tep_register_event_handler(tool->trace.tep, -1, "ftrace", "kernel_stack", 967 timerlat_aa_stack_handler, tool); 968 969 if (!dump_tasks) 970 return 0; 971 972 /* 973 * Dump task events. 
974 */ 975 retval = tracefs_event_enable(tool->trace.inst, "sched", "sched_switch"); 976 if (retval < 0 && !errno) { 977 err_msg("Could not find sched_switch\n"); 978 goto out_err; 979 } 980 981 tep_register_event_handler(tool->trace.tep, -1, "sched", "sched_switch", 982 timerlat_aa_sched_switch_handler, tool); 983 984 retval = tracefs_event_enable(tool->trace.inst, "workqueue", "workqueue_execute_start"); 985 if (retval < 0 && !errno) { 986 err_msg("Could not find workqueue_execute_start\n"); 987 goto out_err; 988 } 989 990 tep_register_event_handler(tool->trace.tep, -1, "workqueue", "workqueue_execute_start", 991 timerlat_aa_kworker_start_handler, tool); 992 993 return 0; 994 995out_err: 996 timerlat_aa_unregister_events(tool, dump_tasks); 997 return -1; 998} 999 1000/** 1001 * timerlat_aa_destroy - Destroy timerlat auto-analysis 1002 */ 1003void timerlat_aa_destroy(void) 1004{ 1005 struct timerlat_aa_context *taa_ctx = timerlat_aa_get_ctx(); 1006 1007 if (!taa_ctx) 1008 return; 1009 1010 if (!taa_ctx->taa_data) 1011 goto out_ctx; 1012 1013 timerlat_aa_unregister_events(taa_ctx->tool, taa_ctx->dump_tasks); 1014 timerlat_aa_destroy_seqs(taa_ctx); 1015 free(taa_ctx->taa_data); 1016out_ctx: 1017 free(taa_ctx); 1018} 1019 1020/** 1021 * timerlat_aa_init - Initialize timerlat auto-analysis 1022 * 1023 * Returns 0 on success, -1 otherwise. 
1024 */ 1025int timerlat_aa_init(struct osnoise_tool *tool, int dump_tasks) 1026{ 1027 int nr_cpus = sysconf(_SC_NPROCESSORS_CONF); 1028 struct timerlat_aa_context *taa_ctx; 1029 int retval; 1030 1031 taa_ctx = calloc(1, sizeof(*taa_ctx)); 1032 if (!taa_ctx) 1033 return -1; 1034 1035 __timerlat_aa_ctx = taa_ctx; 1036 1037 taa_ctx->nr_cpus = nr_cpus; 1038 taa_ctx->tool = tool; 1039 taa_ctx->dump_tasks = dump_tasks; 1040 1041 taa_ctx->taa_data = calloc(nr_cpus, sizeof(*taa_ctx->taa_data)); 1042 if (!taa_ctx->taa_data) 1043 goto out_err; 1044 1045 retval = timerlat_aa_init_seqs(taa_ctx); 1046 if (retval) 1047 goto out_err; 1048 1049 retval = timerlat_aa_register_events(tool, dump_tasks); 1050 if (retval) 1051 goto out_err; 1052 1053 return 0; 1054 1055out_err: 1056 timerlat_aa_destroy(); 1057 return -1; 1058} 1059