// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2023 Red Hat Inc, Daniel Bristot de Oliveira <bristot@kernel.org>
 */

#include <stdlib.h>
#include <errno.h>
#include "utils.h"
#include "osnoise.h"
#include "timerlat.h"
#include <unistd.h>

/*
 * Per-cpu state machine: each CPU alternates between waiting for the
 * timerlat IRQ event and waiting for the timerlat thread event.
 */
enum timelat_state {
	TIMERLAT_INIT = 0,
	TIMERLAT_WAITING_IRQ,
	TIMERLAT_WAITING_THREAD,
};

/*
 * Used to fill spaces in the output, via "%.*s" with a computed width.
 * NOTE(review): the literal was collapsed by whitespace mangling in the
 * extracted source; it must be at least as long as the largest width
 * printed from it (40 below), so it is restored to a long run of spaces.
 */
static const char *spaces = "                                                         ";

/* Maximum length (including NUL) stored for a task comm */
#define MAX_COMM	24

/*
 * Per-cpu data statistics and data.
 */
struct timerlat_aa_data {
	/* Current CPU state (enum timelat_state) */
	int curr_state;

	/* timerlat IRQ latency */
	unsigned long long tlat_irq_seqnum;
	unsigned long long tlat_irq_latency;
	unsigned long long tlat_irq_timstamp;

	/* timerlat Thread latency */
	unsigned long long tlat_thread_seqnum;
	unsigned long long tlat_thread_latency;
	unsigned long long tlat_thread_timstamp;

	/*
	 * Information about the thread running when the IRQ
	 * arrived.
	 *
	 * This can be blocking or interference, depending on the
	 * priority of the thread. Assuming timerlat is the highest
	 * prio, it is blocking. If timerlat has a lower prio, it is
	 * interference.
	 * note: "unsigned long long" because they are fetch using tep_get_field_val();
	 */
	unsigned long long run_thread_pid;
	char run_thread_comm[MAX_COMM];
	unsigned long long thread_blocking_duration;
	unsigned long long max_exit_idle_latency;

	/* Information about the timerlat timer irq */
	unsigned long long timer_irq_start_time;
	unsigned long long timer_irq_start_delay;
	unsigned long long timer_irq_duration;
	unsigned long long timer_exit_from_idle;

	/*
	 * Information about the last IRQ before the timerlat irq
	 * arrived.
	 *
	 * If now - timestamp is <= latency, it might have influenced
	 * in the timerlat irq latency. Otherwise, ignore it.
	 */
	unsigned long long prev_irq_duration;
	unsigned long long prev_irq_timstamp;

	/*
	 * Interference sum, accumulated between the timerlat IRQ and
	 * the timerlat thread events of one activation.
	 */
	unsigned long long thread_nmi_sum;
	unsigned long long thread_irq_sum;
	unsigned long long thread_softirq_sum;
	unsigned long long thread_thread_sum;

	/*
	 * Interference task information: parsed, human-readable lines
	 * are accumulated in trace_seq buffers instead of raw records.
	 */
	struct trace_seq *prev_irqs_seq;
	struct trace_seq *nmi_seq;
	struct trace_seq *irqs_seq;
	struct trace_seq *softirqs_seq;
	struct trace_seq *threads_seq;
	struct trace_seq *stack_seq;

	/*
	 * Current thread on this CPU, kept up to date by the
	 * sched_switch handler.
	 */
	char current_comm[MAX_COMM];
	unsigned long long current_pid;

	/*
	 * Is the system running a kworker? Work struct pointer and the
	 * function it executes (both raw addresses from the trace).
	 */
	unsigned long long kworker;
	unsigned long long kworker_func;
};

/*
 * The analysis context and system wide view
 */
struct timerlat_aa_context {
	int nr_cpus;
	int dump_tasks;

	/* per CPU data, array of nr_cpus entries */
	struct timerlat_aa_data *taa_data;

	/*
	 * required to translate function names and register
	 * events.
	 */
	struct osnoise_tool *tool;
};

/*
 * The data is stored as a local variable, but accessed via a helper function.
 *
 * It could be stored inside the trace context. But every access would
 * require container_of() + a series of pointers. Do we need it? Not sure.
 *
 * For now keep it simple. If needed, store it in the tool, add the *context
 * as a parameter in timerlat_aa_get_ctx() and do the magic there.
 */
static struct timerlat_aa_context *__timerlat_aa_ctx;

/*
 * timerlat_aa_get_ctx - Return the system-wide auto-analysis context
 */
static struct timerlat_aa_context *timerlat_aa_get_ctx(void)
{
	return __timerlat_aa_ctx;
}

/*
 * timerlat_aa_get_data - Get the per-cpu data from the timerlat context
 */
static struct timerlat_aa_data
*timerlat_aa_get_data(struct timerlat_aa_context *taa_ctx, int cpu)
{
	return &taa_ctx->taa_data[cpu];
}

/*
 * timerlat_aa_irq_latency - Handles timerlat IRQ event
 *
 * Resets the per-activation state and records the IRQ latency values.
 * Also tracks the maximum "exit from idle" latency when the IRQ hit the
 * idle task. Always returns 0.
 */
static int timerlat_aa_irq_latency(struct timerlat_aa_data *taa_data,
				   struct trace_seq *s, struct tep_record *record,
				   struct tep_event *event)
{
	/*
	 * For interference, we start now looking for things that can delay
	 * the thread.
	 */
	taa_data->curr_state = TIMERLAT_WAITING_THREAD;
	taa_data->tlat_irq_timstamp = record->ts;

	/*
	 * Zero values.
	 */
	taa_data->thread_nmi_sum = 0;
	taa_data->thread_irq_sum = 0;
	taa_data->thread_softirq_sum = 0;
	taa_data->thread_thread_sum = 0;
	taa_data->thread_blocking_duration = 0;
	taa_data->timer_irq_start_time = 0;
	taa_data->timer_irq_duration = 0;
	taa_data->timer_exit_from_idle = 0;

	/*
	 * Zero interference tasks.
	 */
	trace_seq_reset(taa_data->nmi_seq);
	trace_seq_reset(taa_data->irqs_seq);
	trace_seq_reset(taa_data->softirqs_seq);
	trace_seq_reset(taa_data->threads_seq);

	/* IRQ latency values */
	tep_get_field_val(s, event, "timer_latency", record, &taa_data->tlat_irq_latency, 1);
	tep_get_field_val(s, event, "seqnum", record, &taa_data->tlat_irq_seqnum, 1);

	/* The thread that can cause blocking */
	tep_get_common_field_val(s, event, "common_pid", record, &taa_data->run_thread_pid, 1);

	/*
	 * Get exit from idle case.
	 *
	 * If it is not idle thread (pid 0):
	 */
	if (taa_data->run_thread_pid)
		return 0;

	/*
	 * if the latency is shorter than the known exit from idle:
	 */
	if (taa_data->tlat_irq_latency < taa_data->max_exit_idle_latency)
		return 0;

	/*
	 * To be safe, ignore the cases in which an IRQ/NMI could have
	 * interfered with the timerlat IRQ.
	 */
	if (taa_data->tlat_irq_timstamp - taa_data->tlat_irq_latency
	    < taa_data->prev_irq_timstamp + taa_data->prev_irq_duration)
		return 0;

	taa_data->max_exit_idle_latency = taa_data->tlat_irq_latency;

	return 0;
}

/*
 * timerlat_aa_thread_latency - Handles timerlat thread event
 *
 * Records the thread latency values and flips the state machine back
 * to waiting for the next timerlat IRQ. Always returns 0.
 */
static int timerlat_aa_thread_latency(struct timerlat_aa_data *taa_data,
				      struct trace_seq *s, struct tep_record *record,
				      struct tep_event *event)
{
	/*
	 * For interference, we start now looking for things that can delay
	 * the IRQ of the next cycle.
	 */
	taa_data->curr_state = TIMERLAT_WAITING_IRQ;
	taa_data->tlat_thread_timstamp = record->ts;

	/* Thread latency values */
	tep_get_field_val(s, event, "timer_latency", record, &taa_data->tlat_thread_latency, 1);
	tep_get_field_val(s, event, "seqnum", record, &taa_data->tlat_thread_seqnum, 1);

	return 0;
}

/*
 * timerlat_aa_handler - Handle timerlat events
 *
 * This function is called to handle timerlat events recording statistics.
 * The "context" field of the event selects the IRQ (0) or thread handler.
 *
 * Returns 0 on success, -1 otherwise.
 */
static int timerlat_aa_handler(struct trace_seq *s, struct tep_record *record,
			       struct tep_event *event, void *context)
{
	struct timerlat_aa_context *taa_ctx = timerlat_aa_get_ctx();
	struct timerlat_aa_data *taa_data = timerlat_aa_get_data(taa_ctx, record->cpu);
	unsigned long long thread;

	if (!taa_data)
		return -1;

	tep_get_field_val(s, event, "context", record, &thread, 1);
	if (!thread)
		return timerlat_aa_irq_latency(taa_data, s, record, event);
	else
		return timerlat_aa_thread_latency(taa_data, s, record, event);
}

/*
 * timerlat_aa_nmi_handler - Handles NMI noise
 *
 * It is used to collect information about interferences from NMI. It is
 * hooked to the osnoise:nmi_noise event.
263150044Sle */ 264150044Slestatic int timerlat_aa_nmi_handler(struct trace_seq *s, struct tep_record *record, 265150044Sle struct tep_event *event, void *context) 266150044Sle{ 267150044Sle struct timerlat_aa_context *taa_ctx = timerlat_aa_get_ctx(); 268150044Sle struct timerlat_aa_data *taa_data = timerlat_aa_get_data(taa_ctx, record->cpu); 269150044Sle unsigned long long duration; 270130391Sle unsigned long long start; 271150044Sle 272150044Sle tep_get_field_val(s, event, "duration", record, &duration, 1); 273150044Sle tep_get_field_val(s, event, "start", record, &start, 1); 274150044Sle 275150044Sle if (taa_data->curr_state == TIMERLAT_WAITING_IRQ) { 276150044Sle taa_data->prev_irq_duration = duration; 277150044Sle taa_data->prev_irq_timstamp = start; 278150044Sle 279150044Sle trace_seq_reset(taa_data->prev_irqs_seq); 280150044Sle trace_seq_printf(taa_data->prev_irqs_seq, " %24s %.*s %9.2f us\n", 281150044Sle "nmi", 282150044Sle 24, spaces, 283150044Sle ns_to_usf(duration)); 284150044Sle return 0; 285150044Sle } 286150044Sle 287150044Sle taa_data->thread_nmi_sum += duration; 288150044Sle trace_seq_printf(taa_data->nmi_seq, " %24s %.*s %9.2f us\n", 289150044Sle "nmi", 290150044Sle 24, spaces, ns_to_usf(duration)); 291130391Sle 292130391Sle return 0; 293150044Sle} 294150044Sle 295130391Sle/* 296150044Sle * timerlat_aa_irq_handler - Handles IRQ noise 297150044Sle * 298150044Sle * It is used to collect information about interferences from IRQ. It is 299150044Sle * hooked to the osnoise:irq_noise event. 300150044Sle * 301150044Sle * It is a little bit more complex than the other because it measures: 302150044Sle * - The IRQs that can delay the timer IRQ before it happened. 303150044Sle * - The Timerlat IRQ handler 304150044Sle * - The IRQs that happened between the timerlat IRQ and the timerlat thread 305150044Sle * (IRQ interference). 
306130391Sle */ 307130391Slestatic int timerlat_aa_irq_handler(struct trace_seq *s, struct tep_record *record, 308130391Sle struct tep_event *event, void *context) 309130391Sle{ 310130391Sle struct timerlat_aa_context *taa_ctx = timerlat_aa_get_ctx(); 311130391Sle struct timerlat_aa_data *taa_data = timerlat_aa_get_data(taa_ctx, record->cpu); 312130391Sle unsigned long long expected_start; 313130391Sle unsigned long long duration; 314130391Sle unsigned long long vector; 315130391Sle unsigned long long start; 316130391Sle char *desc; 317130391Sle int val; 318130391Sle 319130391Sle tep_get_field_val(s, event, "duration", record, &duration, 1); 320130391Sle tep_get_field_val(s, event, "start", record, &start, 1); 321130391Sle tep_get_field_val(s, event, "vector", record, &vector, 1); 322130391Sle desc = tep_get_field_raw(s, event, "desc", record, &val, 1); 323130391Sle 324130391Sle /* 325130391Sle * Before the timerlat IRQ. 326130391Sle */ 327130391Sle if (taa_data->curr_state == TIMERLAT_WAITING_IRQ) { 328130391Sle taa_data->prev_irq_duration = duration; 329130391Sle taa_data->prev_irq_timstamp = start; 330152616Sle 331152616Sle trace_seq_reset(taa_data->prev_irqs_seq); 332152616Sle trace_seq_printf(taa_data->prev_irqs_seq, " %24s:%-3llu %.*s %9.2f us\n", 333152616Sle desc, vector, 334152616Sle 15, spaces, 335130391Sle ns_to_usf(duration)); 336152616Sle return 0; 337130391Sle } 338152616Sle 339130391Sle /* 340152616Sle * The timerlat IRQ: taa_data->timer_irq_start_time is zeroed at 341130391Sle * the timerlat irq handler. 
342152616Sle */ 343130391Sle if (!taa_data->timer_irq_start_time) { 344130391Sle expected_start = taa_data->tlat_irq_timstamp - taa_data->tlat_irq_latency; 345130391Sle 346130391Sle taa_data->timer_irq_start_time = start; 347130391Sle taa_data->timer_irq_duration = duration; 348130391Sle 349130391Sle /* 350130391Sle * We are dealing with two different clock sources: the 351152616Sle * external clock source that timerlat uses as a reference 352152616Sle * and the clock used by the tracer. There are also two 353157052Sle * moments: the time reading the clock and the timer in 354157052Sle * which the event is placed in the buffer (the trace 355152616Sle * event timestamp). If the processor is slow or there 356130391Sle * is some hardware noise, the difference between the 357130391Sle * timestamp and the external clock read can be longer 358130391Sle * than the IRQ handler delay, resulting in a negative 359130391Sle * time. If so, set IRQ start delay as 0. In the end, 360152616Sle * it is less relevant than the noise. 361130391Sle */ 362130391Sle if (expected_start < taa_data->timer_irq_start_time) 363130391Sle taa_data->timer_irq_start_delay = taa_data->timer_irq_start_time - expected_start; 364152616Sle else 365130391Sle taa_data->timer_irq_start_delay = 0; 366130391Sle 367130391Sle /* 368130391Sle * not exit from idle. 369130391Sle */ 370130391Sle if (taa_data->run_thread_pid) 371130391Sle return 0; 372138112Sle 373138112Sle if (expected_start > taa_data->prev_irq_timstamp + taa_data->prev_irq_duration) 374138112Sle taa_data->timer_exit_from_idle = taa_data->timer_irq_start_delay; 375138112Sle 376138112Sle return 0; 377138112Sle } 378138112Sle 379138112Sle /* 380138112Sle * IRQ interference. 
381138112Sle */ 382138112Sle taa_data->thread_irq_sum += duration; 383138112Sle trace_seq_printf(taa_data->irqs_seq, " %24s:%-3llu %.*s %9.2f us\n", 384138112Sle desc, vector, 385138112Sle 24, spaces, 386138112Sle ns_to_usf(duration)); 387138112Sle 388138112Sle return 0; 389138112Sle} 390138112Sle 391138112Slestatic char *softirq_name[] = { "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", 392138112Sle "IRQ_POLL", "TASKLET", "SCHED", "HRTIMER", "RCU" }; 393138112Sle 394138112Sle 395138112Sle/* 396138112Sle * timerlat_aa_softirq_handler - Handles Softirq noise 397138112Sle * 398138112Sle * It is used to collect information about interferences from Softirq. It is 399138112Sle * hooked to the osnoise:softirq_noise event. 400138112Sle * 401138112Sle * It is only printed in the non-rt kernel, as softirqs become thread on RT. 402138112Sle */ 403138112Slestatic int timerlat_aa_softirq_handler(struct trace_seq *s, struct tep_record *record, 404138112Sle struct tep_event *event, void *context) 405138112Sle{ 406138112Sle struct timerlat_aa_context *taa_ctx = timerlat_aa_get_ctx(); 407138112Sle struct timerlat_aa_data *taa_data = timerlat_aa_get_data(taa_ctx, record->cpu); 408138112Sle unsigned long long duration; 409138112Sle unsigned long long vector; 410138112Sle unsigned long long start; 411138112Sle 412138112Sle if (taa_data->curr_state == TIMERLAT_WAITING_IRQ) 413138112Sle return 0; 414138112Sle 415138112Sle tep_get_field_val(s, event, "duration", record, &duration, 1); 416138112Sle tep_get_field_val(s, event, "start", record, &start, 1); 417138112Sle tep_get_field_val(s, event, "vector", record, &vector, 1); 418138112Sle 419138112Sle taa_data->thread_softirq_sum += duration; 420138112Sle 421138112Sle trace_seq_printf(taa_data->softirqs_seq, " %24s:%-3llu %.*s %9.2f us\n", 422138112Sle softirq_name[vector], vector, 423138112Sle 24, spaces, 424138112Sle ns_to_usf(duration)); 425138112Sle return 0; 426138112Sle} 427130391Sle 428130391Sle/* 429130391Sle * 
timerlat_aa_softirq_handler - Handles thread noise 430130391Sle * 431130391Sle * It is used to collect information about interferences from threads. It is 432130391Sle * hooked to the osnoise:thread_noise event. 433130391Sle * 434130391Sle * Note: if you see thread noise, your timerlat thread was not the highest prio one. 435130391Sle */ 436130391Slestatic int timerlat_aa_thread_handler(struct trace_seq *s, struct tep_record *record, 437130391Sle struct tep_event *event, void *context) 438130391Sle{ 439130391Sle struct timerlat_aa_context *taa_ctx = timerlat_aa_get_ctx(); 440130391Sle struct timerlat_aa_data *taa_data = timerlat_aa_get_data(taa_ctx, record->cpu); 441130391Sle unsigned long long duration; 442130391Sle unsigned long long start; 443130391Sle unsigned long long pid; 444130391Sle const char *comm; 445130391Sle int val; 446130391Sle 447130391Sle if (taa_data->curr_state == TIMERLAT_WAITING_IRQ) 448130391Sle return 0; 449130391Sle 450130391Sle tep_get_field_val(s, event, "duration", record, &duration, 1); 451130391Sle tep_get_field_val(s, event, "start", record, &start, 1); 452130391Sle 453130391Sle tep_get_common_field_val(s, event, "common_pid", record, &pid, 1); 454130391Sle comm = tep_get_field_raw(s, event, "comm", record, &val, 1); 455130391Sle 456130391Sle if (pid == taa_data->run_thread_pid && !taa_data->thread_blocking_duration) { 457130391Sle taa_data->thread_blocking_duration = duration; 458130391Sle 459130391Sle if (comm) 460130391Sle strncpy(taa_data->run_thread_comm, comm, MAX_COMM); 461130391Sle else 462130391Sle sprintf(taa_data->run_thread_comm, "<...>"); 463130391Sle 464130391Sle } else { 465130391Sle taa_data->thread_thread_sum += duration; 466130391Sle 467130391Sle trace_seq_printf(taa_data->threads_seq, " %24s:%-12llu %.*s %9.2f us\n", 468130391Sle comm, pid, 469130391Sle 15, spaces, 470130391Sle ns_to_usf(duration)); 471130391Sle } 472130391Sle 473130391Sle return 0; 474130391Sle} 475130391Sle 476130391Sle/* 477130391Sle * 
timerlat_aa_stack_handler - Handles timerlat IRQ stack trace 478130391Sle * 479130391Sle * Saves and parse the stack trace generated by the timerlat IRQ. 480130391Sle */ 481130391Slestatic int timerlat_aa_stack_handler(struct trace_seq *s, struct tep_record *record, 482130391Sle struct tep_event *event, void *context) 483130391Sle{ 484130391Sle struct timerlat_aa_context *taa_ctx = timerlat_aa_get_ctx(); 485130391Sle struct timerlat_aa_data *taa_data = timerlat_aa_get_data(taa_ctx, record->cpu); 486130391Sle unsigned long *caller; 487130391Sle const char *function; 488130391Sle int val, i; 489130391Sle 490130391Sle trace_seq_reset(taa_data->stack_seq); 491152616Sle 492130391Sle trace_seq_printf(taa_data->stack_seq, " Blocking thread stack trace\n"); 493152616Sle caller = tep_get_field_raw(s, event, "caller", record, &val, 1); 494152616Sle if (caller) { 495152616Sle for (i = 0; ; i++) { 496152616Sle function = tep_find_function(taa_ctx->tool->trace.tep, caller[i]); 497152616Sle if (!function) 498152616Sle break; 499152616Sle trace_seq_printf(taa_data->stack_seq, " %.*s -> %s\n", 500152616Sle 14, spaces, function); 501152616Sle } 502152616Sle } 503152616Sle return 0; 504152616Sle} 505152616Sle 506152616Sle/* 507152616Sle * timerlat_aa_sched_switch_handler - Tracks the current thread running on the CPU 508152616Sle * 509152616Sle * Handles the sched:sched_switch event to trace the current thread running on the 510152616Sle * CPU. It is used to display the threads running on the other CPUs when the trace 511152616Sle * stops. 
512152616Sle */ 513152616Slestatic int timerlat_aa_sched_switch_handler(struct trace_seq *s, struct tep_record *record, 514152616Sle struct tep_event *event, void *context) 515152616Sle{ 516152616Sle struct timerlat_aa_context *taa_ctx = timerlat_aa_get_ctx(); 517152616Sle struct timerlat_aa_data *taa_data = timerlat_aa_get_data(taa_ctx, record->cpu); 518152616Sle const char *comm; 519152616Sle int val; 520152616Sle 521152616Sle tep_get_field_val(s, event, "next_pid", record, &taa_data->current_pid, 1); 522152616Sle comm = tep_get_field_raw(s, event, "next_comm", record, &val, 1); 523152616Sle 524152616Sle strncpy(taa_data->current_comm, comm, MAX_COMM); 525152616Sle 526152616Sle /* 527152616Sle * If this was a kworker, clean the last kworkers that ran. 528152616Sle */ 529152616Sle taa_data->kworker = 0; 530152616Sle taa_data->kworker_func = 0; 531152616Sle 532152616Sle return 0; 533152616Sle} 534152616Sle 535152616Sle/* 536152616Sle * timerlat_aa_kworker_start_handler - Tracks a kworker running on the CPU 537152616Sle * 538152616Sle * Handles workqueue:workqueue_execute_start event, keeping track of 539152631Sle * the job that a kworker could be doing in the CPU. 540152616Sle * 541152616Sle * We already catch problems of hardware related latencies caused by work queues 542152616Sle * running driver code that causes hardware stall. For example, with DRM drivers. 
 */
static int timerlat_aa_kworker_start_handler(struct trace_seq *s, struct tep_record *record,
					     struct tep_event *event, void *context)
{
	struct timerlat_aa_context *taa_ctx = timerlat_aa_get_ctx();
	struct timerlat_aa_data *taa_data = timerlat_aa_get_data(taa_ctx, record->cpu);

	/* raw addresses of the work struct and the function it runs */
	tep_get_field_val(s, event, "work", record, &taa_data->kworker, 1);
	tep_get_field_val(s, event, "function", record, &taa_data->kworker_func, 1);
	return 0;
}

/*
 * timerlat_thread_analysis - Prints the analysis of a CPU that hit a stop tracing
 *
 * This is the core of the analysis: it prints, for one CPU, each component
 * of the observed latency (IRQ delay, handler duration, blocking thread,
 * NMI/IRQ/softirq/thread interference) and its share of the total.
 */
static void timerlat_thread_analysis(struct timerlat_aa_data *taa_data, int cpu,
				     int irq_thresh, int thread_thresh)
{
	long long exp_irq_ts;
	int total;
	int irq;

	/*
	 * IRQ latency or Thread latency? The event with the highest seqnum
	 * is the one that triggered the stop.
	 */
	if (taa_data->tlat_irq_seqnum > taa_data->tlat_thread_seqnum) {
		irq = 1;
		total = taa_data->tlat_irq_latency;
	} else {
		irq = 0;
		total = taa_data->tlat_thread_latency;
	}

	/*
	 * Expected IRQ arrival time using the trace clock as the base.
	 *
	 * TODO: Add a list of previous IRQ, and then run the list backwards.
	 */
	exp_irq_ts = taa_data->timer_irq_start_time - taa_data->timer_irq_start_delay;
	if (exp_irq_ts < taa_data->prev_irq_timstamp + taa_data->prev_irq_duration) {
		if (taa_data->prev_irq_timstamp < taa_data->timer_irq_start_time)
			printf(" Previous IRQ interference: %.*s up to %9.2f us\n",
			       16, spaces,
			       ns_to_usf(taa_data->prev_irq_duration));
	}

	/*
	 * The delay that the IRQ suffered before starting.
	 */
	printf(" IRQ handler delay: %.*s %16s %9.2f us (%.2f %%)\n", 16, spaces,
	       (ns_to_usf(taa_data->timer_exit_from_idle) > 10) ? "(exit from idle)" : "",
	       ns_to_usf(taa_data->timer_irq_start_delay),
	       ns_to_per(total, taa_data->timer_irq_start_delay));

	/*
	 * Timerlat IRQ.
	 */
	printf(" IRQ latency: %.*s %9.2f us\n", 40, spaces,
	       ns_to_usf(taa_data->tlat_irq_latency));

	if (irq) {
		/*
		 * If the trace stopped due to IRQ, the other events will not happen
		 * because... the trace stopped :-).
		 *
		 * That is all folks, the stack trace was printed before the stop,
		 * so it will be displayed, it is the key.
		 */
		printf(" Blocking thread:\n");
		printf(" %.*s %24s:%-9llu\n", 6, spaces, taa_data->run_thread_comm,
		       taa_data->run_thread_pid);
	} else {
		/*
		 * The duration of the IRQ handler that handled the timerlat IRQ.
		 */
		printf(" Timerlat IRQ duration: %.*s %9.2f us (%.2f %%)\n",
		       30, spaces,
		       ns_to_usf(taa_data->timer_irq_duration),
		       ns_to_per(total, taa_data->timer_irq_duration));

		/*
		 * The amount of time that the current thread postponed the scheduler.
		 *
		 * Recalling that it is net from NMI/IRQ/Softirq interference, so there
		 * is no need to compute values here.
		 */
		printf(" Blocking thread: %.*s %9.2f us (%.2f %%)\n", 36, spaces,
		       ns_to_usf(taa_data->thread_blocking_duration),
		       ns_to_per(total, taa_data->thread_blocking_duration));

		printf(" %.*s %24s:%-9llu %.*s %9.2f us\n", 6, spaces,
		       taa_data->run_thread_comm, taa_data->run_thread_pid,
		       12, spaces, ns_to_usf(taa_data->thread_blocking_duration));
	}

	/*
	 * Print the stack trace!
	 */
	trace_seq_do_printf(taa_data->stack_seq);

	/*
	 * NMIs can happen during the IRQ, so they are always possible.
	 */
	if (taa_data->thread_nmi_sum)
		printf(" NMI interference %.*s %9.2f us (%.2f %%)\n", 36, spaces,
		       ns_to_usf(taa_data->thread_nmi_sum),
		       ns_to_per(total, taa_data->thread_nmi_sum));

	/*
	 * If it is an IRQ latency, the other factors can be skipped.
	 */
	if (irq)
		goto print_total;

	/*
	 * Prints the interference caused by IRQs to the thread latency.
	 */
	if (taa_data->thread_irq_sum) {
		printf(" IRQ interference %.*s %9.2f us (%.2f %%)\n", 36, spaces,
		       ns_to_usf(taa_data->thread_irq_sum),
		       ns_to_per(total, taa_data->thread_irq_sum));

		trace_seq_do_printf(taa_data->irqs_seq);
	}

	/*
	 * Prints the interference caused by Softirqs to the thread latency.
	 */
	if (taa_data->thread_softirq_sum) {
		printf(" Softirq interference %.*s %9.2f us (%.2f %%)\n", 32, spaces,
		       ns_to_usf(taa_data->thread_softirq_sum),
		       ns_to_per(total, taa_data->thread_softirq_sum));

		trace_seq_do_printf(taa_data->softirqs_seq);
	}

	/*
	 * Prints the interference caused by other threads to the thread latency.
	 *
	 * If this happens, your timerlat is not the highest prio. OK, migration
	 * thread can happen. But otherwise, you are not measuring the "scheduling
	 * latency" only, and here is the difference from scheduling latency and
	 * timer handling latency.
	 */
	if (taa_data->thread_thread_sum) {
		printf(" Thread interference %.*s %9.2f us (%.2f %%)\n", 33, spaces,
		       ns_to_usf(taa_data->thread_thread_sum),
		       ns_to_per(total, taa_data->thread_thread_sum));

		trace_seq_do_printf(taa_data->threads_seq);
	}

	/*
	 * Done.
	 */
print_total:
	printf("------------------------------------------------------------------------\n");
	printf(" %s latency: %.*s %9.2f us (100%%)\n", irq ? " IRQ" : "Thread",
	       37, spaces, ns_to_usf(total));
}

/*
 * timerlat_auto_analysis_collect_trace - Replay the trace buffer through
 * the registered event handlers. Returns 1 on success, 0 on error.
 */
static int timerlat_auto_analysis_collect_trace(struct timerlat_aa_context *taa_ctx)
{
	struct trace_instance *trace = &taa_ctx->tool->trace;
	int retval;

	retval = tracefs_iterate_raw_events(trace->tep,
					    trace->inst,
					    NULL,
					    0,
					    collect_registered_events,
					    trace);
	if (retval < 0) {
		err_msg("Error iterating on events\n");
		return 0;
	}

	return 1;
}

/**
 * timerlat_auto_analysis - Analyze the collected data
 */
void timerlat_auto_analysis(int irq_thresh, int thread_thresh)
{
	struct timerlat_aa_context *taa_ctx = timerlat_aa_get_ctx();
	unsigned long long max_exit_from_idle = 0;
	struct timerlat_aa_data *taa_data;
	/*
	 * NOTE(review): only read when max_exit_from_idle != 0, which
	 * guarantees it was assigned in the loop below — but compilers
	 * cannot see that and may warn about it being uninitialized.
	 */
	int max_exit_from_idle_cpu;
	struct tep_handle *tep;
	int cpu;

	timerlat_auto_analysis_collect_trace(taa_ctx);

	/* bring stop tracing to the ns scale */
	irq_thresh = irq_thresh * 1000;
	thread_thresh = thread_thresh * 1000;

	for (cpu = 0; cpu < taa_ctx->nr_cpus; cpu++) {
		taa_data = timerlat_aa_get_data(taa_ctx, cpu);

		if (irq_thresh && taa_data->tlat_irq_latency >= irq_thresh) {
			printf("## CPU %d hit stop tracing, analyzing it ##\n", cpu);
			timerlat_thread_analysis(taa_data, cpu, irq_thresh, thread_thresh);
		} else if (thread_thresh && (taa_data->tlat_thread_latency) >= thread_thresh) {
			printf("## CPU %d hit stop tracing, analyzing it ##\n", cpu);
			timerlat_thread_analysis(taa_data, cpu, irq_thresh, thread_thresh);
		}

		if (taa_data->max_exit_idle_latency > max_exit_from_idle) {
			max_exit_from_idle = taa_data->max_exit_idle_latency;
			max_exit_from_idle_cpu = cpu;
		}

	}

	if (max_exit_from_idle) {
		printf("\n");
		printf("Max timerlat IRQ latency from idle: %.2f us in cpu %d\n",
		       ns_to_usf(max_exit_from_idle), max_exit_from_idle_cpu);
	}
	if (!taa_ctx->dump_tasks)
		return;

	printf("\n");
	printf("Printing CPU tasks:\n");
	for (cpu = 0; cpu < taa_ctx->nr_cpus; cpu++) {
		taa_data = timerlat_aa_get_data(taa_ctx, cpu);
		tep = taa_ctx->tool->trace.tep;

		printf(" [%.3d] %24s:%llu", cpu, taa_data->current_comm, taa_data->current_pid);

		if (taa_data->kworker_func)
			printf(" kworker:%s:%s",
			       tep_find_function(tep, taa_data->kworker) ? : "<...>",
			       tep_find_function(tep, taa_data->kworker_func));
		printf("\n");
	}

}

/*
 * timerlat_aa_destroy_seqs - Destroy seq files used to store parsed data
 */
static void timerlat_aa_destroy_seqs(struct timerlat_aa_context *taa_ctx)
{
	struct timerlat_aa_data *taa_data;
	int i;

	if (!taa_ctx->taa_data)
		return;

	for (i = 0; i < taa_ctx->nr_cpus; i++) {
		taa_data = timerlat_aa_get_data(taa_ctx, i);

		if (taa_data->prev_irqs_seq) {
			trace_seq_destroy(taa_data->prev_irqs_seq);
			free(taa_data->prev_irqs_seq);
		}

		if (taa_data->nmi_seq) {
			trace_seq_destroy(taa_data->nmi_seq);
			free(taa_data->nmi_seq);
		}

		if (taa_data->irqs_seq) {
			trace_seq_destroy(taa_data->irqs_seq);
			free(taa_data->irqs_seq);
		}

		if (taa_data->softirqs_seq) {
			trace_seq_destroy(taa_data->softirqs_seq);
			free(taa_data->softirqs_seq);
		}

		if (taa_data->threads_seq) {
			trace_seq_destroy(taa_data->threads_seq);
			free(taa_data->threads_seq);
		}

		if (taa_data->stack_seq) {
			trace_seq_destroy(taa_data->stack_seq);
			free(taa_data->stack_seq);
		}
	}
}

/*
 * timerlat_aa_init_seqs - Init seq files used to store parsed information
 *
 * Instead of keeping data structures to store raw data, use seq files to
 * store parsed data.
 *
 * Allocates and initialize seq files.
839130391Sle * 840130391Sle * Returns 0 on success, -1 otherwise. 841130391Sle */ 842130391Slestatic int timerlat_aa_init_seqs(struct timerlat_aa_context *taa_ctx) 843130391Sle{ 844130391Sle struct timerlat_aa_data *taa_data; 845130391Sle int i; 846130391Sle 847130391Sle for (i = 0; i < taa_ctx->nr_cpus; i++) { 848130391Sle 849130391Sle taa_data = timerlat_aa_get_data(taa_ctx, i); 850130391Sle 851130391Sle taa_data->prev_irqs_seq = calloc(1, sizeof(*taa_data->prev_irqs_seq)); 852130391Sle if (!taa_data->prev_irqs_seq) 853130391Sle goto out_err; 854130391Sle 855130391Sle trace_seq_init(taa_data->prev_irqs_seq); 856130391Sle 857130391Sle taa_data->nmi_seq = calloc(1, sizeof(*taa_data->nmi_seq)); 858130391Sle if (!taa_data->nmi_seq) 859130391Sle goto out_err; 860130391Sle 861130391Sle trace_seq_init(taa_data->nmi_seq); 862130391Sle 863130391Sle taa_data->irqs_seq = calloc(1, sizeof(*taa_data->irqs_seq)); 864150044Sle if (!taa_data->irqs_seq) 865130391Sle goto out_err; 866130391Sle 867130391Sle trace_seq_init(taa_data->irqs_seq); 868130391Sle 869130391Sle taa_data->softirqs_seq = calloc(1, sizeof(*taa_data->softirqs_seq)); 870130391Sle if (!taa_data->softirqs_seq) 871130391Sle goto out_err; 872130391Sle 873130391Sle trace_seq_init(taa_data->softirqs_seq); 874130391Sle 875130391Sle taa_data->threads_seq = calloc(1, sizeof(*taa_data->threads_seq)); 876130391Sle if (!taa_data->threads_seq) 877130391Sle goto out_err; 878130391Sle 879130391Sle trace_seq_init(taa_data->threads_seq); 880152616Sle 881152616Sle taa_data->stack_seq = calloc(1, sizeof(*taa_data->stack_seq)); 882152616Sle if (!taa_data->stack_seq) 883152616Sle goto out_err; 884130391Sle 885130391Sle trace_seq_init(taa_data->stack_seq); 886152616Sle } 887152616Sle 888157052Sle return 0; 889157052Sle 890130391Sleout_err: 891130391Sle timerlat_aa_destroy_seqs(taa_ctx); 892130391Sle return -1; 893130391Sle} 894138112Sle 895138112Sle/* 896130391Sle * timerlat_aa_unregister_events - Unregister events used in the 
auto-analysis
 */
static void timerlat_aa_unregister_events(struct osnoise_tool *tool, int dump_tasks)
{

	/* Drop the main timerlat tracepoint handler first. */
	tep_unregister_event_handler(tool->trace.tep, -1, "ftrace", "timerlat",
				     timerlat_aa_handler, tool);

	/* Stop recording all osnoise events before removing their handlers. */
	tracefs_event_disable(tool->trace.inst, "osnoise", NULL);

	tep_unregister_event_handler(tool->trace.tep, -1, "osnoise", "nmi_noise",
				     timerlat_aa_nmi_handler, tool);

	tep_unregister_event_handler(tool->trace.tep, -1, "osnoise", "irq_noise",
				     timerlat_aa_irq_handler, tool);

	tep_unregister_event_handler(tool->trace.tep, -1, "osnoise", "softirq_noise",
				     timerlat_aa_softirq_handler, tool);

	tep_unregister_event_handler(tool->trace.tep, -1, "osnoise", "thread_noise",
				     timerlat_aa_thread_handler, tool);

	tep_unregister_event_handler(tool->trace.tep, -1, "ftrace", "kernel_stack",
				     timerlat_aa_stack_handler, tool);
	/* The task-dump events below are only registered with dump_tasks set. */
	if (!dump_tasks)
		return;

	tracefs_event_disable(tool->trace.inst, "sched", "sched_switch");
	tep_unregister_event_handler(tool->trace.tep, -1, "sched", "sched_switch",
				     timerlat_aa_sched_switch_handler, tool);

	tracefs_event_disable(tool->trace.inst, "workqueue", "workqueue_execute_start");
	tep_unregister_event_handler(tool->trace.tep, -1, "workqueue", "workqueue_execute_start",
				     timerlat_aa_kworker_start_handler, tool);
}

/*
 * timerlat_aa_register_events - Register events used in the auto-analysis
 *
 * Returns 0 on success, -1 otherwise.
936130391Sle */ 937130391Slestatic int timerlat_aa_register_events(struct osnoise_tool *tool, int dump_tasks) 938130391Sle{ 939130391Sle int retval; 940130391Sle 941130391Sle tep_register_event_handler(tool->trace.tep, -1, "ftrace", "timerlat", 942130391Sle timerlat_aa_handler, tool); 943130391Sle 944130391Sle 945130391Sle /* 946130391Sle * register auto-analysis handlers. 947130391Sle */ 948 retval = tracefs_event_enable(tool->trace.inst, "osnoise", NULL); 949 if (retval < 0 && !errno) { 950 err_msg("Could not find osnoise events\n"); 951 goto out_err; 952 } 953 954 tep_register_event_handler(tool->trace.tep, -1, "osnoise", "nmi_noise", 955 timerlat_aa_nmi_handler, tool); 956 957 tep_register_event_handler(tool->trace.tep, -1, "osnoise", "irq_noise", 958 timerlat_aa_irq_handler, tool); 959 960 tep_register_event_handler(tool->trace.tep, -1, "osnoise", "softirq_noise", 961 timerlat_aa_softirq_handler, tool); 962 963 tep_register_event_handler(tool->trace.tep, -1, "osnoise", "thread_noise", 964 timerlat_aa_thread_handler, tool); 965 966 tep_register_event_handler(tool->trace.tep, -1, "ftrace", "kernel_stack", 967 timerlat_aa_stack_handler, tool); 968 969 if (!dump_tasks) 970 return 0; 971 972 /* 973 * Dump task events. 
974 */ 975 retval = tracefs_event_enable(tool->trace.inst, "sched", "sched_switch"); 976 if (retval < 0 && !errno) { 977 err_msg("Could not find sched_switch\n"); 978 goto out_err; 979 } 980 981 tep_register_event_handler(tool->trace.tep, -1, "sched", "sched_switch", 982 timerlat_aa_sched_switch_handler, tool); 983 984 retval = tracefs_event_enable(tool->trace.inst, "workqueue", "workqueue_execute_start"); 985 if (retval < 0 && !errno) { 986 err_msg("Could not find workqueue_execute_start\n"); 987 goto out_err; 988 } 989 990 tep_register_event_handler(tool->trace.tep, -1, "workqueue", "workqueue_execute_start", 991 timerlat_aa_kworker_start_handler, tool); 992 993 return 0; 994 995out_err: 996 timerlat_aa_unregister_events(tool, dump_tasks); 997 return -1; 998} 999 1000/** 1001 * timerlat_aa_destroy - Destroy timerlat auto-analysis 1002 */ 1003void timerlat_aa_destroy(void) 1004{ 1005 struct timerlat_aa_context *taa_ctx = timerlat_aa_get_ctx(); 1006 1007 if (!taa_ctx) 1008 return; 1009 1010 if (!taa_ctx->taa_data) 1011 goto out_ctx; 1012 1013 timerlat_aa_unregister_events(taa_ctx->tool, taa_ctx->dump_tasks); 1014 timerlat_aa_destroy_seqs(taa_ctx); 1015 free(taa_ctx->taa_data); 1016out_ctx: 1017 free(taa_ctx); 1018} 1019 1020/** 1021 * timerlat_aa_init - Initialize timerlat auto-analysis 1022 * 1023 * Returns 0 on success, -1 otherwise. 
1024 */ 1025int timerlat_aa_init(struct osnoise_tool *tool, int dump_tasks) 1026{ 1027 int nr_cpus = sysconf(_SC_NPROCESSORS_CONF); 1028 struct timerlat_aa_context *taa_ctx; 1029 int retval; 1030 1031 taa_ctx = calloc(1, sizeof(*taa_ctx)); 1032 if (!taa_ctx) 1033 return -1; 1034 1035 __timerlat_aa_ctx = taa_ctx; 1036 1037 taa_ctx->nr_cpus = nr_cpus; 1038 taa_ctx->tool = tool; 1039 taa_ctx->dump_tasks = dump_tasks; 1040 1041 taa_ctx->taa_data = calloc(nr_cpus, sizeof(*taa_ctx->taa_data)); 1042 if (!taa_ctx->taa_data) 1043 goto out_err; 1044 1045 retval = timerlat_aa_init_seqs(taa_ctx); 1046 if (retval) 1047 goto out_err; 1048 1049 retval = timerlat_aa_register_events(tool, dump_tasks); 1050 if (retval) 1051 goto out_err; 1052 1053 return 0; 1054 1055out_err: 1056 timerlat_aa_destroy(); 1057 return -1; 1058} 1059