1#!/usr/bin/ksh
2#
3# iotop - display top disk I/O events by process.
4#         Written using DTrace (Solaris 10 3/05).
5#
6# This is measuring disk events that have made it past system caches.
7#
8# $Id: iotop 8 2007-08-06 05:55:26Z brendan $
9#
10# USAGE:	iotop [-C] [-D|-o|-P] [-j|-Z] [-d device] [-f filename] 
11#		      [-m mount_point] [-t top] [interval [count]]
12#
13#		iotop   	# default output, 5 second intervals
14#
15#		-C		# don't clear the screen
16#		-D		# print delta times, elapsed, us
17#		-j		# print project ID
18#		-o		# print disk delta times, us
19#		-P		# print %I/O (disk delta times)
20#		-Z		# print zone ID
21#		-d device	# instance name to snoop (eg, dad0)
22#		-f filename	# full pathname of file to snoop
23#		-m mount_point	# this FS only (will skip raw events)
24#		-t top		# print top number only
25#	eg,
26#		iotop 1  	# 1 second samples
27#		iotop -C	# don't clear the screen
28#		iotop -P	# print %I/O (time based)
29#		iotop -j	# print project IDs
30#		iotop -Z 	# print zone IDs
31#		iotop -t 20 	# print top 20 lines only
32#		iotop -C 5 12	# print 12 x 5 second samples
33# 	
34# FIELDS:
35#		UID		user ID
36#		PID		process ID
37#		PPID		parent process ID
38#		PROJ		project ID
39#		ZONE		zone ID
40#		CMD		process command name
41#		DEVICE  	device name
42#		MAJ     	device major number
43#		MIN     	device minor number
44#		D		direction, Read or Write
45#		BYTES		total size of operations, bytes
46#		ELAPSED		total elapsed from request to completion, us
47#		DISKTIME	total time for disk to complete request, us
48#		%I/O		percent disk I/O, based on time (DISKTIME)
49#		load		1 min load average
50#		disk_r		total disk read Kbytes for sample
51#		disk_w		total disk write Kbytes for sample
52# 
53# NOTE:
54# * There are two different delta times reported. -D prints the
55#   elapsed time from the disk request (strategy) to the disk completion
56#   (iodone); -o prints the time for the disk to complete that event 
#   since its last event (time between iodones), or, the time to the
58#   strategy if the disk had been idle. 
59# * The %I/O value can exceed 100%. It represents how busy a process is
60#   making the disks, in terms of a single disk. A value of 200% could 
61#   mean 2 disks are busy at 100%, or 4 disks at 50%...
62#
63# SEE ALSO: iosnoop
64#	    BigAdmin: DTrace, http://www.sun.com/bigadmin/content/dtrace
65#	    Solaris Dynamic Tracing Guide, http://docs.sun.com
66#	    DTrace Tools, http://www.brendangregg.com/dtrace.html
67#
68# INSPIRATION:  top(1) by William LeFebvre
69#
70# COPYRIGHT: Copyright (c) 2005, 2006 Brendan Gregg.
71#
72# CDDL HEADER START
73#
74#  The contents of this file are subject to the terms of the
75#  Common Development and Distribution License, Version 1.0 only
76#  (the "License").  You may not use this file except in compliance
77#  with the License.
78#
79#  You can obtain a copy of the license at Docs/cddl1.txt
80#  or http://www.opensolaris.org/os/licensing.
81#  See the License for the specific language governing permissions
82#  and limitations under the License.
83#
84# CDDL HEADER END
85#
86# KNOWN BUGS: 
87# - This can print errors while running on servers with Veritas volumes.
88#
89# Author: Brendan Gregg  [Sydney, Australia]
90#
91# 15-Jul-2005	Brendan Gregg	Created this.
92# 20-Apr-2006	   "      "	Last update.
93#
94
95
96##############################
97# --- Process Arguments ---
98#
99
### default variables
opt_device=0; opt_file=0; opt_mount=0; opt_clear=1; opt_proj=0; opt_zone=0
opt_percent=0; opt_def=1; opt_bytes=1; filter=0; device=.; filename=.; mount=.
opt_top=0; opt_elapsed=0; opt_dtime=0; interval=5; count=-1; top=0

### helper functions

# is_number - succeed only when $1 is a non-empty string of decimal digits.
is_number()
{
	case $1 in
	''|*[!0-9]*)	return 1 ;;
	esac
	return 0
}

# die - print an error message to stderr and exit with status 1.
die()
{
	echo "ERROR: $*" >&2
	exit 1
}

### process options
while getopts CDd:f:hjm:oPt:Z name
do
	case $name in
	C)	opt_clear=0 ;;
	D)	opt_elapsed=1; opt_bytes=0 ;;
	d)	opt_device=1; device=$OPTARG ;;
	f)	opt_file=1; filename=$OPTARG ;;
	j)	opt_proj=1; opt_def=0 ;;
	m)	opt_mount=1; mount=$OPTARG ;;
	o)	opt_dtime=1; opt_bytes=0 ;;
	P)	opt_percent=1; opt_dtime=1; opt_bytes=0 ;;
	t)	opt_top=1; top=$OPTARG ;;
	Z)	opt_zone=1; opt_def=0 ;;
	h|?)	cat <<-END >&2
		USAGE: iotop [-C] [-D|-o|-P] [-j|-Z] [-d device] [-f filename]
		             [-m mount_point] [-t top] [interval [count]]
 
		                -C      # don't clear the screen
		                -D      # print delta times, elapsed, us
		                -j      # print project ID
		                -o      # print disk delta times, us
		                -P      # print %I/O (disk delta times)
		                -Z      # print zone ID
		                -d device       # instance name to snoop 
		                -f filename     # snoop this file only
		                -m mount_point  # this FS only 
		                -t top  	# print top number only
		   eg,
		        iotop         # default output, 5 second samples
		        iotop 1       # 1 second samples
		        iotop -P      # print %I/O (time based)
		        iotop -m /    # snoop events on filesystem / only
		        iotop -t 20   # print top 20 lines only
		        iotop -C 5 12 # print 12 x 5 second samples
		END
		exit 1
	esac
done

shift $(( $OPTIND - 1 ))

### option logic

# The -t, interval and count values are pasted into the D program as integer
# constants, so anything that is not a plain number is rejected here with a
# clear message instead of surfacing later as a D compile error.
if (( opt_top )); then
	is_number "$top" || die "-t top must be a whole number: $top"
fi

# interval: a value of 0 (or no value) keeps the default and is not consumed
if [[ -n "$1" ]]; then
	is_number "$1" || die "interval must be a whole number: $1"
fi
if [[ "$1" > 0 ]]; then
        interval=$1; shift
fi

# count: same rules as interval; the default of -1 never reaches zero
if [[ -n "$1" ]]; then
	is_number "$1" || die "count must be a whole number: $1"
fi
if [[ "$1" > 0 ]]; then
        count=$1; shift
fi

# -Z wins over -j, and the disk time options (-o, -P) win over -D
if (( opt_proj && opt_zone )); then
        opt_proj=0
fi
if (( opt_elapsed && opt_dtime )); then
        opt_elapsed=0
fi

# any of -d, -f or -m switches the D program into filtering mode
if (( opt_device || opt_mount || opt_file )); then
	filter=1
fi

# capture the terminal clear sequence once so the D program can print it
if (( opt_clear )); then
        clearstr=`clear`
else
        clearstr=.
fi
168
169
170
171#################################
172# --- Main Program, DTrace ---
173#
/usr/sbin/dtrace -n '
 /*
  * Command line arguments
  *
  * Every shell value is expanded inside double quotes, between the end of
  * one single-quoted piece of the D program and the start of the next, so a
  * value containing whitespace (for example a mount point with a space in
  * it) cannot split the program text into several dtrace arguments.
  */
 inline int OPT_def 	= '"$opt_def"';
 inline int OPT_proj 	= '"$opt_proj"';
 inline int OPT_zone 	= '"$opt_zone"';
 inline int OPT_clear 	= '"$opt_clear"';
 inline int OPT_bytes 	= '"$opt_bytes"';
 inline int OPT_elapsed = '"$opt_elapsed"';
 inline int OPT_dtime 	= '"$opt_dtime"';
 inline int OPT_percent	= '"$opt_percent"';
 inline int OPT_device 	= '"$opt_device"';
 inline int OPT_mount 	= '"$opt_mount"';
 inline int OPT_file 	= '"$opt_file"';
 inline int OPT_top 	= '"$opt_top"';
 inline int INTERVAL 	= '"$interval"';
 inline int COUNTER 	= '"$count"';
 inline int FILTER 	= '"$filter"';
 inline int TOP 	= '"$top"';
 inline string DEVICE 	= "'"$device"'";
 inline string FILENAME = "'"$filename"'";
 inline string MOUNT 	= "'"$mount"'";
 inline string CLEAR 	= "'"$clearstr"'";
 
 /* suppress the default dtrace output; only explicit printf/printa is shown */
 #pragma D option quiet

 /* boost the following if you get "dynamic variable drops" */
 #pragma D option dynvarsize=8m
203
204 /*
205  * Print header
206  */
207 dtrace:::BEGIN 
208 {
209	last_event[""] = 0;
210
211        /* starting values */
212        counts = COUNTER;
213        secs = INTERVAL;
214        disk_r = 0;
215        disk_w = 0;
216
217        printf("Tracing... Please wait.\n");
218 }
219
220 /*
221  * Check event is being traced
222  */
223 io:genunix::start,
224 io:genunix::done 
225 { 
226	/* default is to trace unless filtering, */
227	this->ok = FILTER ? 0 : 1;
228
229	/* check each filter, */
230	(OPT_device == 1 && DEVICE == args[1]->dev_statname)? this->ok = 1 : 1;
231	(OPT_file == 1 && FILENAME == args[2]->fi_pathname) ? this->ok = 1 : 1;
232	(OPT_mount == 1 && MOUNT == args[2]->fi_mount)  ? this->ok = 1 : 1;
233 }
234
235 /*
236  * Reset last_event for disk idle -> start
237  * this prevents idle time being counted as disk time.
238  */
239 io:genunix::start
240 /! pending[args[1]->dev_statname]/
241 {
242	/* save last disk event */
243	last_event[args[1]->dev_statname] = timestamp;
244 }
245
246 /*
247  * Store entry details
248  */
249 io:genunix::start
250 /this->ok/
251 {
252	/* these are used as a unique disk event key, */
253 	this->dev = args[0]->b_edev;
254 	this->blk = args[0]->b_blkno;
255
256	/* save disk event details, */
257 	start_uid[this->dev, this->blk] = uid;
258 	start_pid[this->dev, this->blk] = pid;
259 	start_ppid[this->dev, this->blk] = ppid;
260 	start_comm[this->dev, this->blk] = execname;
261 	start_time[this->dev, this->blk] = timestamp;
262 	start_proj[this->dev, this->blk] = curpsinfo->pr_projid;
263 	start_zone[this->dev, this->blk] = curpsinfo->pr_zoneid;
264 	start_rw[this->dev, this->blk] = args[0]->b_flags & B_READ ? "R" : "W";
265	disk_r += args[0]->b_flags & B_READ ? args[0]->b_bcount : 0;
266	disk_w += args[0]->b_flags & B_READ ? 0 : args[0]->b_bcount;
267
268	/* increase disk event pending count */
269	pending[args[1]->dev_statname]++;
270 }
271
272 /*
273  * Process and Print completion
274  */
275 io:genunix::done
276 /this->ok/
277 {
278	/* decrease disk event pending count */
279	pending[args[1]->dev_statname]--;
280
281	/*
282	 * Process details
283	 */
284
285 	/* fetch entry values */
286 	this->dev = args[0]->b_edev;
287 	this->blk = args[0]->b_blkno;
288 	this->suid = start_uid[this->dev, this->blk];
289 	this->spid = start_pid[this->dev, this->blk];
290 	this->sppid = start_ppid[this->dev, this->blk];
291 	this->sproj = start_proj[this->dev, this->blk];
292 	this->szone = start_zone[this->dev, this->blk];
293 	self->scomm = start_comm[this->dev, this->blk];
294 	this->stime = start_time[this->dev, this->blk];
295	this->etime = timestamp; /* endtime */
296	this->elapsed = this->etime - this->stime;
297 	self->rw = start_rw[this->dev, this->blk];
298	this->dtime = last_event[args[1]->dev_statname] == 0 ? 0 :
299	    timestamp - last_event[args[1]->dev_statname];
300
301 	/* memory cleanup */
302 	start_uid[this->dev, this->blk]  = 0;
303 	start_pid[this->dev, this->blk]  = 0;
304 	start_ppid[this->dev, this->blk] = 0;
305 	start_time[this->dev, this->blk] = 0;
306 	start_comm[this->dev, this->blk] = 0;
307 	start_zone[this->dev, this->blk] = 0;
308 	start_proj[this->dev, this->blk] = 0;
309 	start_rw[this->dev, this->blk]   = 0;
310
311	/*
312	 * Choose statistic to track
313	 */
314	OPT_bytes   ? this->value = args[0]->b_bcount    : 1;
315	OPT_elapsed ? this->value = this->elapsed / 1000 : 1;
316	OPT_dtime   ? this->value = this->dtime / 1000   : 1;
317	
318	/*
319	 * Save details
320	 */
321	OPT_def ? @out[this->suid, this->spid, this->sppid, self->scomm,
322	    args[1]->dev_statname, args[1]->dev_major, args[1]->dev_minor,
323	    self->rw] = sum(this->value) : 1;
324	OPT_proj ? @out[this->sproj, this->spid, this->sppid, self->scomm,
325	    args[1]->dev_statname, args[1]->dev_major, args[1]->dev_minor,
326	    self->rw] = sum(this->value) : 1;
327	OPT_zone ? @out[this->szone, this->spid, this->sppid, self->scomm,
328	    args[1]->dev_statname, args[1]->dev_major, args[1]->dev_minor,
329	    self->rw] = sum(this->value) : 1;
330
331	/* save last disk event */
332	last_event[args[1]->dev_statname] = timestamp;
333
334	self->scomm = 0;
335	self->rw = 0;
336 }
337
338 /*
339  * Prevent pending from underflowing
340  * this can happen if this program is started during disk events.
341  */
342 io:genunix::done
343 /pending[args[1]->dev_statname] < 0/
344 {
345	pending[args[1]->dev_statname] = 0;
346 }
347
348 /*
349  * Timer
350  */
351 profile:::tick-1sec
352 {
353	secs--;
354 }
355
356 /*
357  * Print Report
358  */
359 profile:::tick-1sec
360 /secs == 0/
361 {
362	/* fetch 1 min load average */
363	this->load1a  = `hp_avenrun[0] / 65536;
364	this->load1b  = ((`hp_avenrun[0] % 65536) * 100) / 65536;
365
366	/* convert counters to Kbytes */
367	disk_r /= 1024;
368	disk_w /= 1024;
369
370	/* print status */
371	OPT_clear ? printf("%s", CLEAR) : 1;
372	printf("%Y,  load: %d.%02d,  disk_r: %6d KB,  disk_w: %6d KB\n\n",
373	    walltimestamp, this->load1a, this->load1b, disk_r, disk_w);
374
375	/* print headers */
376	OPT_def  ? printf("  UID ") : 1;
377	OPT_proj ? printf(" PROJ ") : 1;
378	OPT_zone ? printf(" ZONE ") : 1;
379	printf("%6s %6s %-16s %-7s %3s %3s %1s",
380	    "PID", "PPID", "CMD", "DEVICE", "MAJ", "MIN", "D");
381	OPT_bytes   ? printf(" %16s\n", "BYTES") : 1;
382	OPT_elapsed ? printf(" %16s\n", "ELAPSED") : 1;
383	OPT_dtime && ! OPT_percent  ? printf(" %16s\n", "DISKTIME") : 1;
384	OPT_dtime && OPT_percent    ? printf(" %6s\n", "%I/O") : 1;
385
386	/* truncate to top lines if needed */
387	OPT_top ? trunc(@out, TOP) : 1;
388
389	/* normalise to percentage if needed */
390	OPT_percent ? normalize(@out, INTERVAL * 10000) : 1;
391
392	/* print data */
393	! OPT_percent ? 
394	    printa("%5d %6d %6d %-16s %-7s %3d %3d %1s %16@d\n", @out) :
395	    printa("%5d %6d %6d %-16s %-7s %3d %3d %1s %6@d\n", @out);
396	printf("\n");
397
398	/* clear data */
399	trunc(@out);
400	disk_r = 0;
401	disk_w = 0;
402	secs = INTERVAL;
403	counts--;
404 }
405
406 /*
407  * End of program
408  */
409 profile:::tick-1sec
410 /counts == 0/
411 {
412	exit(0);
413 }
414
415 /*
416  * Cleanup for Ctrl-C
417  */
418 dtrace:::END
419 {
420	trunc(@out);
421 }
422'
423