1/*-
2 * Copyright (c) 2002 Marcel Moolenaar
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#include <sys/param.h>
28#include <sys/systm.h>
29#include <sys/conf.h>
30#include <sys/cons.h>
31#include <sys/kdb.h>
32#include <sys/kernel.h>
33#include <sys/kerneldump.h>
34#include <sys/malloc.h>
35#include <sys/msgbuf.h>
36#include <sys/proc.h>
37#include <sys/watchdog.h>
38
39#include <vm/vm.h>
40#include <vm/vm_param.h>
41#include <vm/vm_page.h>
42#include <vm/vm_phys.h>
43#include <vm/vm_dumpset.h>
44#include <vm/pmap.h>
45
46#include <machine/dump.h>
47#include <machine/elf.h>
48#include <machine/md_var.h>
49#include <machine/pcb.h>
50
/* Compile-time check that struct kerneldumpheader is exactly 512 bytes. */
CTASSERT(sizeof(struct kerneldumpheader) == 512);

/* Round x, converted to off_t, up to a multiple of PAGE_SIZE. */
#define	MD_ALIGN(x)	roundup2((off_t)(x), PAGE_SIZE)

/*
 * Handle buffered writes: the number of bytes that dumpsys_buf_write() has
 * staged in the dumper's block buffer and not yet passed to dump_append().
 */
static size_t fragsz;

/*
 * Table of physical address ranges (start, size) to be dumped.  It is filled
 * in by dumpsys_gen_pa_init() and walked by dumpsys_gen_pa_next(); an entry
 * with a pa_size of zero marks the end of the valid entries.
 */
struct dump_pa dump_map[DUMPSYS_MD_PA_NPAIRS];
59
60#if !defined(__powerpc__)
61void
62dumpsys_gen_pa_init(void)
63{
64	int n, idx;
65
66	bzero(dump_map, sizeof(dump_map));
67	for (n = 0; n < nitems(dump_map); n++) {
68		idx = n * 2;
69		if (dump_avail[idx] == 0 && dump_avail[idx + 1] == 0)
70			break;
71		dump_map[n].pa_start = dump_avail[idx];
72		dump_map[n].pa_size = dump_avail[idx + 1] - dump_avail[idx];
73	}
74}
75#endif
76
77struct dump_pa *
78dumpsys_gen_pa_next(struct dump_pa *mdp)
79{
80
81	if (mdp == NULL)
82		return (&dump_map[0]);
83
84	mdp++;
85	if (mdp->pa_size == 0)
86		mdp = NULL;
87	return (mdp);
88}
89
/*
 * Generic version of the wbinv_all hook.  It takes no arguments and
 * performs no work.
 */
void
dumpsys_gen_wbinv_all(void)
{
	/* Intentionally empty. */
}
95
96void
97dumpsys_gen_unmap_chunk(vm_paddr_t pa __unused, size_t chunk __unused,
98    void *va __unused)
99{
100
101}
102
/*
 * Generic version of the auxiliary-header hook.  It writes nothing through
 * di and always reports success by returning 0.
 */
int
dumpsys_gen_write_aux_headers(struct dumperinfo *di)
{
	/* No auxiliary headers to emit. */
	return (0);
}
109
110int
111dumpsys_buf_seek(struct dumperinfo *di, size_t sz)
112{
113	static uint8_t buf[DEV_BSIZE];
114	size_t nbytes;
115	int error;
116
117	bzero(buf, sizeof(buf));
118
119	while (sz > 0) {
120		nbytes = MIN(sz, sizeof(buf));
121
122		error = dump_append(di, buf, nbytes);
123		if (error)
124			return (error);
125		sz -= nbytes;
126	}
127
128	return (0);
129}
130
131int
132dumpsys_buf_write(struct dumperinfo *di, char *ptr, size_t sz)
133{
134	size_t len;
135	int error;
136
137	while (sz) {
138		len = di->blocksize - fragsz;
139		if (len > sz)
140			len = sz;
141		memcpy((char *)di->blockbuf + fragsz, ptr, len);
142		fragsz += len;
143		ptr += len;
144		sz -= len;
145		if (fragsz == di->blocksize) {
146			error = dump_append(di, di->blockbuf, di->blocksize);
147			if (error)
148				return (error);
149			fragsz = 0;
150		}
151	}
152	return (0);
153}
154
155int
156dumpsys_buf_flush(struct dumperinfo *di)
157{
158	int error;
159
160	if (fragsz == 0)
161		return (0);
162
163	error = dump_append(di, di->blockbuf, di->blocksize);
164	fragsz = 0;
165	return (error);
166}
167
168CTASSERT(PAGE_SHIFT < 20);
169#define PG2MB(pgs) ((pgs + (1 << (20 - PAGE_SHIFT)) - 1) >> (20 - PAGE_SHIFT))
170
171int
172dumpsys_cb_dumpdata(struct dump_pa *mdp, int seqnr, void *arg)
173{
174	struct dumperinfo *di = (struct dumperinfo*)arg;
175	vm_paddr_t pa;
176	void *va;
177	uint64_t pgs;
178	size_t counter, sz, chunk;
179	int c, error;
180	u_int maxdumppgs;
181
182	error = 0;	/* catch case in which chunk size is 0 */
183	counter = 0;	/* Update twiddle every 16MB */
184	va = NULL;
185	pgs = mdp->pa_size / PAGE_SIZE;
186	pa = mdp->pa_start;
187	maxdumppgs = min(di->maxiosize / PAGE_SIZE, MAXDUMPPGS);
188	if (maxdumppgs == 0)	/* seatbelt */
189		maxdumppgs = 1;
190
191	printf("  chunk %d: %juMB (%ju pages)", seqnr, (uintmax_t)PG2MB(pgs),
192	    (uintmax_t)pgs);
193
194	dumpsys_wbinv_all();
195	while (pgs) {
196		chunk = pgs;
197		if (chunk > maxdumppgs)
198			chunk = maxdumppgs;
199		sz = chunk << PAGE_SHIFT;
200		counter += sz;
201		if (counter >> 24) {
202			printf(" %ju", (uintmax_t)PG2MB(pgs));
203			counter &= (1 << 24) - 1;
204		}
205
206		dumpsys_map_chunk(pa, chunk, &va);
207		wdog_kern_pat(WD_LASTVAL);
208
209		error = dump_append(di, va, sz);
210		dumpsys_unmap_chunk(pa, chunk, va);
211		if (error)
212			break;
213		pgs -= chunk;
214		pa += sz;
215
216		/* Check for user abort. */
217		c = cncheckc();
218		if (c == 0x03)
219			return (ECANCELED);
220		if (c != -1)
221			printf(" (CTRL-C to abort) ");
222	}
223	printf(" ... %s\n", (error) ? "fail" : "ok");
224	return (error);
225}
226
227int
228dumpsys_foreach_chunk(dumpsys_callback_t cb, void *arg)
229{
230	struct dump_pa *mdp;
231	int error, seqnr;
232
233	seqnr = 0;
234	mdp = dumpsys_pa_next(NULL);
235	while (mdp != NULL) {
236		error = (*cb)(mdp, seqnr++, arg);
237		if (error)
238			return (-error);
239		mdp = dumpsys_pa_next(mdp);
240	}
241	return (seqnr);
242}
243
244static off_t fileofs;
245
246static int
247cb_dumphdr(struct dump_pa *mdp, int seqnr, void *arg)
248{
249	struct dumperinfo *di = (struct dumperinfo*)arg;
250	Elf_Phdr phdr;
251	uint64_t size;
252	int error;
253
254	size = mdp->pa_size;
255	bzero(&phdr, sizeof(phdr));
256	phdr.p_type = PT_LOAD;
257	phdr.p_flags = PF_R;			/* XXX */
258	phdr.p_offset = fileofs;
259#ifdef __powerpc__
260	phdr.p_vaddr = (do_minidump? mdp->pa_start : ~0L);
261	phdr.p_paddr = (do_minidump? ~0L : mdp->pa_start);
262#else
263	phdr.p_vaddr = mdp->pa_start;
264	phdr.p_paddr = mdp->pa_start;
265#endif
266	phdr.p_filesz = size;
267	phdr.p_memsz = size;
268	phdr.p_align = PAGE_SIZE;
269
270	error = dumpsys_buf_write(di, (char*)&phdr, sizeof(phdr));
271	fileofs += phdr.p_filesz;
272	return (error);
273}
274
275static int
276cb_size(struct dump_pa *mdp, int seqnr, void *arg)
277{
278	uint64_t *sz;
279
280	sz = (uint64_t *)arg;
281	*sz += (uint64_t)mdp->pa_size;
282	return (0);
283}
284
285int
286dumpsys_generic(struct dumperinfo *di)
287{
288	static struct kerneldumpheader kdh;
289	Elf_Ehdr ehdr;
290	uint64_t dumpsize;
291	off_t hdrgap;
292	size_t hdrsz;
293	int error;
294
295#if MINIDUMP_PAGE_TRACKING == 1
296	if (do_minidump)
297		return (minidumpsys(di, false));
298#endif
299
300	bzero(&ehdr, sizeof(ehdr));
301	ehdr.e_ident[EI_MAG0] = ELFMAG0;
302	ehdr.e_ident[EI_MAG1] = ELFMAG1;
303	ehdr.e_ident[EI_MAG2] = ELFMAG2;
304	ehdr.e_ident[EI_MAG3] = ELFMAG3;
305	ehdr.e_ident[EI_CLASS] = ELF_CLASS;
306#if BYTE_ORDER == LITTLE_ENDIAN
307	ehdr.e_ident[EI_DATA] = ELFDATA2LSB;
308#else
309	ehdr.e_ident[EI_DATA] = ELFDATA2MSB;
310#endif
311	ehdr.e_ident[EI_VERSION] = EV_CURRENT;
312	ehdr.e_ident[EI_OSABI] = ELFOSABI_STANDALONE;	/* XXX big picture? */
313	ehdr.e_type = ET_CORE;
314	ehdr.e_machine = EM_VALUE;
315	ehdr.e_phoff = sizeof(ehdr);
316	ehdr.e_flags = 0;
317	ehdr.e_ehsize = sizeof(ehdr);
318	ehdr.e_phentsize = sizeof(Elf_Phdr);
319	ehdr.e_shentsize = sizeof(Elf_Shdr);
320
321	dumpsys_pa_init();
322
323	/* Calculate dump size. */
324	dumpsize = 0L;
325	ehdr.e_phnum = dumpsys_foreach_chunk(cb_size, &dumpsize) +
326	    DUMPSYS_NUM_AUX_HDRS;
327	hdrsz = ehdr.e_phoff + ehdr.e_phnum * ehdr.e_phentsize;
328	fileofs = MD_ALIGN(hdrsz);
329	dumpsize += fileofs;
330	hdrgap = fileofs - roundup2((off_t)hdrsz, di->blocksize);
331
332	dump_init_header(di, &kdh, KERNELDUMPMAGIC, KERNELDUMP_ARCH_VERSION,
333	    dumpsize);
334
335	error = dump_start(di, &kdh);
336	if (error != 0)
337		goto fail;
338
339	printf("Dumping %ju MB (%d chunks)\n", (uintmax_t)dumpsize >> 20,
340	    ehdr.e_phnum - DUMPSYS_NUM_AUX_HDRS);
341
342	/* Dump ELF header */
343	error = dumpsys_buf_write(di, (char*)&ehdr, sizeof(ehdr));
344	if (error)
345		goto fail;
346
347	/* Dump program headers */
348	error = dumpsys_foreach_chunk(cb_dumphdr, di);
349	if (error < 0)
350		goto fail;
351	error = dumpsys_write_aux_headers(di);
352	if (error < 0)
353		goto fail;
354	dumpsys_buf_flush(di);
355
356	/*
357	 * All headers are written using blocked I/O, so we know the
358	 * current offset is (still) block aligned. Skip the alignement
359	 * in the file to have the segment contents aligned at page
360	 * boundary.
361	 */
362	error = dumpsys_buf_seek(di, (size_t)hdrgap);
363	if (error)
364		goto fail;
365
366	/* Dump memory chunks. */
367	error = dumpsys_foreach_chunk(dumpsys_cb_dumpdata, di);
368	if (error < 0)
369		goto fail;
370
371	error = dump_finish(di, &kdh);
372	if (error != 0)
373		goto fail;
374
375	printf("\nDump complete\n");
376	return (0);
377
378 fail:
379	if (error < 0)
380		error = -error;
381
382	if (error == ECANCELED)
383		printf("\nDump aborted\n");
384	else if (error == E2BIG || error == ENOSPC)
385		printf("\nDump failed. Partition too small.\n");
386	else
387		printf("\n** DUMP FAILED (ERROR %d) **\n", error);
388	return (error);
389}
390
391#if MINIDUMP_PAGE_TRACKING == 1
392
/*
 * Minidump progress bar.
 *
 * The 0-100% range is split into ten buckets of ten percentage points
 * each.  Both bounds of a bucket are inclusive, and "visited" records
 * whether a progress message has already been printed for that bucket.
 */
static struct {
	const int min_per;
	const int max_per;
	bool visited;
} progress_track[10] = {
	[0] = { .min_per =  0, .max_per =  10, .visited = false },
	[1] = { .min_per = 10, .max_per =  20, .visited = false },
	[2] = { .min_per = 20, .max_per =  30, .visited = false },
	[3] = { .min_per = 30, .max_per =  40, .visited = false },
	[4] = { .min_per = 40, .max_per =  50, .visited = false },
	[5] = { .min_per = 50, .max_per =  60, .visited = false },
	[6] = { .min_per = 60, .max_per =  70, .visited = false },
	[7] = { .min_per = 70, .max_per =  80, .visited = false },
	[8] = { .min_per = 80, .max_per =  90, .visited = false },
	[9] = { .min_per = 90, .max_per = 100, .visited = false },
};

/* Total number of bytes in the dump being tracked. */
static uint64_t dumpsys_pb_size;
/* Number of bytes still to be written. */
static uint64_t dumpsys_pb_remaining;
/* Bytes written since the progress table was last consulted. */
static uint64_t dumpsys_pb_check;
414
415/* Reset the progress bar for a dump of dumpsize. */
416void
417dumpsys_pb_init(uint64_t dumpsize)
418{
419	int i;
420
421	dumpsys_pb_size = dumpsys_pb_remaining = dumpsize;
422	dumpsys_pb_check = 0;
423
424	for (i = 0; i < nitems(progress_track); i++)
425		progress_track[i].visited = false;
426}
427
428/*
429 * Update the progress according to the delta bytes that were written out.
430 * Check and print the progress percentage.
431 */
432void
433dumpsys_pb_progress(size_t delta)
434{
435	int sofar, i;
436
437	dumpsys_pb_remaining -= delta;
438	dumpsys_pb_check += delta;
439
440	/*
441	 * To save time while dumping, only loop through progress_track
442	 * occasionally.
443	 */
444	if ((dumpsys_pb_check >> DUMPSYS_PB_CHECK_BITS) == 0)
445		return;
446	else
447		dumpsys_pb_check &= (1 << DUMPSYS_PB_CHECK_BITS) - 1;
448
449	sofar = 100 - ((dumpsys_pb_remaining * 100) / dumpsys_pb_size);
450	for (i = 0; i < nitems(progress_track); i++) {
451		if (sofar < progress_track[i].min_per ||
452		    sofar > progress_track[i].max_per)
453			continue;
454		if (!progress_track[i].visited) {
455			progress_track[i].visited = true;
456			printf("..%d%%", sofar);
457		}
458		break;
459	}
460}
461
462int
463minidumpsys(struct dumperinfo *di, bool livedump)
464{
465	struct minidumpstate state;
466	struct msgbuf mb_copy;
467	char *msg_ptr;
468	size_t sz;
469	int error;
470
471	if (livedump) {
472		KASSERT(!dumping, ("live dump invoked from incorrect context"));
473
474		/*
475		 * Before invoking cpu_minidumpsys() on the live system, we
476		 * must snapshot some required global state: the message
477		 * buffer, and the page dump bitset. They may be modified at
478		 * any moment, so for the sake of the live dump it is best to
479		 * have an unchanging snapshot to work with. Both are included
480		 * as part of the dump and consumed by userspace tools.
481		 *
482		 * Other global state important to the minidump code is the
483		 * dump_avail array and the kernel's page tables, but snapshots
484		 * are not taken of these. For one, dump_avail[] is expected
485		 * not to change after boot. Snapshotting the kernel page
486		 * tables would involve an additional walk, so this is avoided
487		 * too.
488		 *
489		 * This means live dumps are best effort, and the result may or
490		 * may not be usable; there are no guarantees about the
491		 * consistency of the dump's contents. Any of the following
492		 * (and likely more) may affect the live dump:
493		 *
494		 *  - Data may be modified, freed, or remapped during the
495		 *    course of the dump, such that the contents written out
496		 *    are partially or entirely unrecognizable. This means
497		 *    valid references may point to destroyed/mangled objects,
498		 *    and vice versa.
499		 *
500		 *  - The dumped context of any threads that ran during the
501		 *    dump process may be unreliable.
502		 *
503		 *  - The set of kernel page tables included in the dump likely
504		 *    won't correspond exactly to the copy of the dump bitset.
505		 *    This means some pages will be dumped without any way to
506		 *    locate them, and some pages may not have been dumped
507		 *    despite appearing as if they should.
508		 */
509		msg_ptr = malloc(msgbufsize, M_TEMP, M_WAITOK);
510		msgbuf_duplicate(msgbufp, &mb_copy, msg_ptr);
511		state.msgbufp = &mb_copy;
512
513		sz = BITSET_SIZE(vm_page_dump_pages);
514		state.dump_bitset = malloc(sz, M_TEMP, M_WAITOK);
515		BIT_COPY_STORE_REL(sz, vm_page_dump, state.dump_bitset);
516	} else {
517		KASSERT(dumping, ("minidump invoked outside of doadump()"));
518
519		/* Use the globals. */
520		state.msgbufp = msgbufp;
521		state.dump_bitset = vm_page_dump;
522	}
523
524	error = cpu_minidumpsys(di, &state);
525	if (livedump) {
526		free(msg_ptr, M_TEMP);
527		free(state.dump_bitset, M_TEMP);
528	}
529
530	return (error);
531}
532#endif /* MINIDUMP_PAGE_TRACKING == 1 */
533