zfsboot.c revision 329175
1/*-
2 * Copyright (c) 1998 Robert Nordier
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms are freely
6 * permitted provided that the above copyright notice and this
7 * paragraph and the following disclaimer are duplicated in all
8 * such forms.
9 *
10 * This software is provided "AS IS" and without any express or
11 * implied warranties, including, without limitation, the implied
12 * warranties of merchantability and fitness for a particular
13 * purpose.
14 */
15
16#include <sys/cdefs.h>
17__FBSDID("$FreeBSD: stable/11/stand/i386/zfsboot/zfsboot.c 329175 2018-02-12 17:44:35Z kevans $");
18
19#include "stand.h"
20
21#include <sys/param.h>
22#include <sys/errno.h>
23#include <sys/diskmbr.h>
24#ifdef GPT
25#include <sys/gpt.h>
26#endif
27#include <sys/reboot.h>
28#include <sys/queue.h>
29
30#include <machine/bootinfo.h>
31#include <machine/elf.h>
32#include <machine/pc/bios.h>
33
34#include <stdarg.h>
35#include <stddef.h>
36
37#include <a.out.h>
38
39#include <btxv86.h>
40
41#include "lib.h"
42#include "rbx.h"
43#include "drv.h"
44#include "edd.h"
45#include "cons.h"
46#include "bootargs.h"
47#include "paths.h"
48
49#include "libzfs.h"
50
/*
 * Address of the boot arguments: main() reads the BIOS drive number from
 * the byte at ARGS and derives the slice from the byte at ARGS + 1.
 */
#define ARGS			0x900
/* Number of entries in optstr[] and flags[]. */
#define NOPT			14
/* Number of entries in dev_maj[]. */
#define NDEV			3

/* Address of the byte main() reads as the number of drives to probe. */
#define BIOS_NUMDRIVES		0x475
/* Bit main() ORs into a unit number to form a hard disk drive number. */
#define DRV_HARD		0x80
/* Mask main() applies to a drive number to obtain the unit. */
#define DRV_MASK		0x7f

/* Values stored in dsk->type; they also index dev_maj[]. */
#define TYPE_AD			0
#define TYPE_DA			1
#define TYPE_MAXHARD		TYPE_DA
#define TYPE_FD			2

/* Alignment, in bytes, that vdev_read() applies to every disk read. */
#define DEV_GELIBOOT_BSIZE	4096
65
/*
 * main() places dmadat at the first 64k boundary at or above the address
 * of this symbol.
 * NOTE(review): presumably provided by the linker to mark the end of the
 * image -- confirm against the link script.
 */
extern uint32_t _end;

#ifdef GPT
/* Partition type probe_drive() looks for in the GPT entry table. */
static const uuid_t freebsd_zfs_uuid = GPT_ENT_TYPE_FREEBSD_ZFS;
#endif
/*
 * Option letters accepted by parse_cmd(); the letter at optstr[i] toggles
 * the option flags[i] in 'opts'.
 */
static const char optstr[NOPT] = "DhaCcdgmnpqrsv"; /* Also 'P', 'S' */
static const unsigned char flags[NOPT] = {
    RBX_DUAL,
    RBX_SERIAL,
    RBX_ASKNAME,
    RBX_CDROM,
    RBX_CONFIG,
    RBX_KDB,
    RBX_GDB,
    RBX_MUTE,
    RBX_NOINTR,
    RBX_PAUSE,
    RBX_QUIET,
    RBX_DFLTROOT,
    RBX_SINGLE,
    RBX_VERBOSE
};
/* Option bits toggled by parse_cmd() and handed to __exec() by load(). */
uint32_t opts;

/* Major numbers, indexed by dsk->type, used by main() to build bootdev. */
static const unsigned char dev_maj[NDEV] = {30, 4, 2};

/* Command line being parsed; parse_cmd() modifies it in place. */
static char cmd[512];
/* Copy of cmd[] taken before parsing so that it can still be printed. */
static char cmddup[512];
/* Path of the file load() will load. */
static char kname[1024];
/* Filled in by zfs_rlookup() for the "Default:" prompt line. */
static char rootname[256];
/* Serial console speed; changed by the -S option in parse_cmd(). */
static int comspeed = SIOSPD;
/* Structures and values passed to the loaded image by load(). */
static struct bootinfo bootinfo;
static uint32_t bootdev;
static struct zfs_boot_args zfsargs;

/* Memory layout discovered by bios_getmem(). */
vm_offset_t	high_heap_base;
uint32_t	bios_basemem, bios_extmem, high_heap_size;

/* Buffer the int 0x15/0xe820 call in bios_getmem() fills in. */
static struct bios_smap smap;

/*
 * The minimum amount of memory to reserve in bios_extmem for the heap.
 */
#define	HEAP_MIN		(64 * 1024 * 1024)

/* Heap bounds computed in main() and passed to setheap(). */
static char *heap_next;
static char *heap_end;

/* Buffers that must not span a 64k boundary. */
#define READ_BUF_SIZE		8192
struct dmadat {
	char rdbuf[READ_BUF_SIZE];	/* for reading large things */
	char secbuf[READ_BUF_SIZE];	/* for MBR/disklabel */
};
/* Points at the bounce buffers; set up at the start of main(). */
static struct dmadat *dmadat;
121
/* Forward declarations of functions defined later in this file. */
void exit(int);
void reboot(void);
static void load(void);
static int parse_cmd(void);
static void bios_getmem(void);
int main(void);

#ifdef LOADER_GELI_SUPPORT
#include "geliboot.c"
/* Passphrase buffer handed to geli_passphrase(); zeroed by load(). */
static char gelipw[GELI_PW_MAXLEN];
/* Key buffer allocated and filled by load() and passed on in zfsargs. */
static struct keybuf *gelibuf;
#endif
134
135#include "zfsimpl.c"
136
137/*
138 * Read from a dnode (which must be from a ZPL filesystem).
139 */
140static int
141zfs_read(spa_t *spa, const dnode_phys_t *dnode, off_t *offp, void *start, size_t size)
142{
143	const znode_phys_t *zp = (const znode_phys_t *) dnode->dn_bonus;
144	size_t n;
145	int rc;
146
147	n = size;
148	if (*offp + n > zp->zp_size)
149		n = zp->zp_size - *offp;
150
151	rc = dnode_read(spa, dnode, *offp, start, n);
152	if (rc)
153		return (-1);
154	*offp += n;
155
156	return (n);
157}
158
/*
 * Current ZFS pool
 *
 * 'spa' is the pool used for lookups and loads; main() sets it to the
 * pool returned by spa_get_primary() and zfs_mount_ds() switches it when a
 * command names another pool.  'primary_spa' and 'primary_vdev' keep the
 * pool and vdev found by main(); load() reports their GUIDs in zfsargs.
 */
static spa_t *spa;
static spa_t *primary_spa;
static vdev_t *primary_vdev;
165
166/*
167 * A wrapper for dskread that doesn't have to worry about whether the
168 * buffer pointer crosses a 64k boundary.
169 */
170static int
171vdev_read(void *xvdev, void *priv, off_t off, void *buf, size_t bytes)
172{
173	char *p;
174	daddr_t lba, alignlba;
175	off_t diff;
176	unsigned int nb, alignnb;
177	struct dsk *dsk = (struct dsk *) priv;
178
179	if ((off & (DEV_BSIZE - 1)) || (bytes & (DEV_BSIZE - 1)))
180		return -1;
181
182	p = buf;
183	lba = off / DEV_BSIZE;
184	lba += dsk->start;
185	/*
186	 * Align reads to 4k else 4k sector GELIs will not decrypt.
187	 * Round LBA down to nearest multiple of DEV_GELIBOOT_BSIZE bytes.
188	 */
189	alignlba = rounddown2(off, DEV_GELIBOOT_BSIZE) / DEV_BSIZE;
190	/*
191	 * The read must be aligned to DEV_GELIBOOT_BSIZE bytes relative to the
192	 * start of the GELI partition, not the start of the actual disk.
193	 */
194	alignlba += dsk->start;
195	diff = (lba - alignlba) * DEV_BSIZE;
196
197	while (bytes > 0) {
198		nb = bytes / DEV_BSIZE;
199		/*
200		 * Ensure that the read size plus the leading offset does not
201		 * exceed the size of the read buffer.
202		 */
203		if (nb > (READ_BUF_SIZE - diff) / DEV_BSIZE)
204			nb = (READ_BUF_SIZE - diff) / DEV_BSIZE;
205		/*
206		 * Round the number of blocks to read up to the nearest multiple
207		 * of DEV_GELIBOOT_BSIZE.
208		 */
209		alignnb = roundup2(nb * DEV_BSIZE + diff, DEV_GELIBOOT_BSIZE)
210		    / DEV_BSIZE;
211
212		if (drvread(dsk, dmadat->rdbuf, alignlba, alignnb))
213			return -1;
214#ifdef LOADER_GELI_SUPPORT
215		/* decrypt */
216		if (is_geli(dsk) == 0) {
217			if (geli_read(dsk, ((alignlba - dsk->start) *
218			    DEV_BSIZE), dmadat->rdbuf, alignnb * DEV_BSIZE))
219				return (-1);
220		}
221#endif
222		memcpy(p, dmadat->rdbuf + diff, nb * DEV_BSIZE);
223		p += nb * DEV_BSIZE;
224		lba += nb;
225		alignlba += alignnb;
226		bytes -= nb * DEV_BSIZE;
227		/* Don't need the leading offset after the first block. */
228		diff = 0;
229	}
230
231	return 0;
232}
233/* Match the signature exactly due to signature madness */
234static int
235vdev_read2(vdev_t *vdev, void *priv, off_t off, void *buf, size_t bytes)
236{
237	return vdev_read(vdev, priv, off, buf, bytes);
238}
239
240
241static int
242vdev_write(vdev_t *vdev, void *priv, off_t off, void *buf, size_t bytes)
243{
244	char *p;
245	daddr_t lba;
246	unsigned int nb;
247	struct dsk *dsk = (struct dsk *) priv;
248
249	if ((off & (DEV_BSIZE - 1)) || (bytes & (DEV_BSIZE - 1)))
250		return -1;
251
252	p = buf;
253	lba = off / DEV_BSIZE;
254	lba += dsk->start;
255	while (bytes > 0) {
256		nb = bytes / DEV_BSIZE;
257		if (nb > READ_BUF_SIZE / DEV_BSIZE)
258			nb = READ_BUF_SIZE / DEV_BSIZE;
259		memcpy(dmadat->rdbuf, p, nb * DEV_BSIZE);
260		if (drvwrite(dsk, dmadat->rdbuf, lba, nb))
261			return -1;
262		p += nb * DEV_BSIZE;
263		lba += nb;
264		bytes -= nb * DEV_BSIZE;
265	}
266
267	return 0;
268}
269
270static int
271xfsread(const dnode_phys_t *dnode, off_t *offp, void *buf, size_t nbyte)
272{
273    if ((size_t)zfs_read(spa, dnode, offp, buf, nbyte) != nbyte) {
274	printf("Invalid format\n");
275	return -1;
276    }
277    return 0;
278}
279
280/*
281 * Read Pad2 (formerly "Boot Block Header") area of the first
282 * vdev label of the given vdev.
283 */
284static int
285vdev_read_pad2(vdev_t *vdev, char *buf, size_t size)
286{
287	blkptr_t bp;
288	char *tmp = zap_scratch;
289	off_t off = offsetof(vdev_label_t, vl_pad2);
290
291	if (size > VDEV_PAD_SIZE)
292		size = VDEV_PAD_SIZE;
293
294	BP_ZERO(&bp);
295	BP_SET_LSIZE(&bp, VDEV_PAD_SIZE);
296	BP_SET_PSIZE(&bp, VDEV_PAD_SIZE);
297	BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
298	BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
299	DVA_SET_OFFSET(BP_IDENTITY(&bp), off);
300	if (vdev_read_phys(vdev, &bp, tmp, off, 0))
301		return (EIO);
302	memcpy(buf, tmp, size);
303	return (0);
304}
305
306static int
307vdev_clear_pad2(vdev_t *vdev)
308{
309	char *zeroes = zap_scratch;
310	uint64_t *end;
311	off_t off = offsetof(vdev_label_t, vl_pad2);
312
313	memset(zeroes, 0, VDEV_PAD_SIZE);
314	end = (uint64_t *)(zeroes + VDEV_PAD_SIZE);
315	/* ZIO_CHECKSUM_LABEL magic and pre-calcualted checksum for all zeros */
316	end[-5] = 0x0210da7ab10c7a11;
317	end[-4] = 0x97f48f807f6e2a3f;
318	end[-3] = 0xaf909f1658aacefc;
319	end[-2] = 0xcbd1ea57ff6db48b;
320	end[-1] = 0x6ec692db0d465fab;
321	if (vdev_write(vdev, vdev->v_read_priv, off, zeroes, VDEV_PAD_SIZE))
322		return (EIO);
323	return (0);
324}
325
326static void
327bios_getmem(void)
328{
329    uint64_t size;
330
331    /* Parse system memory map */
332    v86.ebx = 0;
333    do {
334	v86.ctl = V86_FLAGS;
335	v86.addr = 0x15;		/* int 0x15 function 0xe820*/
336	v86.eax = 0xe820;
337	v86.ecx = sizeof(struct bios_smap);
338	v86.edx = SMAP_SIG;
339	v86.es = VTOPSEG(&smap);
340	v86.edi = VTOPOFF(&smap);
341	v86int();
342	if (V86_CY(v86.efl) || (v86.eax != SMAP_SIG))
343	    break;
344	/* look for a low-memory segment that's large enough */
345	if ((smap.type == SMAP_TYPE_MEMORY) && (smap.base == 0) &&
346	    (smap.length >= (512 * 1024)))
347	    bios_basemem = smap.length;
348	/* look for the first segment in 'extended' memory */
349	if ((smap.type == SMAP_TYPE_MEMORY) && (smap.base == 0x100000)) {
350	    bios_extmem = smap.length;
351	}
352
353	/*
354	 * Look for the largest segment in 'extended' memory beyond
355	 * 1MB but below 4GB.
356	 */
357	if ((smap.type == SMAP_TYPE_MEMORY) && (smap.base > 0x100000) &&
358	    (smap.base < 0x100000000ull)) {
359	    size = smap.length;
360
361	    /*
362	     * If this segment crosses the 4GB boundary, truncate it.
363	     */
364	    if (smap.base + size > 0x100000000ull)
365		size = 0x100000000ull - smap.base;
366
367	    if (size > high_heap_size) {
368		high_heap_size = size;
369		high_heap_base = smap.base;
370	    }
371	}
372    } while (v86.ebx != 0);
373
374    /* Fall back to the old compatibility function for base memory */
375    if (bios_basemem == 0) {
376	v86.ctl = 0;
377	v86.addr = 0x12;		/* int 0x12 */
378	v86int();
379
380	bios_basemem = (v86.eax & 0xffff) * 1024;
381    }
382
383    /* Fall back through several compatibility functions for extended memory */
384    if (bios_extmem == 0) {
385	v86.ctl = V86_FLAGS;
386	v86.addr = 0x15;		/* int 0x15 function 0xe801*/
387	v86.eax = 0xe801;
388	v86int();
389	if (!V86_CY(v86.efl)) {
390	    bios_extmem = ((v86.ecx & 0xffff) + ((v86.edx & 0xffff) * 64)) * 1024;
391	}
392    }
393    if (bios_extmem == 0) {
394	v86.ctl = 0;
395	v86.addr = 0x15;		/* int 0x15 function 0x88*/
396	v86.eax = 0x8800;
397	v86int();
398	bios_extmem = (v86.eax & 0xffff) * 1024;
399    }
400
401    /*
402     * If we have extended memory and did not find a suitable heap
403     * region in the SMAP, use the last 3MB of 'extended' memory as a
404     * high heap candidate.
405     */
406    if (bios_extmem >= HEAP_MIN && high_heap_size < HEAP_MIN) {
407	high_heap_size = HEAP_MIN;
408	high_heap_base = bios_extmem + 0x100000 - HEAP_MIN;
409    }
410}
411
412/*
413 * Try to detect a device supported by the legacy int13 BIOS
414 */
415static int
416int13probe(int drive)
417{
418    v86.ctl = V86_FLAGS;
419    v86.addr = 0x13;
420    v86.eax = 0x800;
421    v86.edx = drive;
422    v86int();
423
424    if (!V86_CY(v86.efl) &&				/* carry clear */
425	((v86.edx & 0xff) != (drive & DRV_MASK))) {	/* unit # OK */
426	if ((v86.ecx & 0x3f) == 0) {			/* absurd sector size */
427		return(0);				/* skip device */
428	}
429	return (1);
430    }
431    return(0);
432}
433
434/*
435 * We call this when we find a ZFS vdev - ZFS consumes the dsk
436 * structure so we must make a new one.
437 */
438static struct dsk *
439copy_dsk(struct dsk *dsk)
440{
441    struct dsk *newdsk;
442
443    newdsk = malloc(sizeof(struct dsk));
444    *newdsk = *dsk;
445    return (newdsk);
446}
447
448/*
449 * Get disk size from eax=0x800 and 0x4800. We need to probe both
450 * because 0x4800 may not be available and we would like to get more
451 * or less correct disk size - if it is possible at all.
452 * Note we do not really want to touch drv.c because that code is shared
453 * with boot2 and we can not afford to grow that code.
454 */
455static uint64_t
456drvsize_ext(struct dsk *dskp)
457{
458	uint64_t size, tmp;
459	int cyl, hds, sec;
460
461	v86.ctl = V86_FLAGS;
462	v86.addr = 0x13;
463	v86.eax = 0x800;
464	v86.edx = dskp->drive;
465	v86int();
466
467	/* Don't error out if we get bad sector number, try EDD as well */
468	if (V86_CY(v86.efl) ||	/* carry set */
469	    (v86.edx & 0xff) <= (unsigned)(dskp->drive & 0x7f)) /* unit # bad */
470		return (0);
471	cyl = ((v86.ecx & 0xc0) << 2) + ((v86.ecx & 0xff00) >> 8) + 1;
472	/* Convert max head # -> # of heads */
473	hds = ((v86.edx & 0xff00) >> 8) + 1;
474	sec = v86.ecx & 0x3f;
475
476	size = (uint64_t)cyl * hds * sec;
477
478	/* Determine if we can use EDD with this device. */
479	v86.ctl = V86_FLAGS;
480	v86.addr = 0x13;
481	v86.eax = 0x4100;
482	v86.edx = dskp->drive;
483	v86.ebx = 0x55aa;
484	v86int();
485	if (V86_CY(v86.efl) ||  /* carry set */
486	    (v86.ebx & 0xffff) != 0xaa55 || /* signature */
487	    (v86.ecx & EDD_INTERFACE_FIXED_DISK) == 0)
488		return (size);
489
490	tmp = drvsize(dskp);
491	if (tmp > size)
492		size = tmp;
493
494	return (size);
495}
496
497/*
498 * The "layered" ioctl to read disk/partition size. Unfortunately
499 * the zfsboot case is hardest, because we do not have full software
500 * stack available, so we need to do some manual work here.
501 */
502uint64_t
503ldi_get_size(void *priv)
504{
505	struct dsk *dskp = priv;
506	uint64_t size = dskp->size;
507
508	if (dskp->start == 0)
509		size = drvsize_ext(dskp);
510
511	return (size * DEV_BSIZE);
512}
513
514static void
515probe_drive(struct dsk *dsk)
516{
517#ifdef GPT
518    struct gpt_hdr hdr;
519    struct gpt_ent *ent;
520    unsigned part, entries_per_sec;
521    daddr_t slba;
522#endif
523#if defined(GPT) || defined(LOADER_GELI_SUPPORT)
524    daddr_t elba;
525#endif
526
527    struct dos_partition *dp;
528    char *sec;
529    unsigned i;
530
531    /*
532     * If we find a vdev on the whole disk, stop here.
533     */
534    if (vdev_probe(vdev_read2, dsk, NULL) == 0)
535	return;
536
537#ifdef LOADER_GELI_SUPPORT
538    /*
539     * Taste the disk, if it is GELI encrypted, decrypt it and check to see if
540     * it is a usable vdev then. Otherwise dig
541     * out the partition table and probe each slice/partition
542     * in turn for a vdev or GELI encrypted vdev.
543     */
544    elba = drvsize_ext(dsk);
545    if (elba > 0) {
546	elba--;
547    }
548    if (geli_taste(vdev_read, dsk, elba) == 0) {
549	if (geli_havekey(dsk) == 0 || geli_passphrase(gelipw, dsk->unit,
550	  ':', 0, dsk) == 0) {
551	    if (vdev_probe(vdev_read2, dsk, NULL) == 0) {
552		return;
553	    }
554	}
555    }
556#endif /* LOADER_GELI_SUPPORT */
557
558    sec = dmadat->secbuf;
559    dsk->start = 0;
560
561#ifdef GPT
562    /*
563     * First check for GPT.
564     */
565    if (drvread(dsk, sec, 1, 1)) {
566	return;
567    }
568    memcpy(&hdr, sec, sizeof(hdr));
569    if (memcmp(hdr.hdr_sig, GPT_HDR_SIG, sizeof(hdr.hdr_sig)) != 0 ||
570	hdr.hdr_lba_self != 1 || hdr.hdr_revision < 0x00010000 ||
571	hdr.hdr_entsz < sizeof(*ent) || DEV_BSIZE % hdr.hdr_entsz != 0) {
572	goto trymbr;
573    }
574
575    /*
576     * Probe all GPT partitions for the presence of ZFS pools. We
577     * return the spa_t for the first we find (if requested). This
578     * will have the effect of booting from the first pool on the
579     * disk.
580     *
581     * If no vdev is found, GELI decrypting the device and try again
582     */
583    entries_per_sec = DEV_BSIZE / hdr.hdr_entsz;
584    slba = hdr.hdr_lba_table;
585    elba = slba + hdr.hdr_entries / entries_per_sec;
586    while (slba < elba) {
587	dsk->start = 0;
588	if (drvread(dsk, sec, slba, 1))
589	    return;
590	for (part = 0; part < entries_per_sec; part++) {
591	    ent = (struct gpt_ent *)(sec + part * hdr.hdr_entsz);
592	    if (memcmp(&ent->ent_type, &freebsd_zfs_uuid,
593		     sizeof(uuid_t)) == 0) {
594		dsk->start = ent->ent_lba_start;
595		dsk->size = ent->ent_lba_end - ent->ent_lba_start + 1;
596		dsk->slice = part + 1;
597		dsk->part = 255;
598		if (vdev_probe(vdev_read2, dsk, NULL) == 0) {
599		    /*
600		     * This slice had a vdev. We need a new dsk
601		     * structure now since the vdev now owns this one.
602		     */
603		    dsk = copy_dsk(dsk);
604		}
605#ifdef LOADER_GELI_SUPPORT
606		else if (geli_taste(vdev_read, dsk, ent->ent_lba_end -
607			 ent->ent_lba_start) == 0) {
608		    if (geli_havekey(dsk) == 0 || geli_passphrase(gelipw,
609		      dsk->unit, 'p', dsk->slice, dsk) == 0) {
610			/*
611			 * This slice has GELI, check it for ZFS.
612			 */
613			if (vdev_probe(vdev_read2, dsk, NULL) == 0) {
614			    /*
615			     * This slice had a vdev. We need a new dsk
616			     * structure now since the vdev now owns this one.
617			     */
618			    dsk = copy_dsk(dsk);
619			}
620			break;
621		    }
622		}
623#endif /* LOADER_GELI_SUPPORT */
624	    }
625	}
626	slba++;
627    }
628    return;
629trymbr:
630#endif /* GPT */
631
632    if (drvread(dsk, sec, DOSBBSECTOR, 1))
633	return;
634    dp = (void *)(sec + DOSPARTOFF);
635
636    for (i = 0; i < NDOSPART; i++) {
637	if (!dp[i].dp_typ)
638	    continue;
639	dsk->start = dp[i].dp_start;
640	dsk->size = dp[i].dp_size;
641	dsk->slice = i + 1;
642	if (vdev_probe(vdev_read2, dsk, NULL) == 0) {
643	    dsk = copy_dsk(dsk);
644	}
645#ifdef LOADER_GELI_SUPPORT
646	else if (geli_taste(vdev_read, dsk, dp[i].dp_size -
647		 dp[i].dp_start) == 0) {
648	    if (geli_havekey(dsk) == 0 || geli_passphrase(gelipw, dsk->unit,
649	      's', i, dsk) == 0) {
650		/*
651		 * This slice has GELI, check it for ZFS.
652		 */
653		if (vdev_probe(vdev_read2, dsk, NULL) == 0) {
654		    /*
655		     * This slice had a vdev. We need a new dsk
656		     * structure now since the vdev now owns this one.
657		     */
658		    dsk = copy_dsk(dsk);
659		}
660		break;
661	    }
662	}
663#endif /* LOADER_GELI_SUPPORT */
664    }
665}
666
667int
668main(void)
669{
670    dnode_phys_t dn;
671    off_t off;
672    struct dsk *dsk;
673    int autoboot, i;
674    int nextboot;
675    int rc;
676
677    dmadat = (void *)(roundup2(__base + (int32_t)&_end, 0x10000) - __base);
678
679    bios_getmem();
680
681    if (high_heap_size > 0) {
682	heap_end = PTOV(high_heap_base + high_heap_size);
683	heap_next = PTOV(high_heap_base);
684    } else {
685	heap_next = (char *)dmadat + sizeof(*dmadat);
686	heap_end = (char *)PTOV(bios_basemem);
687    }
688    setheap(heap_next, heap_end);
689
690    dsk = malloc(sizeof(struct dsk));
691    dsk->drive = *(uint8_t *)PTOV(ARGS);
692    dsk->type = dsk->drive & DRV_HARD ? TYPE_AD : TYPE_FD;
693    dsk->unit = dsk->drive & DRV_MASK;
694    dsk->slice = *(uint8_t *)PTOV(ARGS + 1) + 1;
695    dsk->part = 0;
696    dsk->start = 0;
697    dsk->size = 0;
698
699    bootinfo.bi_version = BOOTINFO_VERSION;
700    bootinfo.bi_size = sizeof(bootinfo);
701    bootinfo.bi_basemem = bios_basemem / 1024;
702    bootinfo.bi_extmem = bios_extmem / 1024;
703    bootinfo.bi_memsizes_valid++;
704    bootinfo.bi_bios_dev = dsk->drive;
705
706    bootdev = MAKEBOOTDEV(dev_maj[dsk->type],
707			  dsk->slice, dsk->unit, dsk->part);
708
709    /* Process configuration file */
710
711    autoboot = 1;
712
713#ifdef LOADER_GELI_SUPPORT
714    geli_init();
715#endif
716    zfs_init();
717
718    /*
719     * Probe the boot drive first - we will try to boot from whatever
720     * pool we find on that drive.
721     */
722    probe_drive(dsk);
723
724    /*
725     * Probe the rest of the drives that the bios knows about. This
726     * will find any other available pools and it may fill in missing
727     * vdevs for the boot pool.
728     */
729#ifndef VIRTUALBOX
730    for (i = 0; i < *(unsigned char *)PTOV(BIOS_NUMDRIVES); i++)
731#else
732    for (i = 0; i < MAXBDDEV; i++)
733#endif
734    {
735	if ((i | DRV_HARD) == *(uint8_t *)PTOV(ARGS))
736	    continue;
737
738	if (!int13probe(i | DRV_HARD))
739	    break;
740
741	dsk = malloc(sizeof(struct dsk));
742	dsk->drive = i | DRV_HARD;
743	dsk->type = dsk->drive & TYPE_AD;
744	dsk->unit = i;
745	dsk->slice = 0;
746	dsk->part = 0;
747	dsk->start = 0;
748	dsk->size = 0;
749	probe_drive(dsk);
750    }
751
752    /*
753     * The first discovered pool, if any, is the pool.
754     */
755    spa = spa_get_primary();
756    if (!spa) {
757	printf("%s: No ZFS pools located, can't boot\n", BOOTPROG);
758	for (;;)
759	    ;
760    }
761
762    primary_spa = spa;
763    primary_vdev = spa_get_primary_vdev(spa);
764
765    nextboot = 0;
766    rc  = vdev_read_pad2(primary_vdev, cmd, sizeof(cmd));
767    if (vdev_clear_pad2(primary_vdev))
768	printf("failed to clear pad2 area of primary vdev\n");
769    if (rc == 0) {
770	if (*cmd) {
771	    /*
772	     * We could find an old-style ZFS Boot Block header here.
773	     * Simply ignore it.
774	     */
775	    if (*(uint64_t *)cmd != 0x2f5b007b10c) {
776		/*
777		 * Note that parse() is destructive to cmd[] and we also want
778		 * to honor RBX_QUIET option that could be present in cmd[].
779		 */
780		nextboot = 1;
781		memcpy(cmddup, cmd, sizeof(cmd));
782		if (parse_cmd()) {
783		    printf("failed to parse pad2 area of primary vdev\n");
784		    reboot();
785		}
786		if (!OPT_CHECK(RBX_QUIET))
787		    printf("zfs nextboot: %s\n", cmddup);
788	    }
789	    /* Do not process this command twice */
790	    *cmd = 0;
791	}
792    } else
793	printf("failed to read pad2 area of primary vdev\n");
794
795    /* Mount ZFS only if it's not already mounted via nextboot parsing. */
796    if (zfsmount.spa == NULL &&
797	(zfs_spa_init(spa) != 0 || zfs_mount(spa, 0, &zfsmount) != 0)) {
798	printf("%s: failed to mount default pool %s\n",
799	    BOOTPROG, spa->spa_name);
800	autoboot = 0;
801    } else if (zfs_lookup(&zfsmount, PATH_CONFIG, &dn) == 0 ||
802        zfs_lookup(&zfsmount, PATH_DOTCONFIG, &dn) == 0) {
803	off = 0;
804	zfs_read(spa, &dn, &off, cmd, sizeof(cmd));
805    }
806
807    if (*cmd) {
808	/*
809	 * Note that parse_cmd() is destructive to cmd[] and we also want
810	 * to honor RBX_QUIET option that could be present in cmd[].
811	 */
812	memcpy(cmddup, cmd, sizeof(cmd));
813	if (parse_cmd())
814	    autoboot = 0;
815	if (!OPT_CHECK(RBX_QUIET))
816	    printf("%s: %s\n", PATH_CONFIG, cmddup);
817	/* Do not process this command twice */
818	*cmd = 0;
819    }
820
821    /* Do not risk waiting at the prompt forever. */
822    if (nextboot && !autoboot)
823	reboot();
824
825    /*
826     * Try to exec /boot/loader. If interrupted by a keypress,
827     * or in case of failure, try to load a kernel directly instead.
828     */
829
830    if (autoboot && !*kname) {
831	memcpy(kname, PATH_LOADER_ZFS, sizeof(PATH_LOADER_ZFS));
832	if (!keyhit(3)) {
833	    load();
834	    memcpy(kname, PATH_KERNEL, sizeof(PATH_KERNEL));
835	}
836    }
837
838    /* Present the user with the boot2 prompt. */
839
840    for (;;) {
841	if (!autoboot || !OPT_CHECK(RBX_QUIET)) {
842	    printf("\nFreeBSD/x86 boot\n");
843	    if (zfs_rlookup(spa, zfsmount.rootobj, rootname) != 0)
844		printf("Default: %s/<0x%llx>:%s\n"
845		       "boot: ",
846		       spa->spa_name, zfsmount.rootobj, kname);
847	    else if (rootname[0] != '\0')
848		printf("Default: %s/%s:%s\n"
849		       "boot: ",
850		       spa->spa_name, rootname, kname);
851	    else
852		printf("Default: %s:%s\n"
853		       "boot: ",
854		       spa->spa_name, kname);
855	}
856	if (ioctrl & IO_SERIAL)
857	    sio_flush();
858	if (!autoboot || keyhit(5))
859	    getstr(cmd, sizeof(cmd));
860	else if (!autoboot || !OPT_CHECK(RBX_QUIET))
861	    putchar('\n');
862	autoboot = 0;
863	if (parse_cmd())
864	    putchar('\a');
865	else
866	    load();
867    }
868}
869
/*
 * Hand the exit status 'x' to __exit().
 *
 * XXX - Needed for btxld to link the boot2 binary; do not remove.
 */
void
exit(int x)
{
    __exit(x);
}
876
/*
 * Leave the boot block by calling __exit() with status 0.
 */
void
reboot(void)
{
    __exit(0);
}
882
/*
 * Load the file named by kname[] from the mounted filesystem and start it.
 *
 * The file header is read and classified as either an a.out image with
 * ZMAGIC or an ELF image; anything else is rejected.  The image contents
 * are copied to the addresses named in the header (masked with 0xffffff),
 * symbol data is placed after them and recorded in bootinfo, zfsargs is
 * filled in and control is passed to the image with __exec().
 *
 * Returns only on failure: when the file cannot be found, a read comes up
 * short or the format is not recognised.
 */
static void
load(void)
{
    union {
	struct exec ex;
	Elf32_Ehdr eh;
    } hdr;
    static Elf32_Phdr ep[2];
    static Elf32_Shdr es[2];
    caddr_t p;
    dnode_phys_t dn;
    off_t off;
    uint32_t addr, x;
    int fmt, i, j;

    if (zfs_lookup(&zfsmount, kname, &dn)) {
	printf("\nCan't find %s\n", kname);
	return;
    }
    off = 0;
    if (xfsread(&dn, &off, &hdr, sizeof(hdr)))
	return;
    /* fmt: 0 for a.out, 1 for ELF. */
    if (N_GETMAGIC(hdr.ex) == ZMAGIC)
	fmt = 0;
    else if (IS_ELF(hdr.eh))
	fmt = 1;
    else {
	printf("Invalid %s\n", "format");
	return;
    }
    if (fmt == 0) {
	/* a.out: text is read from file offset PAGE_SIZE, data follows. */
	addr = hdr.ex.a_entry & 0xffffff;
	p = PTOV(addr);
	off = PAGE_SIZE;
	if (xfsread(&dn, &off, p, hdr.ex.a_text))
	    return;
	p += roundup2(hdr.ex.a_text, PAGE_SIZE);
	if (xfsread(&dn, &off, p, hdr.ex.a_data))
	    return;
	/* Skip the data just read and the page-rounded bss. */
	p += hdr.ex.a_data + roundup2(hdr.ex.a_bss, PAGE_SIZE);
	bootinfo.bi_symtab = VTOP(p);
	/* The symbol area starts with the size of the symbol table. */
	memcpy(p, &hdr.ex.a_syms, sizeof(hdr.ex.a_syms));
	p += sizeof(hdr.ex.a_syms);
	if (hdr.ex.a_syms) {
	    if (xfsread(&dn, &off, p, hdr.ex.a_syms))
		return;
	    p += hdr.ex.a_syms;
	    /*
	     * The next word in the file is a length that includes
	     * itself; read it, then the remaining x bytes.
	     */
	    if (xfsread(&dn, &off, p, sizeof(int)))
		return;
	    x = *(uint32_t *)p;
	    p += sizeof(int);
	    x -= sizeof(int);
	    if (xfsread(&dn, &off, p, x))
		return;
	    p += x;
	}
    } else {
	/* ELF: collect the first two PT_LOAD program headers in ep[]. */
	off = hdr.eh.e_phoff;
	for (j = i = 0; i < hdr.eh.e_phnum && j < 2; i++) {
	    if (xfsread(&dn, &off, ep + j, sizeof(ep[0])))
		return;
	    if (ep[j].p_type == PT_LOAD)
		j++;
	}
	/* Copy both segments to their (masked) physical addresses. */
	for (i = 0; i < 2; i++) {
	    p = PTOV(ep[i].p_paddr & 0xffffff);
	    off = ep[i].p_offset;
	    if (xfsread(&dn, &off, p, ep[i].p_filesz))
		return;
	}
	/* Symbol data goes after the second segment's memory image. */
	p += roundup2(ep[1].p_memsz, PAGE_SIZE);
	bootinfo.bi_symtab = VTOP(p);
	/*
	 * Only when exactly two section headers follow the section name
	 * string table: copy each of those sections, preceded by its
	 * size.
	 */
	if (hdr.eh.e_shnum == hdr.eh.e_shstrndx + 3) {
	    off = hdr.eh.e_shoff + sizeof(es[0]) *
		(hdr.eh.e_shstrndx + 1);
	    if (xfsread(&dn, &off, &es, sizeof(es)))
		return;
	    for (i = 0; i < 2; i++) {
		memcpy(p, &es[i].sh_size, sizeof(es[i].sh_size));
		p += sizeof(es[i].sh_size);
		off = es[i].sh_offset;
		if (xfsread(&dn, &off, p, es[i].sh_size))
		    return;
		p += es[i].sh_size;
	    }
	}
	addr = hdr.eh.e_entry & 0xffffff;
    }
    /* p now points just past the last byte of symbol data. */
    bootinfo.bi_esymtab = VTOP(p);
    bootinfo.bi_kernelname = VTOP(kname);
    zfsargs.size = sizeof(zfsargs);
    zfsargs.pool = zfsmount.spa->spa_guid;
    zfsargs.root = zfsmount.rootobj;
    zfsargs.primary_pool = primary_spa->spa_guid;
#ifdef LOADER_GELI_SUPPORT
    /* Wipe the passphrase; the keys are passed on in a key buffer. */
    explicit_bzero(gelipw, sizeof(gelipw));
    gelibuf = malloc(sizeof(struct keybuf) + (GELI_MAX_KEYS * sizeof(struct keybuf_ent)));
    geli_fill_keybuf(gelibuf);
    zfsargs.notapw = '\0';
    zfsargs.keybuf_sentinel = KEYBUF_SENTINEL;
    zfsargs.keybuf = gelibuf;
#else
    zfsargs.gelipw[0] = '\0';
#endif
    if (primary_vdev != NULL)
	zfsargs.primary_vdev = primary_vdev->v_guid;
    else
	printf("failed to detect primary vdev\n");
    /* The pool GUID is passed as two 32-bit halves, low half first. */
    __exec((caddr_t)addr, RB_BOOTINFO | (opts & RBX_MASK),
	   bootdev,
	   KARGS_FLAGS_ZFS | KARGS_FLAGS_EXTARG,
	   (uint32_t) spa->spa_guid,
	   (uint32_t) (spa->spa_guid >> 32),
	   VTOP(&bootinfo),
	   zfsargs);
}
999
1000static int
1001zfs_mount_ds(char *dsname)
1002{
1003    uint64_t newroot;
1004    spa_t *newspa;
1005    char *q;
1006
1007    q = strchr(dsname, '/');
1008    if (q)
1009	*q++ = '\0';
1010    newspa = spa_find_by_name(dsname);
1011    if (newspa == NULL) {
1012	printf("\nCan't find ZFS pool %s\n", dsname);
1013	return -1;
1014    }
1015
1016    if (zfs_spa_init(newspa))
1017	return -1;
1018
1019    newroot = 0;
1020    if (q) {
1021	if (zfs_lookup_dataset(newspa, q, &newroot)) {
1022	    printf("\nCan't find dataset %s in ZFS pool %s\n",
1023		    q, newspa->spa_name);
1024	    return -1;
1025	}
1026    }
1027    if (zfs_mount(newspa, newroot, &zfsmount)) {
1028	printf("\nCan't mount ZFS dataset\n");
1029	return -1;
1030    }
1031    spa = newspa;
1032    return (0);
1033}
1034
1035static int
1036parse_cmd(void)
1037{
1038    char *arg = cmd;
1039    char *ep, *p, *q;
1040    const char *cp;
1041    int c, i, j;
1042
1043    while ((c = *arg++)) {
1044	if (c == ' ' || c == '\t' || c == '\n')
1045	    continue;
1046	for (p = arg; *p && *p != '\n' && *p != ' ' && *p != '\t'; p++);
1047	ep = p;
1048	if (*p)
1049	    *p++ = 0;
1050	if (c == '-') {
1051	    while ((c = *arg++)) {
1052		if (c == 'P') {
1053		    if (*(uint8_t *)PTOV(0x496) & 0x10) {
1054			cp = "yes";
1055		    } else {
1056			opts |= OPT_SET(RBX_DUAL) | OPT_SET(RBX_SERIAL);
1057			cp = "no";
1058		    }
1059		    printf("Keyboard: %s\n", cp);
1060		    continue;
1061		} else if (c == 'S') {
1062		    j = 0;
1063		    while ((unsigned int)(i = *arg++ - '0') <= 9)
1064			j = j * 10 + i;
1065		    if (j > 0 && i == -'0') {
1066			comspeed = j;
1067			break;
1068		    }
1069		    /* Fall through to error below ('S' not in optstr[]). */
1070		}
1071		for (i = 0; c != optstr[i]; i++)
1072		    if (i == NOPT - 1)
1073			return -1;
1074		opts ^= OPT_SET(flags[i]);
1075	    }
1076	    ioctrl = OPT_CHECK(RBX_DUAL) ? (IO_SERIAL|IO_KEYBOARD) :
1077		     OPT_CHECK(RBX_SERIAL) ? IO_SERIAL : IO_KEYBOARD;
1078	    if (ioctrl & IO_SERIAL) {
1079	        if (sio_init(115200 / comspeed) != 0)
1080		    ioctrl &= ~IO_SERIAL;
1081	    }
1082	} if (c == '?') {
1083	    dnode_phys_t dn;
1084
1085	    if (zfs_lookup(&zfsmount, arg, &dn) == 0) {
1086		zap_list(spa, &dn);
1087	    }
1088	    return -1;
1089	} else {
1090	    arg--;
1091
1092	    /*
1093	     * Report pool status if the comment is 'status'. Lets
1094	     * hope no-one wants to load /status as a kernel.
1095	     */
1096	    if (!strcmp(arg, "status")) {
1097		spa_all_status();
1098		return -1;
1099	    }
1100
1101	    /*
1102	     * If there is "zfs:" prefix simply ignore it.
1103	     */
1104	    if (strncmp(arg, "zfs:", 4) == 0)
1105		arg += 4;
1106
1107	    /*
1108	     * If there is a colon, switch pools.
1109	     */
1110	    q = strchr(arg, ':');
1111	    if (q) {
1112		*q++ = '\0';
1113		if (zfs_mount_ds(arg) != 0)
1114		    return -1;
1115		arg = q;
1116	    }
1117	    if ((i = ep - arg)) {
1118		if ((size_t)i >= sizeof(kname))
1119		    return -1;
1120		memcpy(kname, arg, i + 1);
1121	    }
1122	}
1123	arg = p;
1124    }
1125    return 0;
1126}
1127