1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
25 * Copyright 2017 Nexenta Systems, Inc.
26 */
27
/*
 * The 512-byte leaf is broken into 32 16-byte chunks.
 * Chunk number n means l_chunk[n], even though the header precedes it.
 * The names are stored null-terminated.
 */
33
34#include <sys/zio.h>
35#include <sys/spa.h>
36#include <sys/dmu.h>
37#include <sys/zfs_context.h>
38#include <sys/fs/zfs.h>
39#include <sys/zap.h>
40#include <sys/zap_impl.h>
41#include <sys/zap_leaf.h>
42#include <sys/arc.h>
43
44static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, struct zap_leaf_entry *le,
45    uint16_t entry);
46
47#define	CHAIN_END 0xffff /* end of the chunk chain */
48
49#define	LEAF_HASH(l, h) \
50	((ZAP_LEAF_HASH_NUMENTRIES(l)-1) & \
51	((h) >> \
52	(64 - ZAP_LEAF_HASH_SHIFT(l) - zap_leaf_phys(l)->l_hdr.lh_prefix_len)))
53
54#define	LEAF_HASH_ENTPTR(l, h)	(&zap_leaf_phys(l)->l_hash[LEAF_HASH(l, h)])
55
/*
 * Store an unsigned integer of the given width (1, 2, 4, or 8 bytes)
 * at the given (suitably aligned) address.
 */
static void
stv(int len, void *addr, uint64_t value)
{
	switch (len) {
	case 1:
		*(uint8_t *)addr = (uint8_t)value;
		break;
	case 2:
		*(uint16_t *)addr = (uint16_t)value;
		break;
	case 4:
		*(uint32_t *)addr = (uint32_t)value;
		break;
	case 8:
		*(uint64_t *)addr = value;
		break;
	default:
		PANIC("bad int len %d", len);
	}
}
76
/*
 * Load an unsigned integer of the given width (1, 2, 4, or 8 bytes)
 * from the given (suitably aligned) address.
 */
static uint64_t
ldv(int len, const void *addr)
{
	uint64_t value = 0xFEEDFACEDEADBEEFULL;

	switch (len) {
	case 1:
		value = *(const uint8_t *)addr;
		break;
	case 2:
		value = *(const uint16_t *)addr;
		break;
	case 4:
		value = *(const uint32_t *)addr;
		break;
	case 8:
		value = *(const uint64_t *)addr;
		break;
	default:
		PANIC("bad int len %d", len);
	}
	return (value);
}
94
/*
 * Byteswap an entire leaf block in place: the header, the hash table,
 * and each chunk according to its type (entry, free, or array).
 * 'size' is the leaf block size in bytes.
 */
void
zap_leaf_byteswap(zap_leaf_phys_t *buf, size_t size)
{
	zap_leaf_t l;
	dmu_buf_t l_dbuf;

	/*
	 * Build a minimal fake zap_leaf_t so the ZAP_LEAF_* macros can
	 * derive the chunk and hash-table geometry from the block size.
	 */
	l_dbuf.db_data = buf;
	l.l_bs = highbit64(size) - 1;	/* log2 of the block size */
	l.l_dbuf = &l_dbuf;

	buf->l_hdr.lh_block_type =	BSWAP_64(buf->l_hdr.lh_block_type);
	buf->l_hdr.lh_prefix =		BSWAP_64(buf->l_hdr.lh_prefix);
	buf->l_hdr.lh_magic =		BSWAP_32(buf->l_hdr.lh_magic);
	buf->l_hdr.lh_nfree =		BSWAP_16(buf->l_hdr.lh_nfree);
	buf->l_hdr.lh_nentries =	BSWAP_16(buf->l_hdr.lh_nentries);
	buf->l_hdr.lh_prefix_len =	BSWAP_16(buf->l_hdr.lh_prefix_len);
	buf->l_hdr.lh_freelist =	BSWAP_16(buf->l_hdr.lh_freelist);

	/* Hash-table entries are 16-bit chunk indices. */
	for (uint_t i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++)
		buf->l_hash[i] = BSWAP_16(buf->l_hash[i]);

	for (uint_t i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) {
		zap_leaf_chunk_t *lc = &ZAP_LEAF_CHUNK(&l, i);
		struct zap_leaf_entry *le;

		/*
		 * The type byte is at the same offset in every chunk
		 * variant, so it is safe to read it via l_free before
		 * we know which variant this chunk is.
		 */
		switch (lc->l_free.lf_type) {
		case ZAP_CHUNK_ENTRY:
			le = &lc->l_entry;

			le->le_type =		BSWAP_8(le->le_type);
			le->le_value_intlen =	BSWAP_8(le->le_value_intlen);
			le->le_next =		BSWAP_16(le->le_next);
			le->le_name_chunk =	BSWAP_16(le->le_name_chunk);
			le->le_name_numints =	BSWAP_16(le->le_name_numints);
			le->le_value_chunk =	BSWAP_16(le->le_value_chunk);
			le->le_value_numints =	BSWAP_16(le->le_value_numints);
			le->le_cd =		BSWAP_32(le->le_cd);
			le->le_hash =		BSWAP_64(le->le_hash);
			break;
		case ZAP_CHUNK_FREE:
			lc->l_free.lf_type =	BSWAP_8(lc->l_free.lf_type);
			lc->l_free.lf_next =	BSWAP_16(lc->l_free.lf_next);
			break;
		case ZAP_CHUNK_ARRAY:
			lc->l_array.la_type =	BSWAP_8(lc->l_array.la_type);
			lc->l_array.la_next =	BSWAP_16(lc->l_array.la_next);
			/* la_array doesn't need swapping */
			break;
		default:
			cmn_err(CE_PANIC, "bad leaf type %d",
			    lc->l_free.lf_type);
		}
	}
}
149
/*
 * Initialize a freshly-allocated leaf block: zero the header, empty
 * every hash bucket, and string all chunks onto the freelist.  If
 * 'sort' is set, mark the leaf's entry chains as kept sorted by cd.
 */
void
zap_leaf_init(zap_leaf_t *l, boolean_t sort)
{
	l->l_bs = highbit64(l->l_dbuf->db_size) - 1;
	memset(&zap_leaf_phys(l)->l_hdr, 0,
	    sizeof (struct zap_leaf_header));
	/*
	 * CHAIN_END is 0xffff; memset writes its low byte (0xff) into
	 * both bytes of each 16-bit hash entry, which yields CHAIN_END
	 * in every bucket.
	 */
	memset(zap_leaf_phys(l)->l_hash, CHAIN_END,
	    2*ZAP_LEAF_HASH_NUMENTRIES(l));
	/* Chain each chunk to its successor... */
	for (uint_t i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
		ZAP_LEAF_CHUNK(l, i).l_free.lf_type = ZAP_CHUNK_FREE;
		ZAP_LEAF_CHUNK(l, i).l_free.lf_next = i+1;
	}
	/* ...and terminate the freelist at the last chunk. */
	ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)-1).l_free.lf_next = CHAIN_END;
	zap_leaf_phys(l)->l_hdr.lh_block_type = ZBT_LEAF;
	zap_leaf_phys(l)->l_hdr.lh_magic = ZAP_LEAF_MAGIC;
	zap_leaf_phys(l)->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l);
	if (sort)
		zap_leaf_phys(l)->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED;
}
169
170/*
171 * Routines which manipulate leaf chunks (l_chunk[]).
172 */
173
174static uint16_t
175zap_leaf_chunk_alloc(zap_leaf_t *l)
176{
177	ASSERT(zap_leaf_phys(l)->l_hdr.lh_nfree > 0);
178
179	uint_t chunk = zap_leaf_phys(l)->l_hdr.lh_freelist;
180	ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
181	ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_free.lf_type, ==, ZAP_CHUNK_FREE);
182
183	zap_leaf_phys(l)->l_hdr.lh_freelist =
184	    ZAP_LEAF_CHUNK(l, chunk).l_free.lf_next;
185
186	zap_leaf_phys(l)->l_hdr.lh_nfree--;
187
188	return (chunk);
189}
190
191static void
192zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk)
193{
194	struct zap_leaf_free *zlf = &ZAP_LEAF_CHUNK(l, chunk).l_free;
195	ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_nfree, <, ZAP_LEAF_NUMCHUNKS(l));
196	ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
197	ASSERT(zlf->lf_type != ZAP_CHUNK_FREE);
198
199	zlf->lf_type = ZAP_CHUNK_FREE;
200	zlf->lf_next = zap_leaf_phys(l)->l_hdr.lh_freelist;
201	memset(zlf->lf_pad, 0, sizeof (zlf->lf_pad)); /* help it to compress */
202	zap_leaf_phys(l)->l_hdr.lh_freelist = chunk;
203
204	zap_leaf_phys(l)->l_hdr.lh_nfree++;
205}
206
207/*
208 * Routines which manipulate leaf arrays (zap_leaf_array type chunks).
209 */
210
/*
 * Store an array of integers (a name or a value) into a chain of
 * newly-allocated ZAP_CHUNK_ARRAY chunks, packing each integer
 * most-significant-byte first.  Returns the index of the first chunk
 * in the chain.  The caller must ensure enough free chunks exist.
 */
static uint16_t
zap_leaf_array_create(zap_leaf_t *l, const char *buf,
    int integer_size, int num_integers)
{
	uint16_t chunk_head;
	uint16_t *chunkp = &chunk_head;
	int byten = integer_size;	/* bytes left in the current integer */
	uint64_t value = 0;
	int shift = (integer_size - 1) * 8;	/* selects value's MSB */
	int len = num_integers;

	ASSERT3U(num_integers * integer_size, <=, ZAP_MAXVALUELEN);

	if (len > 0)
		value = ldv(integer_size, buf);
	while (len > 0) {
		uint16_t chunk = zap_leaf_chunk_alloc(l);
		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;

		la->la_type = ZAP_CHUNK_ARRAY;
		for (int i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) {
			/* Emit one byte, MSB first, shifting value up. */
			la->la_array[i] = value >> shift;
			value <<= 8;
			if (--byten == 0) {
				/* Finished an integer; advance to the next. */
				if (--len == 0)
					break;
				byten = integer_size;
				buf += integer_size;
				value = ldv(integer_size, buf);
			}
		}

		/* Link this chunk onto the chain being built. */
		*chunkp = chunk;
		chunkp = &la->la_next;
	}
	*chunkp = CHAIN_END;

	return (chunk_head);
}
250
251static void
252zap_leaf_array_free(zap_leaf_t *l, uint16_t *chunkp)
253{
254	uint16_t chunk = *chunkp;
255
256	*chunkp = CHAIN_END;
257
258	while (chunk != CHAIN_END) {
259		uint_t nextchunk = ZAP_LEAF_CHUNK(l, chunk).l_array.la_next;
260		ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_array.la_type, ==,
261		    ZAP_CHUNK_ARRAY);
262		zap_leaf_chunk_free(l, chunk);
263		chunk = nextchunk;
264	}
265}
266
/*
 * Read an array chunk chain into buf, widening each stored integer to
 * buf_int_len bytes (the stored width must be <= the buffer's width).
 * array_len and buf_len are in integers, not bytes.
 */
static void
zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk,
    int array_int_len, int array_len, int buf_int_len, uint64_t buf_len,
    void *buf)
{
	int len = MIN(array_len, buf_len);
	int byten = 0;
	uint64_t value = 0;
	char *p = buf;

	ASSERT3U(array_int_len, <=, buf_int_len);

	/* Fast path for one 8-byte integer */
	if (array_int_len == 8 && buf_int_len == 8 && len == 1) {
		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
		uint8_t *ip = la->la_array;
		uint64_t *buf64 = buf;

		/* Reassemble the MSB-first on-disk bytes into a native u64. */
		*buf64 = (uint64_t)ip[0] << 56 | (uint64_t)ip[1] << 48 |
		    (uint64_t)ip[2] << 40 | (uint64_t)ip[3] << 32 |
		    (uint64_t)ip[4] << 24 | (uint64_t)ip[5] << 16 |
		    (uint64_t)ip[6] << 8 | (uint64_t)ip[7];
		return;
	}

	/* Fast path for an array of 1-byte integers (eg. the entry name) */
	if (array_int_len == 1 && buf_int_len == 1 &&
	    buf_len > array_len + ZAP_LEAF_ARRAY_BYTES) {
		/*
		 * Copy whole chunks at a time; the size check above
		 * guarantees the buffer can absorb the last chunk's full
		 * ZAP_LEAF_ARRAY_BYTES even if the array ends mid-chunk.
		 */
		while (chunk != CHAIN_END) {
			struct zap_leaf_array *la =
			    &ZAP_LEAF_CHUNK(l, chunk).l_array;
			memcpy(p, la->la_array, ZAP_LEAF_ARRAY_BYTES);
			p += ZAP_LEAF_ARRAY_BYTES;
			chunk = la->la_next;
		}
		return;
	}

	/* General path: rebuild each integer byte-by-byte, MSB first. */
	while (len > 0) {
		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;

		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
		for (int i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) {
			value = (value << 8) | la->la_array[i];
			byten++;
			if (byten == array_int_len) {
				/* A full integer accumulated; store it. */
				stv(buf_int_len, p, value);
				byten = 0;
				len--;
				if (len == 0)
					return;
				p += buf_int_len;
			}
		}
		chunk = la->la_next;
	}
}
325
/*
 * Compare the key stored in the given array chunk chain against the
 * key in zn.  Returns B_TRUE on a match -- exact for binary and
 * casesensitive keys, or normalized when MT_NORMALIZE is set.
 */
static boolean_t
zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn,
    uint_t chunk, int array_numints)
{
	int bseen = 0;

	if (zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY) {
		/* Binary (uint64-array) key: read it out and memcmp. */
		uint64_t *thiskey =
		    kmem_alloc(array_numints * sizeof (*thiskey), KM_SLEEP);
		ASSERT(zn->zn_key_intlen == sizeof (*thiskey));

		zap_leaf_array_read(l, chunk, sizeof (*thiskey), array_numints,
		    sizeof (*thiskey), array_numints, thiskey);
		boolean_t match = memcmp(thiskey, zn->zn_key_orig,
		    array_numints * sizeof (*thiskey)) == 0;
		kmem_free(thiskey, array_numints * sizeof (*thiskey));
		return (match);
	}

	ASSERT(zn->zn_key_intlen == 1);
	if (zn->zn_matchtype & MT_NORMALIZE) {
		/* Normalized compare requires the full stored name. */
		char *thisname = kmem_alloc(array_numints, KM_SLEEP);

		zap_leaf_array_read(l, chunk, sizeof (char), array_numints,
		    sizeof (char), array_numints, thisname);
		boolean_t match = zap_match(zn, thisname);
		kmem_free(thisname, array_numints);
		return (match);
	}

	/*
	 * Fast path for exact matching.
	 * First check that the lengths match, so that we don't read
	 * past the end of the zn_key_orig array.
	 */
	if (array_numints != zn->zn_key_orig_numints)
		return (B_FALSE);
	/* Compare one chunk's worth of bytes at a time. */
	while (bseen < array_numints) {
		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
		int toread = MIN(array_numints - bseen, ZAP_LEAF_ARRAY_BYTES);
		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
		if (memcmp(la->la_array, (char *)zn->zn_key_orig + bseen,
		    toread))
			break;
		chunk = la->la_next;
		bseen += toread;
	}
	return (bseen == array_numints);
}
375
376/*
377 * Routines which manipulate leaf entries.
378 */
379
/*
 * Look up the entry named by zn in this leaf.  On success, fill in
 * the entry handle zeh and return 0; return ENOENT if no matching
 * entry is present.
 */
int
zap_leaf_lookup(zap_leaf_t *l, zap_name_t *zn, zap_entry_handle_t *zeh)
{
	struct zap_leaf_entry *le;

	ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);

	/* Walk the hash chain for this hash value. */
	for (uint16_t *chunkp = LEAF_HASH_ENTPTR(l, zn->zn_hash);
	    *chunkp != CHAIN_END; chunkp = &le->le_next) {
		uint16_t chunk = *chunkp;
		le = ZAP_LEAF_ENTRY(l, chunk);

		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
		ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);

		/* Cheap rejection before comparing the full name. */
		if (le->le_hash != zn->zn_hash)
			continue;

		/*
		 * NB: the entry chain is always sorted by cd on
		 * normalized zap objects, so this will find the
		 * lowest-cd match for MT_NORMALIZE.
		 */
		ASSERT((zn->zn_matchtype == 0) ||
		    (zap_leaf_phys(l)->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED));
		if (zap_leaf_array_match(l, zn, le->le_name_chunk,
		    le->le_name_numints)) {
			zeh->zeh_num_integers = le->le_value_numints;
			zeh->zeh_integer_size = le->le_value_intlen;
			zeh->zeh_cd = le->le_cd;
			zeh->zeh_hash = le->le_hash;
			zeh->zeh_chunkp = chunkp;
			zeh->zeh_leaf = l;
			return (0);
		}
	}

	return (SET_ERROR(ENOENT));
}
419
420/* Return (h1,cd1 >= h2,cd2) */
421#define	HCD_GTEQ(h1, cd1, h2, cd2) \
422	((h1 > h2) ? TRUE : ((h1 == h2 && cd1 >= cd2) ? TRUE : FALSE))
423
/*
 * Find the entry with the smallest (hash, cd) that is >= (h, cd).
 * On success the handle's chunk pointer refers to zeh_fakechunk, so
 * the entry can be read but not unlinked through this handle.
 * Returns ENOENT if no such entry exists in this leaf.
 */
int
zap_leaf_lookup_closest(zap_leaf_t *l,
    uint64_t h, uint32_t cd, zap_entry_handle_t *zeh)
{
	uint64_t besth = -1ULL;		/* best (smallest >= target) so far */
	uint32_t bestcd = -1U;		/* -1U also serves as "none found" */
	uint16_t bestlh = ZAP_LEAF_HASH_NUMENTRIES(l)-1;
	struct zap_leaf_entry *le;

	ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);

	/*
	 * Scan buckets starting at the target hash's bucket; buckets
	 * past bestlh cannot contain anything smaller than the best
	 * match already found.
	 */
	for (uint16_t lh = LEAF_HASH(l, h); lh <= bestlh; lh++) {
		for (uint16_t chunk = zap_leaf_phys(l)->l_hash[lh];
		    chunk != CHAIN_END; chunk = le->le_next) {
			le = ZAP_LEAF_ENTRY(l, chunk);

			ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
			ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);

			/* Keep the entry if target <= entry <= best. */
			if (HCD_GTEQ(le->le_hash, le->le_cd, h, cd) &&
			    HCD_GTEQ(besth, bestcd, le->le_hash, le->le_cd)) {
				ASSERT3U(bestlh, >=, lh);
				bestlh = lh;
				besth = le->le_hash;
				bestcd = le->le_cd;

				zeh->zeh_num_integers = le->le_value_numints;
				zeh->zeh_integer_size = le->le_value_intlen;
				zeh->zeh_cd = le->le_cd;
				zeh->zeh_hash = le->le_hash;
				zeh->zeh_fakechunk = chunk;
				zeh->zeh_chunkp = &zeh->zeh_fakechunk;
				zeh->zeh_leaf = l;
			}
		}
	}

	return (bestcd == -1U ? SET_ERROR(ENOENT) : 0);
}
463
464int
465zap_entry_read(const zap_entry_handle_t *zeh,
466    uint8_t integer_size, uint64_t num_integers, void *buf)
467{
468	struct zap_leaf_entry *le =
469	    ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp);
470	ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
471
472	if (le->le_value_intlen > integer_size)
473		return (SET_ERROR(EINVAL));
474
475	zap_leaf_array_read(zeh->zeh_leaf, le->le_value_chunk,
476	    le->le_value_intlen, le->le_value_numints,
477	    integer_size, num_integers, buf);
478
479	if (zeh->zeh_num_integers > num_integers)
480		return (SET_ERROR(EOVERFLOW));
481	return (0);
482
483}
484
485int
486zap_entry_read_name(zap_t *zap, const zap_entry_handle_t *zeh, uint16_t buflen,
487    char *buf)
488{
489	struct zap_leaf_entry *le =
490	    ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp);
491	ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
492
493	if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
494		zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 8,
495		    le->le_name_numints, 8, buflen / 8, buf);
496	} else {
497		zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 1,
498		    le->le_name_numints, 1, buflen, buf);
499	}
500	if (le->le_name_numints > buflen)
501		return (SET_ERROR(EOVERFLOW));
502	return (0);
503}
504
/*
 * Replace the value of the entry referenced by zeh.  Returns EAGAIN
 * (leaving the entry untouched) if the leaf lacks enough free chunks
 * for the larger value; the caller is expected to make room and retry.
 */
int
zap_entry_update(zap_entry_handle_t *zeh,
    uint8_t integer_size, uint64_t num_integers, const void *buf)
{
	zap_leaf_t *l = zeh->zeh_leaf;
	struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, *zeh->zeh_chunkp);

	/* Chunks needed by the new value, minus those the old one frees. */
	int delta_chunks = ZAP_LEAF_ARRAY_NCHUNKS(num_integers * integer_size) -
	    ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints * le->le_value_intlen);

	if ((int)zap_leaf_phys(l)->l_hdr.lh_nfree < delta_chunks)
		return (SET_ERROR(EAGAIN));

	/* Free first; the delta check guarantees the create will succeed. */
	zap_leaf_array_free(l, &le->le_value_chunk);
	le->le_value_chunk =
	    zap_leaf_array_create(l, buf, integer_size, num_integers);
	le->le_value_numints = num_integers;
	le->le_value_intlen = integer_size;
	return (0);
}
525
526void
527zap_entry_remove(zap_entry_handle_t *zeh)
528{
529	zap_leaf_t *l = zeh->zeh_leaf;
530
531	ASSERT3P(zeh->zeh_chunkp, !=, &zeh->zeh_fakechunk);
532
533	uint16_t entry_chunk = *zeh->zeh_chunkp;
534	struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry_chunk);
535	ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
536
537	zap_leaf_array_free(l, &le->le_name_chunk);
538	zap_leaf_array_free(l, &le->le_value_chunk);
539
540	*zeh->zeh_chunkp = le->le_next;
541	zap_leaf_chunk_free(l, entry_chunk);
542
543	zap_leaf_phys(l)->l_hdr.lh_nentries--;
544}
545
/*
 * Create a new entry in this leaf for the given name and value.  If
 * cd is ZAP_NEED_CD, the lowest collision differentiator not already
 * in use for this hash value is chosen.  Returns:
 *	E2BIG	- the entry can never fit in a leaf of this size;
 *	EAGAIN	- not enough free chunks right now (caller should
 *		  split the leaf and retry);
 *	0	- success, with zeh filled in.
 */
int
zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd,
    uint8_t integer_size, uint64_t num_integers, const void *buf,
    zap_entry_handle_t *zeh)
{
	uint16_t chunk;
	struct zap_leaf_entry *le;
	uint64_t h = zn->zn_hash;

	uint64_t valuelen = integer_size * num_integers;

	/* One entry chunk, plus array chunks for the name and value. */
	uint_t numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints *
	    zn->zn_key_intlen) + ZAP_LEAF_ARRAY_NCHUNKS(valuelen);
	if (numchunks > ZAP_LEAF_NUMCHUNKS(l))
		return (SET_ERROR(E2BIG));

	if (cd == ZAP_NEED_CD) {
		/* find the lowest unused cd */
		if (zap_leaf_phys(l)->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED) {
			cd = 0;

			/*
			 * The chain is sorted by cd, so one pass finds
			 * the first gap in the cd sequence for hash h.
			 */
			for (chunk = *LEAF_HASH_ENTPTR(l, h);
			    chunk != CHAIN_END; chunk = le->le_next) {
				le = ZAP_LEAF_ENTRY(l, chunk);
				if (le->le_cd > cd)
					break;
				if (le->le_hash == h) {
					ASSERT3U(cd, ==, le->le_cd);
					cd++;
				}
			}
		} else {
			/* old unsorted format; do it the O(n^2) way */
			for (cd = 0; ; cd++) {
				for (chunk = *LEAF_HASH_ENTPTR(l, h);
				    chunk != CHAIN_END; chunk = le->le_next) {
					le = ZAP_LEAF_ENTRY(l, chunk);
					if (le->le_hash == h &&
					    le->le_cd == cd) {
						break;
					}
				}
				/* If this cd is not in use, we are good. */
				if (chunk == CHAIN_END)
					break;
			}
		}
		/*
		 * We would run out of space in a block before we could
		 * store enough entries to run out of CD values.
		 */
		ASSERT3U(cd, <, zap_maxcd(zn->zn_zap));
	}

	if (zap_leaf_phys(l)->l_hdr.lh_nfree < numchunks)
		return (SET_ERROR(EAGAIN));

	/* make the entry */
	chunk = zap_leaf_chunk_alloc(l);
	le = ZAP_LEAF_ENTRY(l, chunk);
	le->le_type = ZAP_CHUNK_ENTRY;
	le->le_name_chunk = zap_leaf_array_create(l, zn->zn_key_orig,
	    zn->zn_key_intlen, zn->zn_key_orig_numints);
	le->le_name_numints = zn->zn_key_orig_numints;
	le->le_value_chunk =
	    zap_leaf_array_create(l, buf, integer_size, num_integers);
	le->le_value_numints = num_integers;
	le->le_value_intlen = integer_size;
	le->le_hash = h;
	le->le_cd = cd;

	/* link it into the hash chain */
	/* XXX if we did the search above, we could just use that */
	uint16_t *chunkp = zap_leaf_rehash_entry(l, le, chunk);

	zap_leaf_phys(l)->l_hdr.lh_nentries++;

	/* Fill in the handle for the caller. */
	zeh->zeh_leaf = l;
	zeh->zeh_num_integers = num_integers;
	zeh->zeh_integer_size = le->le_value_intlen;
	zeh->zeh_cd = le->le_cd;
	zeh->zeh_hash = le->le_hash;
	zeh->zeh_chunkp = chunkp;

	return (0);
}
632
633/*
634 * Determine if there is another entry with the same normalized form.
635 * For performance purposes, either zn or name must be provided (the
636 * other can be NULL).  Note, there usually won't be any hash
637 * conflicts, in which case we don't need the concatenated/normalized
638 * form of the name.  But all callers have one of these on hand anyway,
639 * so might as well take advantage.  A cleaner but slower interface
640 * would accept neither argument, and compute the normalized name as
641 * needed (using zap_name_alloc_str(zap_entry_read_name(zeh))).
642 */
boolean_t
zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn,
    const char *name, zap_t *zap)
{
	struct zap_leaf_entry *le;
	boolean_t allocdzn = B_FALSE;

	/* Non-normalized zaps cannot have normalization conflicts. */
	if (zap->zap_normflags == 0)
		return (B_FALSE);

	/* Check every other entry sharing this entry's hash value. */
	for (uint16_t chunk = *LEAF_HASH_ENTPTR(zeh->zeh_leaf, zeh->zeh_hash);
	    chunk != CHAIN_END; chunk = le->le_next) {
		le = ZAP_LEAF_ENTRY(zeh->zeh_leaf, chunk);
		if (le->le_hash != zeh->zeh_hash)
			continue;
		/* Skip the entry we were given (same cd == same entry). */
		if (le->le_cd == zeh->zeh_cd)
			continue;

		if (zn == NULL) {
			/* Lazily compute the normalized name from 'name'. */
			zn = zap_name_alloc_str(zap, name, MT_NORMALIZE);
			allocdzn = B_TRUE;
		}
		if (zap_leaf_array_match(zeh->zeh_leaf, zn,
		    le->le_name_chunk, le->le_name_numints)) {
			if (allocdzn)
				zap_name_free(zn);
			return (B_TRUE);
		}
	}
	if (allocdzn)
		zap_name_free(zn);
	return (B_FALSE);
}
676
677/*
678 * Routines for transferring entries between leafs.
679 */
680
681static uint16_t *
682zap_leaf_rehash_entry(zap_leaf_t *l, struct zap_leaf_entry *le, uint16_t entry)
683{
684	struct zap_leaf_entry *le2;
685	uint16_t *chunkp;
686
687	/*
688	 * keep the entry chain sorted by cd
689	 * NB: this will not cause problems for unsorted leafs, though
690	 * it is unnecessary there.
691	 */
692	for (chunkp = LEAF_HASH_ENTPTR(l, le->le_hash);
693	    *chunkp != CHAIN_END; chunkp = &le2->le_next) {
694		le2 = ZAP_LEAF_ENTRY(l, *chunkp);
695		if (le2->le_cd > le->le_cd)
696			break;
697	}
698
699	le->le_next = *chunkp;
700	*chunkp = entry;
701	return (chunkp);
702}
703
704static uint16_t
705zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl)
706{
707	uint16_t new_chunk;
708	uint16_t *nchunkp = &new_chunk;
709
710	while (chunk != CHAIN_END) {
711		uint16_t nchunk = zap_leaf_chunk_alloc(nl);
712		struct zap_leaf_array *nla =
713		    &ZAP_LEAF_CHUNK(nl, nchunk).l_array;
714		struct zap_leaf_array *la =
715		    &ZAP_LEAF_CHUNK(l, chunk).l_array;
716		uint_t nextchunk = la->la_next;
717
718		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
719		ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS(l));
720
721		*nla = *la; /* structure assignment */
722
723		zap_leaf_chunk_free(l, chunk);
724		chunk = nextchunk;
725		*nchunkp = nchunk;
726		nchunkp = &nla->la_next;
727	}
728	*nchunkp = CHAIN_END;
729	return (new_chunk);
730}
731
/*
 * Move the entry at chunk index 'entry' -- along with its name and
 * value array chains -- from leaf l to leaf nl, rehashing it into
 * nl's hash table and updating both leaves' entry counts.
 */
static void
zap_leaf_transfer_entry(zap_leaf_t *l, uint_t entry, zap_leaf_t *nl)
{
	struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry);
	ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);

	uint16_t chunk = zap_leaf_chunk_alloc(nl);
	struct zap_leaf_entry *nle = ZAP_LEAF_ENTRY(nl, chunk);
	*nle = *le; /* structure assignment */

	(void) zap_leaf_rehash_entry(nl, nle, chunk);

	/* Move the name/value chains; this also frees the old chunks. */
	nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl);
	nle->le_value_chunk =
	    zap_leaf_transfer_array(l, le->le_value_chunk, nl);

	zap_leaf_chunk_free(l, entry);

	zap_leaf_phys(l)->l_hdr.lh_nentries--;
	zap_leaf_phys(nl)->l_hdr.lh_nentries++;
}
753
754/*
755 * Transfer the entries whose hash prefix ends in 1 to the new leaf.
756 */
void
zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort)
{
	/* The hash bit that now distinguishes l (0) from nl (1). */
	uint_t bit = 64 - 1 - zap_leaf_phys(l)->l_hdr.lh_prefix_len;

	/* set new prefix and prefix_len */
	zap_leaf_phys(l)->l_hdr.lh_prefix <<= 1;
	zap_leaf_phys(l)->l_hdr.lh_prefix_len++;
	zap_leaf_phys(nl)->l_hdr.lh_prefix =
	    zap_leaf_phys(l)->l_hdr.lh_prefix | 1;
	zap_leaf_phys(nl)->l_hdr.lh_prefix_len =
	    zap_leaf_phys(l)->l_hdr.lh_prefix_len;

	/* break existing hash chains */
	memset(zap_leaf_phys(l)->l_hash, CHAIN_END,
	    2*ZAP_LEAF_HASH_NUMENTRIES(l));

	/* Once the chains are rebuilt below, they will be cd-sorted. */
	if (sort)
		zap_leaf_phys(l)->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED;

	/*
	 * Transfer entries whose hash bit 'bit' is set to nl; rehash
	 * the remaining entries
	 *
	 * NB: We could find entries via the hashtable instead. That
	 * would be O(hashents+numents) rather than O(numblks+numents),
	 * but this accesses memory more sequentially, and when we're
	 * called, the block is usually pretty full.
	 */
	for (uint_t i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
		struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, i);
		if (le->le_type != ZAP_CHUNK_ENTRY)
			continue;

		if (le->le_hash & (1ULL << bit))
			zap_leaf_transfer_entry(l, i, nl);
		else
			(void) zap_leaf_rehash_entry(l, le, i);
	}
}
797
798void
799zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs)
800{
801	uint_t n = zap_f_phys(zap)->zap_ptrtbl.zt_shift -
802	    zap_leaf_phys(l)->l_hdr.lh_prefix_len;
803	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
804	zs->zs_leafs_with_2n_pointers[n]++;
805
806
807	n = zap_leaf_phys(l)->l_hdr.lh_nentries/5;
808	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
809	zs->zs_blocks_with_n5_entries[n]++;
810
811	n = ((1<<FZAP_BLOCK_SHIFT(zap)) -
812	    zap_leaf_phys(l)->l_hdr.lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 /
813	    (1<<FZAP_BLOCK_SHIFT(zap));
814	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
815	zs->zs_blocks_n_tenths_full[n]++;
816
817	for (uint_t i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) {
818		uint_t nentries = 0;
819		uint_t chunk = zap_leaf_phys(l)->l_hash[i];
820
821		while (chunk != CHAIN_END) {
822			struct zap_leaf_entry *le =
823			    ZAP_LEAF_ENTRY(l, chunk);
824
825			n = 1 + ZAP_LEAF_ARRAY_NCHUNKS(le->le_name_numints) +
826			    ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints *
827			    le->le_value_intlen);
828			n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
829			zs->zs_entries_using_n_chunks[n]++;
830
831			chunk = le->le_next;
832			nentries++;
833		}
834
835		n = nentries;
836		n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
837		zs->zs_buckets_with_n_entries[n]++;
838	}
839}
840