/*
 * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html)
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 2016-2018, Klara Inc.
 * Copyright (c) 2016-2018, Allan Jude
 * Copyright (c) 2018-2020, Sebastian Gottschall
 * Copyright (c) 2019-2020, Michael Niewöhner
 * Copyright (c) 2020, The FreeBSD Foundation [1]
 *
 * [1] Portions of this software were developed by Allan Jude
 *     under sponsorship from the FreeBSD Foundation.
 */

#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/zfs_context.h>
#include <sys/zio_compress.h>
#include <sys/spa.h>
#include <sys/zstd/zstd.h>

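/*
 * ZSTD_STATIC_LINKING_ONLY exposes zstd's advanced, static-linking-only API.
 * The custom allocators (ZSTD_customMem), the magicless frame format and
 * ZSTD_estimateDCtxSize() used below are only declared when it is defined.
 */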
#define	ZSTD_STATIC_LINKING_ONLY
#include "lib/zstd.h"
#include "lib/common/zstd_errors.h"

#ifndef IN_LIBSA
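/*
 * Tunables for the early-abort heuristic in zfs_zstd_compress_wrap():
 * zstd_earlyabort_pass enables the heuristic, zstd_cutoff_level is the
 * lowest zstd level it is attempted for, and zstd_abort_size is the
 * smallest block size it is attempted on.
 */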
static uint_t zstd_earlyabort_pass = 1;
static int zstd_cutoff_level = ZIO_ZSTD_LEVEL_3;
static unsigned int zstd_abort_size = (128 * 1024);
#endif

static kstat_t *zstd_ksp = NULL;

typedef struct zstd_stats {
	kstat_named_t	zstd_stat_alloc_fail;
	kstat_named_t	zstd_stat_alloc_fallback;
	kstat_named_t	zstd_stat_com_alloc_fail;
	kstat_named_t	zstd_stat_dec_alloc_fail;
	kstat_named_t	zstd_stat_com_inval;
	kstat_named_t	zstd_stat_dec_inval;
	kstat_named_t	zstd_stat_dec_header_inval;
	kstat_named_t	zstd_stat_com_fail;
	kstat_named_t	zstd_stat_dec_fail;
	/*
	 * LZ4 first-pass early abort verdict
	 */
	kstat_named_t	zstd_stat_lz4pass_allowed;
	kstat_named_t	zstd_stat_lz4pass_rejected;
	/*
	 * zstd-1 second-pass early abort verdict
	 */
	kstat_named_t	zstd_stat_zstdpass_allowed;
	kstat_named_t	zstd_stat_zstdpass_rejected;
	/*
	 * The block was not run through the early abort heuristic at all
	 * (e.g. level below the cutoff or block smaller than zstd_abort_size)
	 */
	kstat_named_t	zstd_stat_passignored;
	kstat_named_t	zstd_stat_passignored_size;
	kstat_named_t	zstd_stat_buffers;
	kstat_named_t	zstd_stat_size;
} zstd_stats_t;

static zstd_stats_t zstd_stats = {
	{ "alloc_fail",			KSTAT_DATA_UINT64 },
	{ "alloc_fallback",		KSTAT_DATA_UINT64 },
	{ "compress_alloc_fail",	KSTAT_DATA_UINT64 },
	{ "decompress_alloc_fail",	KSTAT_DATA_UINT64 },
	{ "compress_level_invalid",	KSTAT_DATA_UINT64 },
	{ "decompress_level_invalid",	KSTAT_DATA_UINT64 },
	{ "decompress_header_invalid",	KSTAT_DATA_UINT64 },
	{ "compress_failed",		KSTAT_DATA_UINT64 },
	{ "decompress_failed",		KSTAT_DATA_UINT64 },
	{ "lz4pass_allowed",		KSTAT_DATA_UINT64 },
	{ "lz4pass_rejected",		KSTAT_DATA_UINT64 },
	{ "zstdpass_allowed",		KSTAT_DATA_UINT64 },
	{ "zstdpass_rejected",		KSTAT_DATA_UINT64 },
	{ "passignored",		KSTAT_DATA_UINT64 },
	{ "passignored_size",		KSTAT_DATA_UINT64 },
	{ "buffers",			KSTAT_DATA_UINT64 },
	{ "size",			KSTAT_DATA_UINT64 },
};

#ifdef _KERNEL
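/*
 * kstat update callback: any write to the kstat zeroes the counters below.
 * The buffers and size entries are left alone, since they track memory that
 * is still held by the pools.
 */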
static int
kstat_zstd_update(kstat_t *ksp, int rw)
{
	ASSERT(ksp != NULL);

	if (rw == KSTAT_WRITE && ksp == zstd_ksp) {
		ZSTDSTAT_ZERO(zstd_stat_alloc_fail);
		ZSTDSTAT_ZERO(zstd_stat_alloc_fallback);
		ZSTDSTAT_ZERO(zstd_stat_com_alloc_fail);
		ZSTDSTAT_ZERO(zstd_stat_dec_alloc_fail);
		ZSTDSTAT_ZERO(zstd_stat_com_inval);
		ZSTDSTAT_ZERO(zstd_stat_dec_inval);
		ZSTDSTAT_ZERO(zstd_stat_dec_header_inval);
		ZSTDSTAT_ZERO(zstd_stat_com_fail);
		ZSTDSTAT_ZERO(zstd_stat_dec_fail);
		ZSTDSTAT_ZERO(zstd_stat_lz4pass_allowed);
		ZSTDSTAT_ZERO(zstd_stat_lz4pass_rejected);
		ZSTDSTAT_ZERO(zstd_stat_zstdpass_allowed);
		ZSTDSTAT_ZERO(zstd_stat_zstdpass_rejected);
		ZSTDSTAT_ZERO(zstd_stat_passignored);
		ZSTDSTAT_ZERO(zstd_stat_passignored_size);
	}

	return (0);
}
#endif

/* Enum describing the allocator type specified by kmem_type in zstd_kmem */
enum zstd_kmem_type {
	ZSTD_KMEM_UNKNOWN = 0,
	/* Allocation done directly with vmem_alloc */
	ZSTD_KMEM_DEFAULT,
	/* Pool based allocation using zstd_mempool_alloc */
	ZSTD_KMEM_POOL,
	/* Reserved fallback memory for decompression only */
	ZSTD_KMEM_DCTX,
	ZSTD_KMEM_COUNT,
};

/* Structure for pooled memory objects */
struct zstd_pool {
	void *mem;
	size_t size;
	kmutex_t barrier;
	hrtime_t timeout;
};

/* Global structure for handling memory allocations */
struct zstd_kmem {
	enum zstd_kmem_type kmem_type;
	size_t kmem_size;
	struct zstd_pool *pool;
};

/* Fallback memory structure used for decompression only if memory runs out */
struct zstd_fallback_mem {
	size_t mem_size;
	void *mem;
	kmutex_t barrier;
};

struct zstd_levelmap {
	int16_t zstd_level;
	enum zio_zstd_levels level;
};

/*
 * ZSTD memory handlers
 *
 * For decompression we use a different handler which also provides fallback
 * memory allocation in case memory runs out.
 *
 * The compression and decompression handlers are kept separate to keep the
 * implementation as simple as possible.
 */
#ifndef IN_LIBSA
static void *zstd_alloc(void *opaque, size_t size);
#endif
static void *zstd_dctx_alloc(void *opaque, size_t size);
static void zstd_free(void *opaque, void *ptr);

#ifndef IN_LIBSA
/* Compression memory handler */
static const ZSTD_customMem zstd_malloc = {
	zstd_alloc,
	zstd_free,
	NULL,
};
#endif

/* Decompression memory handler */
static const ZSTD_customMem zstd_dctx_malloc = {
	zstd_dctx_alloc,
	zstd_free,
	NULL,
};

/* Level map for converting ZFS internal levels to ZSTD levels and vice versa */
static struct zstd_levelmap zstd_levels[] = {
	{ZIO_ZSTD_LEVEL_1, ZIO_ZSTD_LEVEL_1},
	{ZIO_ZSTD_LEVEL_2, ZIO_ZSTD_LEVEL_2},
	{ZIO_ZSTD_LEVEL_3, ZIO_ZSTD_LEVEL_3},
	{ZIO_ZSTD_LEVEL_4, ZIO_ZSTD_LEVEL_4},
	{ZIO_ZSTD_LEVEL_5, ZIO_ZSTD_LEVEL_5},
	{ZIO_ZSTD_LEVEL_6, ZIO_ZSTD_LEVEL_6},
	{ZIO_ZSTD_LEVEL_7, ZIO_ZSTD_LEVEL_7},
	{ZIO_ZSTD_LEVEL_8, ZIO_ZSTD_LEVEL_8},
	{ZIO_ZSTD_LEVEL_9, ZIO_ZSTD_LEVEL_9},
	{ZIO_ZSTD_LEVEL_10, ZIO_ZSTD_LEVEL_10},
	{ZIO_ZSTD_LEVEL_11, ZIO_ZSTD_LEVEL_11},
	{ZIO_ZSTD_LEVEL_12, ZIO_ZSTD_LEVEL_12},
	{ZIO_ZSTD_LEVEL_13, ZIO_ZSTD_LEVEL_13},
	{ZIO_ZSTD_LEVEL_14, ZIO_ZSTD_LEVEL_14},
	{ZIO_ZSTD_LEVEL_15, ZIO_ZSTD_LEVEL_15},
	{ZIO_ZSTD_LEVEL_16, ZIO_ZSTD_LEVEL_16},
	{ZIO_ZSTD_LEVEL_17, ZIO_ZSTD_LEVEL_17},
	{ZIO_ZSTD_LEVEL_18, ZIO_ZSTD_LEVEL_18},
	{ZIO_ZSTD_LEVEL_19, ZIO_ZSTD_LEVEL_19},
	{-1, ZIO_ZSTD_LEVEL_FAST_1},
	{-2, ZIO_ZSTD_LEVEL_FAST_2},
	{-3, ZIO_ZSTD_LEVEL_FAST_3},
	{-4, ZIO_ZSTD_LEVEL_FAST_4},
	{-5, ZIO_ZSTD_LEVEL_FAST_5},
	{-6, ZIO_ZSTD_LEVEL_FAST_6},
	{-7, ZIO_ZSTD_LEVEL_FAST_7},
	{-8, ZIO_ZSTD_LEVEL_FAST_8},
	{-9, ZIO_ZSTD_LEVEL_FAST_9},
	{-10, ZIO_ZSTD_LEVEL_FAST_10},
	{-20, ZIO_ZSTD_LEVEL_FAST_20},
	{-30, ZIO_ZSTD_LEVEL_FAST_30},
	{-40, ZIO_ZSTD_LEVEL_FAST_40},
	{-50, ZIO_ZSTD_LEVEL_FAST_50},
	{-60, ZIO_ZSTD_LEVEL_FAST_60},
	{-70, ZIO_ZSTD_LEVEL_FAST_70},
	{-80, ZIO_ZSTD_LEVEL_FAST_80},
	{-90, ZIO_ZSTD_LEVEL_FAST_90},
	{-100, ZIO_ZSTD_LEVEL_FAST_100},
	{-500, ZIO_ZSTD_LEVEL_FAST_500},
	{-1000, ZIO_ZSTD_LEVEL_FAST_1000},
};
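
/*
 * For example, a block written with ZIO_ZSTD_LEVEL_FAST_500 resolves via
 * this table (see zstd_enum_to_level() below) to the negative "fast"
 * zstd level -500.
 */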

/*
 * This variable represents the maximum count of the pool based on the number
 * of CPUs plus some buffer. We default to cpu count * 4, see zstd_init().
 */
static int pool_count = 16;

#define	ZSTD_POOL_MAX		pool_count
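/* Seconds an unused pool object is kept around before it may be reaped */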
#define	ZSTD_POOL_TIMEOUT	60 * 2

static struct zstd_fallback_mem zstd_dctx_fallback;
static struct zstd_pool *zstd_mempool_cctx;
static struct zstd_pool *zstd_mempool_dctx;

/*
 * The zstd library code expects these symbols when ADDRESS_SANITIZER is
 * defined. ASAN provides them, but KASAN triggers the define without
 * providing the symbols, so to avoid changing the external code we supply
 * empty stubs here.
 */
#if defined(ZFS_ASAN_ENABLED)
#define	ADDRESS_SANITIZER 1
#endif
#if defined(_KERNEL) && defined(ADDRESS_SANITIZER)
void __asan_unpoison_memory_region(void const volatile *addr, size_t size);
void __asan_poison_memory_region(void const volatile *addr, size_t size);
void __asan_unpoison_memory_region(void const volatile *addr, size_t size) {};
void __asan_poison_memory_region(void const volatile *addr, size_t size) {};
#endif


static void
zstd_mempool_reap(struct zstd_pool *zstd_mempool)
{
	struct zstd_pool *pool;

	if (!zstd_mempool || !ZSTDSTAT(zstd_stat_buffers)) {
		return;
	}

	/* free obsolete slots */
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		pool = &zstd_mempool[i];
		if (pool->mem && mutex_tryenter(&pool->barrier)) {
			/* Free memory if unused for more than 2 minutes */
			if (pool->mem && gethrestime_sec() > pool->timeout) {
				vmem_free(pool->mem, pool->size);
				ZSTDSTAT_SUB(zstd_stat_buffers, 1);
				ZSTDSTAT_SUB(zstd_stat_size, pool->size);
				pool->mem = NULL;
				pool->size = 0;
				pool->timeout = 0;
			}
			mutex_exit(&pool->barrier);
		}
	}
}

/*
 * Try to get a cached, pre-allocated buffer from the memory pool, or allocate
 * a new one if necessary. If an object is older than 2 minutes and does not
 * fit the requested size, it will be released and a new cached entry will be
 * allocated. Other pooled objects that have not been used for 2 minutes are
 * released as well.
 *
 * The idea is that high-frequency allocations of large objects are expensive,
 * so while a lot of work is going on, allocations are kept around for a while
 * and can be reused within that time frame.
 *
 * The scheduled release is pushed back every time an object is reused.
 */

static void *
zstd_mempool_alloc(struct zstd_pool *zstd_mempool, size_t size)
{
	struct zstd_pool *pool;
	struct zstd_kmem *mem = NULL;

	if (!zstd_mempool) {
		return (NULL);
	}

	/* Look for a preallocated memory slot and free obsolete slots */
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		pool = &zstd_mempool[i];
		/*
		 * This lock is simply a marker for a pool object being in use.
		 * If it's already held, the slot is skipped.
		 *
		 * We need to take it before checking the object to avoid race
		 * conditions caused by running in a threaded context.
		 *
		 * The lock is later released by zstd_mempool_free.
		 */
		if (mutex_tryenter(&pool->barrier)) {
			/*
			 * Check if the object fits the requested size; if so,
			 * take it and update the timestamp.
			 */
			if (pool->mem && size <= pool->size) {
				pool->timeout = gethrestime_sec() +
				    ZSTD_POOL_TIMEOUT;
				mem = pool->mem;
				return (mem);
			}
			mutex_exit(&pool->barrier);
		}
	}

	/*
	 * If no preallocated slot was found, try to fill in a new one.
	 *
	 * We run a similar algorithm twice here to avoid pool fragmentation.
	 * The first one may generate holes in the list if objects get released.
	 * We always make sure that these holes get filled instead of adding new
	 * allocations constantly at the end.
	 */
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		pool = &zstd_mempool[i];
		if (mutex_tryenter(&pool->barrier)) {
			/* Object is free, try to allocate new one */
			if (!pool->mem) {
				mem = vmem_alloc(size, KM_SLEEP);
				if (mem) {
					ZSTDSTAT_ADD(zstd_stat_buffers, 1);
					ZSTDSTAT_ADD(zstd_stat_size, size);
					pool->mem = mem;
					pool->size = size;
					/* Keep track for later release */
					mem->pool = pool;
					mem->kmem_type = ZSTD_KMEM_POOL;
					mem->kmem_size = size;
				}
			}

			if (size <= pool->size) {
				/* Update timestamp */
				pool->timeout = gethrestime_sec() +
				    ZSTD_POOL_TIMEOUT;

				return (pool->mem);
			}

			mutex_exit(&pool->barrier);
		}
	}

	/*
	 * If the pool is full or the allocation failed, try lazy allocation
	 * instead.
	 */
	if (!mem) {
		mem = vmem_alloc(size, KM_NOSLEEP);
		if (mem) {
			mem->pool = NULL;
			mem->kmem_type = ZSTD_KMEM_DEFAULT;
			mem->kmem_size = size;
		}
	}

	return (mem);
}

/* Mark object as released by releasing the barrier mutex */
static void
zstd_mempool_free(struct zstd_kmem *z)
{
	mutex_exit(&z->pool->barrier);
}

/* Convert ZFS internal enum to ZSTD level */
static int
zstd_enum_to_level(enum zio_zstd_levels level, int16_t *zstd_level)
{
	if (level > 0 && level <= ZIO_ZSTD_LEVEL_19) {
		*zstd_level = zstd_levels[level - 1].zstd_level;
		return (0);
	}
	if (level >= ZIO_ZSTD_LEVEL_FAST_1 &&
	    level <= ZIO_ZSTD_LEVEL_FAST_1000) {
		*zstd_level = zstd_levels[level - ZIO_ZSTD_LEVEL_FAST_1
		    + ZIO_ZSTD_LEVEL_19].zstd_level;
		return (0);
	}

	/* Invalid/unknown zfs compression enum - this should never happen. */
	return (1);
}

#ifndef IN_LIBSA
size_t
zfs_zstd_compress_wrap(void *s_start, void *d_start, size_t s_len, size_t d_len,
    int level)
{
	int16_t zstd_level;
	if (zstd_enum_to_level(level, &zstd_level)) {
		ZSTDSTAT_BUMP(zstd_stat_com_inval);
		return (s_len);
	}
	/*
	 * A zstd early abort heuristic.
	 *
	 * - Zeroth, if this is below the cutoff level (zstd-3 by default) or
	 *   the block is smaller than zstd_abort_size (currently 128k), skip
	 *   all of this and just compress.
	 *   (Experimentally, that was a reasonable cutoff for a perf win with
	 *   a tiny ratio change.)
	 * - First, we try LZ4 compression; if it doesn't early abort, we jump
	 *   directly to whatever compression level we intended to try.
	 * - Second, we try zstd-1 - if that errors out (usually, but not
	 *   exclusively, because it would overflow), we give up early.
	 *
	 *   If it works, we go on and compress anyway.
	 *
	 * Why two passes? LZ4 alone gets you a lot of the way, but on highly
	 * compressible data it was losing up to 8.5% of the compressed
	 * savings versus no early abort, and all the zstd-fast levels are
	 * worse indicators on their own than LZ4, and don't improve the LZ4
	 * pass noticeably when stacked like this.
	 */
	size_t actual_abort_size = zstd_abort_size;
	if (zstd_earlyabort_pass > 0 && zstd_level >= zstd_cutoff_level &&
	    s_len >= actual_abort_size) {
		int pass_len = 1;
		pass_len = lz4_compress_zfs(s_start, d_start, s_len, d_len, 0);
		if (pass_len < d_len) {
			ZSTDSTAT_BUMP(zstd_stat_lz4pass_allowed);
			goto keep_trying;
		}
		ZSTDSTAT_BUMP(zstd_stat_lz4pass_rejected);

		pass_len = zfs_zstd_compress(s_start, d_start, s_len, d_len,
		    ZIO_ZSTD_LEVEL_1);
		if (pass_len == s_len || pass_len <= 0 || pass_len > d_len) {
			ZSTDSTAT_BUMP(zstd_stat_zstdpass_rejected);
			return (s_len);
		}
		ZSTDSTAT_BUMP(zstd_stat_zstdpass_allowed);
	} else {
		ZSTDSTAT_BUMP(zstd_stat_passignored);
		if (s_len < actual_abort_size) {
			ZSTDSTAT_BUMP(zstd_stat_passignored_size);
		}
	}
keep_trying:
	return (zfs_zstd_compress(s_start, d_start, s_len, d_len, level));
}

/* Compress block using zstd */
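/*
 * The on-disk layout written here (see zfs_zstdhdr_t): a 32-bit big-endian
 * compressed length, followed by a 32-bit big-endian word packing the 24-bit
 * zstd version and the 8-bit ZFS compression level, followed by the magicless
 * zstd frame itself.
 */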
size_t
zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, size_t d_len,
    int level)
{
	size_t c_len;
	int16_t zstd_level;
	zfs_zstdhdr_t *hdr;
	ZSTD_CCtx *cctx;

	hdr = (zfs_zstdhdr_t *)d_start;

	/* Skip compression if the specified level is invalid */
	if (zstd_enum_to_level(level, &zstd_level)) {
		ZSTDSTAT_BUMP(zstd_stat_com_inval);
		return (s_len);
	}

	ASSERT3U(d_len, >=, sizeof (*hdr));
	ASSERT3U(d_len, <=, s_len);
	ASSERT3U(zstd_level, !=, 0);

	cctx = ZSTD_createCCtx_advanced(zstd_malloc);

	/*
	 * Out of kernel memory, gently fall through - this will disable
	 * compression in zio_compress_data
	 */
	if (!cctx) {
		ZSTDSTAT_BUMP(zstd_stat_com_alloc_fail);
		return (s_len);
	}

	/* Set the compression level */
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, zstd_level);

	/* Use the "magicless" zstd header which saves us 4 header bytes */
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_format, ZSTD_f_zstd1_magicless);

	/*
	 * Disable redundant checksum calculation and content size storage since
	 * this is already done by ZFS itself.
	 */
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 0);
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, 0);

	c_len = ZSTD_compress2(cctx,
	    hdr->data,
	    d_len - sizeof (*hdr),
	    s_start, s_len);

	ZSTD_freeCCtx(cctx);

	/* Error in the compression routine, disable compression. */
	if (ZSTD_isError(c_len)) {
		/*
		 * If we are aborting the compression because the savings are
		 * too small, that is not a failure. Everything else is a
		 * failure, so increment the compression failure counter.
		 */
		int err = ZSTD_getErrorCode(c_len);
		if (err != ZSTD_error_dstSize_tooSmall) {
			ZSTDSTAT_BUMP(zstd_stat_com_fail);
			dprintf("Error: %s", ZSTD_getErrorString(err));
		}
		return (s_len);
	}

	/*
	 * Encode the compressed buffer size at the start. We'll need this in
	 * decompression to counter the effects of padding which might be added
	 * to the compressed buffer and which, if unhandled, would confuse the
	 * hell out of our decompression function.
	 */
	hdr->c_len = BE_32(c_len);

	/*
	 * Check the version for overflow.
	 * The limit of 24 bits must not be exceeded. This allows a maximum
	 * version of 1677.72.15, which we don't expect to ever be reached.
	 */
	ASSERT3U(ZSTD_VERSION_NUMBER, <=, 0xFFFFFF);

	/*
	 * Encode the compression level as well. We may need to know the
	 * original compression level if compressed_arc is disabled, to match
	 * the compression settings when writing this block to the L2ARC.
	 *
	 * Encode the actual level, so that if the enum changes in the future,
	 * we will stay compatible.
	 *
	 * The upper 24 bits store the ZSTD version to provide future
	 * compatibility, since new versions might enhance the compression
	 * algorithm in a way that changes the compressed output.
	 *
	 * As soon as such an incompatibility occurs, handling code needs to
	 * be added that differentiates between the versions.
	 */
	zfs_set_hdrversion(hdr, ZSTD_VERSION_NUMBER);
	zfs_set_hdrlevel(hdr, level);
	hdr->raw_version_level = BE_32(hdr->raw_version_level);

	return (c_len + sizeof (*hdr));
}
#endif

/* Decompress block using zstd and return its stored level */
int
zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len,
    size_t d_len, uint8_t *level)
{
	ZSTD_DCtx *dctx;
	size_t result;
	int16_t zstd_level;
	uint32_t c_len;
	const zfs_zstdhdr_t *hdr;
	zfs_zstdhdr_t hdr_copy;

	hdr = (const zfs_zstdhdr_t *)s_start;
	c_len = BE_32(hdr->c_len);

	/*
	 * Make a copy instead of directly converting the header, since we must
	 * not modify the original data that may be used again later.
	 */
	hdr_copy.raw_version_level = BE_32(hdr->raw_version_level);
	uint8_t curlevel = zfs_get_hdrlevel(&hdr_copy);

	/*
	 * NOTE: We ignore the ZSTD version for now. As soon as any
	 * incompatibility occurs, it has to be handled accordingly.
	 * The version can be accessed via `hdr_copy.version`.
	 */

	/*
	 * Convert and check the level.
	 * An invalid level is a strong indicator of data corruption! In such
	 * a case, return an error so the upper layers can try to fix it.
	 */
	if (zstd_enum_to_level(curlevel, &zstd_level)) {
		ZSTDSTAT_BUMP(zstd_stat_dec_inval);
		return (1);
	}

	ASSERT3U(d_len, >=, s_len);
	ASSERT3U(curlevel, !=, ZIO_COMPLEVEL_INHERIT);

	/* Invalid compressed buffer size encoded at start */
	if (c_len + sizeof (*hdr) > s_len) {
		ZSTDSTAT_BUMP(zstd_stat_dec_header_inval);
		return (1);
	}

	dctx = ZSTD_createDCtx_advanced(zstd_dctx_malloc);
	if (!dctx) {
		ZSTDSTAT_BUMP(zstd_stat_dec_alloc_fail);
		return (1);
	}

	/* Set header type to "magicless" */
	ZSTD_DCtx_setParameter(dctx, ZSTD_d_format, ZSTD_f_zstd1_magicless);

	/* Decompress the data and release the context */
	result = ZSTD_decompressDCtx(dctx, d_start, d_len, hdr->data, c_len);
	ZSTD_freeDCtx(dctx);

	/*
	 * Return 0 on success (the decompression succeeded) and non-zero on
	 * failure (ZSTD reported an error).
	 */
	if (ZSTD_isError(result)) {
		ZSTDSTAT_BUMP(zstd_stat_dec_fail);
		return (1);
	}

	if (level) {
		*level = curlevel;
	}

	return (0);
}

/* Decompress datablock using zstd */
int
zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len,
    int level __maybe_unused)
{
	return (zfs_zstd_decompress_level(s_start, d_start, s_len, d_len,
	    NULL));
}

#ifndef IN_LIBSA
/* Allocator for zstd compression context using mempool_allocator */
static void *
zstd_alloc(void *opaque __maybe_unused, size_t size)
{
	size_t nbytes = sizeof (struct zstd_kmem) + size;
	struct zstd_kmem *z = NULL;

	z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_cctx, nbytes);

	if (!z) {
		ZSTDSTAT_BUMP(zstd_stat_alloc_fail);
		return (NULL);
	}

	return ((void*)z + (sizeof (struct zstd_kmem)));
}
#endif

/*
 * Allocator for zstd decompression context using mempool_allocator with
 * fallback to reserved memory if allocation fails
 */
static void *
zstd_dctx_alloc(void *opaque __maybe_unused, size_t size)
{
	size_t nbytes = sizeof (struct zstd_kmem) + size;
	struct zstd_kmem *z = NULL;
	enum zstd_kmem_type type = ZSTD_KMEM_DEFAULT;

	z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_dctx, nbytes);
	if (!z) {
		/* Try harder, decompression shall not fail */
		z = vmem_alloc(nbytes, KM_SLEEP);
		if (z) {
			z->pool = NULL;
		}
		ZSTDSTAT_BUMP(zstd_stat_alloc_fail);
	} else {
		return ((void*)z + (sizeof (struct zstd_kmem)));
	}

	/* Fallback if everything fails */
	if (!z) {
		/*
		 * Barrier, since we can only handle this in a single thread.
		 * All other threads arriving here have to wait until the
		 * current decompression is completed. zstd_free will release
		 * this barrier later.
		 */
		mutex_enter(&zstd_dctx_fallback.barrier);

		z = zstd_dctx_fallback.mem;
		type = ZSTD_KMEM_DCTX;
		ZSTDSTAT_BUMP(zstd_stat_alloc_fallback);
	}

	/* Allocation should always be successful */
	if (!z) {
		return (NULL);
	}

	z->kmem_type = type;
	z->kmem_size = nbytes;

	return ((void*)z + (sizeof (struct zstd_kmem)));
}

/* Free allocated memory by its specific type */
static void
zstd_free(void *opaque __maybe_unused, void *ptr)
{
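	/*
	 * The allocators above hand out the address just past the embedded
	 * struct zstd_kmem bookkeeping header; step back to it to find out
	 * how this allocation has to be released.
	 */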
	struct zstd_kmem *z = (ptr - sizeof (struct zstd_kmem));
	enum zstd_kmem_type type;

	ASSERT3U(z->kmem_type, <, ZSTD_KMEM_COUNT);
	ASSERT3U(z->kmem_type, >, ZSTD_KMEM_UNKNOWN);

	type = z->kmem_type;
	switch (type) {
	case ZSTD_KMEM_DEFAULT:
		vmem_free(z, z->kmem_size);
		break;
	case ZSTD_KMEM_POOL:
		zstd_mempool_free(z);
		break;
	case ZSTD_KMEM_DCTX:
		mutex_exit(&zstd_dctx_fallback.barrier);
		break;
	default:
		break;
	}
}

/* Allocate fallback memory to ensure safe decompression */
static void __init
create_fallback_mem(struct zstd_fallback_mem *mem, size_t size)
{
	mem->mem_size = size;
	mem->mem = vmem_zalloc(mem->mem_size, KM_SLEEP);
	mutex_init(&mem->barrier, NULL, MUTEX_DEFAULT, NULL);
}

/* Initialize memory pool barrier mutexes */
static void __init
zstd_mempool_init(void)
{
	zstd_mempool_cctx =
	    kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);
	zstd_mempool_dctx =
	    kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);

	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		mutex_init(&zstd_mempool_cctx[i].barrier, NULL,
		    MUTEX_DEFAULT, NULL);
		mutex_init(&zstd_mempool_dctx[i].barrier, NULL,
		    MUTEX_DEFAULT, NULL);
	}
}

/* Initialize zstd-related memory handling */
static int __init
zstd_meminit(void)
{
	zstd_mempool_init();

	/*
	 * Estimate the size of the fallback decompression context.
	 * The expected size on x64 with current ZSTD should be about 160 KB.
	 */
	create_fallback_mem(&zstd_dctx_fallback,
	    P2ROUNDUP(ZSTD_estimateDCtxSize() + sizeof (struct zstd_kmem),
	    PAGESIZE));

	return (0);
}

/* Release object from pool and free memory */
static void
release_pool(struct zstd_pool *pool)
{
	mutex_destroy(&pool->barrier);
	vmem_free(pool->mem, pool->size);
	pool->mem = NULL;
	pool->size = 0;
}

/* Release memory pool objects */
static void
zstd_mempool_deinit(void)
{
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		release_pool(&zstd_mempool_cctx[i]);
		release_pool(&zstd_mempool_dctx[i]);
	}

	kmem_free(zstd_mempool_dctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
	kmem_free(zstd_mempool_cctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
	zstd_mempool_dctx = NULL;
	zstd_mempool_cctx = NULL;
}

/* Release unused memory from the pools */
void
zfs_zstd_cache_reap_now(void)
{
	/*
	 * Short-circuit if there are no buffers to begin with.
	 */
	if (ZSTDSTAT(zstd_stat_buffers) == 0)
		return;

	/*
	 * Reaping walks both pools and frees any object that has been
	 * unused for longer than its timeout.
	 */
	zstd_mempool_reap(zstd_mempool_cctx);
	zstd_mempool_reap(zstd_mempool_dctx);
}

extern int __init
zstd_init(void)
{
	/* Set the pool size to CPU count * 4 */
	pool_count = (boot_ncpus * 4);
	zstd_meminit();

	/* Initialize kstat */
	zstd_ksp = kstat_create("zfs", 0, "zstd", "misc",
	    KSTAT_TYPE_NAMED, sizeof (zstd_stats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);
	if (zstd_ksp != NULL) {
		zstd_ksp->ks_data = &zstd_stats;
		kstat_install(zstd_ksp);
#ifdef _KERNEL
		zstd_ksp->ks_update = kstat_zstd_update;
#endif
	}

	return (0);
}

extern void
zstd_fini(void)
{
	/* Deinitialize kstat */
	if (zstd_ksp != NULL) {
		kstat_delete(zstd_ksp);
		zstd_ksp = NULL;
	}

	/* Release fallback memory */
	vmem_free(zstd_dctx_fallback.mem, zstd_dctx_fallback.mem_size);
	mutex_destroy(&zstd_dctx_fallback.barrier);

	/* Deinit memory pool */
	zstd_mempool_deinit();
}

#if defined(_KERNEL)
#ifdef __FreeBSD__
module_init(zstd_init);
module_exit(zstd_fini);
#endif

ZFS_MODULE_PARAM(zfs, zstd_, earlyabort_pass, UINT, ZMOD_RW,
	"Enable early abort attempts when using zstd");
ZFS_MODULE_PARAM(zfs, zstd_, abort_size, UINT, ZMOD_RW,
	"Minimal size of block to attempt early abort");
#endif