1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2021-2022 Tino Reichardt <milky-zfs@mcmilk.de>
24 */
25
26#include <sys/zio_checksum.h>
27#include <sys/zfs_context.h>
28#include <sys/zfs_chksum.h>
29#include <sys/zfs_impl.h>
30
31#include <sys/blake3.h>
32#include <sys/sha2.h>
33
/*
 * Throughput threshold in MiB/s: when the first benchmarked checksum
 * (EdonR) is slower than this for 1 KiB blocks, benchmarking is limited
 * to block sizes of at most 256 KiB.
 */
#define	LIMIT_PERF_MBS	300
36
37typedef struct {
38	const char *name;
39	const char *impl;
40	uint64_t bs1k;
41	uint64_t bs4k;
42	uint64_t bs16k;
43	uint64_t bs64k;
44	uint64_t bs256k;
45	uint64_t bs1m;
46	uint64_t bs4m;
47	uint64_t bs16m;
48	zio_cksum_salt_t salt;
49	zio_checksum_t *(func);
50	zio_checksum_tmpl_init_t *(init);
51	zio_checksum_tmpl_free_t *(free);
52} chksum_stat_t;
53
54static chksum_stat_t *chksum_stat_data = 0;
55static int chksum_stat_cnt = 0;
56static kstat_t *chksum_kstat = NULL;
57
58/*
59 * Sample output on i3-1005G1 System:
60 *
61 * implementation   1k      4k     16k     64k    256k      1m      4m     16m
62 * edonr-generic  1278    1625    1769    1776    1783    1778    1771    1767
63 * skein-generic   548     594     613     623     621     623     621     486
64 * sha256-generic  255     270     281     278     279     281     283     283
65 * sha256-x64      288     310     316     317     318     317     317     316
66 * sha256-ssse3    304     342     351     355     356     357     356     356
67 * sha256-avx      311     348     359     362     362     363     363     362
68 * sha256-avx2     330     378     389     395     395     395     395     395
69 * sha256-shani    908    1127    1212    1230    1233    1234    1223    1230
70 * sha512-generic  359     409     431     427     429     430     428     423
71 * sha512-x64      420     473     490     496     497     497     496     495
72 * sha512-avx      406     522     546     560     560     560     556     560
73 * sha512-avx2     464     568     601     606     609     610     607     608
74 * blake3-generic  330     327     324     323     324     320     323     322
75 * blake3-sse2     424    1366    1449    1468    1458    1453    1395    1408
76 * blake3-sse41    453    1554    1658    1703    1689    1669    1622    1630
77 * blake3-avx2     452    2013    3225    3351    3356    3261    3076    3101
78 * blake3-avx512   498    2869    5269    5926    5872    5643    5014    5005
79 */
/*
 * Header callback of the raw kstat: writes the "implementation" caption
 * followed by one right-aligned caption per benchmarked block size.
 * Always returns 0.
 */
static int
chksum_kstat_headers(char *buf, size_t size)
{
	static const char *const captions[] = {
		"1k", "4k", "16k", "64k", "256k", "1m", "4m", "16m"
	};
	const size_t last = sizeof (captions) / sizeof (captions[0]) - 1;
	ssize_t off = 0;
	size_t col;

	off += kmem_scnprintf(buf + off, size, "%-23s", "implementation");
	for (col = 0; col < last; col++) {
		off += kmem_scnprintf(buf + off, size - off, "%8s",
		    captions[col]);
	}

	/* the final column also terminates the line */
	(void) kmem_scnprintf(buf + off, size - off, "%8s\n", captions[last]);

	return (0);
}
97
98static int
99chksum_kstat_data(char *buf, size_t size, void *data)
100{
101	chksum_stat_t *cs;
102	ssize_t off = 0;
103	char b[24];
104
105	cs = (chksum_stat_t *)data;
106	kmem_scnprintf(b, 23, "%s-%s", cs->name, cs->impl);
107	off += kmem_scnprintf(buf + off, size - off, "%-23s", b);
108	off += kmem_scnprintf(buf + off, size - off, "%8llu",
109	    (u_longlong_t)cs->bs1k);
110	off += kmem_scnprintf(buf + off, size - off, "%8llu",
111	    (u_longlong_t)cs->bs4k);
112	off += kmem_scnprintf(buf + off, size - off, "%8llu",
113	    (u_longlong_t)cs->bs16k);
114	off += kmem_scnprintf(buf + off, size - off, "%8llu",
115	    (u_longlong_t)cs->bs64k);
116	off += kmem_scnprintf(buf + off, size - off, "%8llu",
117	    (u_longlong_t)cs->bs256k);
118	off += kmem_scnprintf(buf + off, size - off, "%8llu",
119	    (u_longlong_t)cs->bs1m);
120	off += kmem_scnprintf(buf + off, size - off, "%8llu",
121	    (u_longlong_t)cs->bs4m);
122	(void) kmem_scnprintf(buf + off, size - off, "%8llu\n",
123	    (u_longlong_t)cs->bs16m);
124
125	return (0);
126}
127
128static void *
129chksum_kstat_addr(kstat_t *ksp, loff_t n)
130{
131	if (n < chksum_stat_cnt)
132		ksp->ks_private = (void *)(chksum_stat_data + n);
133	else
134		ksp->ks_private = NULL;
135
136	return (ksp->ks_private);
137}
138
139static void
140chksum_run(chksum_stat_t *cs, abd_t *abd, void *ctx, int round,
141    uint64_t *result)
142{
143	hrtime_t start;
144	uint64_t run_bw, run_time_ns, run_count = 0, size = 0;
145	uint32_t l, loops = 0;
146	zio_cksum_t zcp;
147
148	switch (round) {
149	case 1: /* 1k */
150		size = 1<<10; loops = 128; break;
151	case 2: /* 2k */
152		size = 1<<12; loops = 64; break;
153	case 3: /* 4k */
154		size = 1<<14; loops = 32; break;
155	case 4: /* 16k */
156		size = 1<<16; loops = 16; break;
157	case 5: /* 256k */
158		size = 1<<18; loops = 8; break;
159	case 6: /* 1m */
160		size = 1<<20; loops = 4; break;
161	case 7: /* 4m */
162		size = 1<<22; loops = 1; break;
163	case 8: /* 16m */
164		size = 1<<24; loops = 1; break;
165	}
166
167	kpreempt_disable();
168	start = gethrtime();
169	do {
170		for (l = 0; l < loops; l++, run_count++)
171			cs->func(abd, size, ctx, &zcp);
172
173		run_time_ns = gethrtime() - start;
174	} while (run_time_ns < MSEC2NSEC(1));
175	kpreempt_enable();
176
177	run_bw = size * run_count * NANOSEC;
178	run_bw /= run_time_ns;	/* B/s */
179	*result = run_bw/1024/1024; /* MiB/s */
180}
181
/* states of the one-time slow-CPU decision taken in chksum_benchit() */
enum {
	LIMIT_INIT = 0,		/* not decided yet */
	LIMIT_NEEDED = 1,	/* slow CPU: skip the block sizes >= 1 MiB */
	LIMIT_NOLIMIT = 2	/* benchmark all block sizes */
};
185
186static void
187chksum_benchit(chksum_stat_t *cs)
188{
189	abd_t *abd;
190	void *ctx = 0;
191	void *salt = &cs->salt.zcs_bytes;
192	static int chksum_stat_limit = LIMIT_INIT;
193
194	memset(salt, 0, sizeof (cs->salt.zcs_bytes));
195	if (cs->init)
196		ctx = cs->init(&cs->salt);
197
198	/* allocate test memory via abd linear interface */
199	abd = abd_alloc_linear(1<<20, B_FALSE);
200	chksum_run(cs, abd, ctx, 1, &cs->bs1k);
201	chksum_run(cs, abd, ctx, 2, &cs->bs4k);
202	chksum_run(cs, abd, ctx, 3, &cs->bs16k);
203	chksum_run(cs, abd, ctx, 4, &cs->bs64k);
204	chksum_run(cs, abd, ctx, 5, &cs->bs256k);
205
206	/* check if we ran on a slow cpu */
207	if (chksum_stat_limit == LIMIT_INIT) {
208		if (cs->bs1k < LIMIT_PERF_MBS) {
209			chksum_stat_limit = LIMIT_NEEDED;
210		} else {
211			chksum_stat_limit = LIMIT_NOLIMIT;
212		}
213	}
214
215	/* skip benchmarks >= 1MiB when the CPU is to slow */
216	if (chksum_stat_limit == LIMIT_NEEDED)
217		goto abort;
218
219	chksum_run(cs, abd, ctx, 6, &cs->bs1m);
220	abd_free(abd);
221
222	/* allocate test memory via abd non linear interface */
223	abd = abd_alloc(1<<24, B_FALSE);
224	chksum_run(cs, abd, ctx, 7, &cs->bs4m);
225	chksum_run(cs, abd, ctx, 8, &cs->bs16m);
226
227abort:
228	abd_free(abd);
229
230	/* free up temp memory */
231	if (cs->free)
232		cs->free(ctx);
233}
234
235/*
236 * Initialize and benchmark all supported implementations.
237 */
238static void
239chksum_benchmark(void)
240{
241#ifndef _KERNEL
242	/* we need the benchmark only for the kernel module */
243	return;
244#endif
245
246	chksum_stat_t *cs;
247	uint64_t max;
248	uint32_t id, cbid = 0, id_save;
249	const zfs_impl_t *blake3 = zfs_impl_get_ops("blake3");
250	const zfs_impl_t *sha256 = zfs_impl_get_ops("sha256");
251	const zfs_impl_t *sha512 = zfs_impl_get_ops("sha512");
252
253	/* count implementations */
254	chksum_stat_cnt = 2;
255	chksum_stat_cnt += sha256->getcnt();
256	chksum_stat_cnt += sha512->getcnt();
257	chksum_stat_cnt += blake3->getcnt();
258	chksum_stat_data = kmem_zalloc(
259	    sizeof (chksum_stat_t) * chksum_stat_cnt, KM_SLEEP);
260
261	/* edonr - needs to be the first one here (slow CPU check) */
262	cs = &chksum_stat_data[cbid++];
263
264	/* edonr */
265	cs->init = abd_checksum_edonr_tmpl_init;
266	cs->func = abd_checksum_edonr_native;
267	cs->free = abd_checksum_edonr_tmpl_free;
268	cs->name = "edonr";
269	cs->impl = "generic";
270	chksum_benchit(cs);
271
272	/* skein */
273	cs = &chksum_stat_data[cbid++];
274	cs->init = abd_checksum_skein_tmpl_init;
275	cs->func = abd_checksum_skein_native;
276	cs->free = abd_checksum_skein_tmpl_free;
277	cs->name = "skein";
278	cs->impl = "generic";
279	chksum_benchit(cs);
280
281	/* sha256 */
282	id_save = sha256->getid();
283	for (max = 0, id = 0; id < sha256->getcnt(); id++) {
284		sha256->setid(id);
285		cs = &chksum_stat_data[cbid++];
286		cs->init = 0;
287		cs->func = abd_checksum_sha256;
288		cs->free = 0;
289		cs->name = sha256->name;
290		cs->impl = sha256->getname();
291		chksum_benchit(cs);
292		if (cs->bs256k > max) {
293			max = cs->bs256k;
294			sha256->set_fastest(id);
295		}
296	}
297	sha256->setid(id_save);
298
299	/* sha512 */
300	id_save = sha512->getid();
301	for (max = 0, id = 0; id < sha512->getcnt(); id++) {
302		sha512->setid(id);
303		cs = &chksum_stat_data[cbid++];
304		cs->init = 0;
305		cs->func = abd_checksum_sha512_native;
306		cs->free = 0;
307		cs->name = sha512->name;
308		cs->impl = sha512->getname();
309		chksum_benchit(cs);
310		if (cs->bs256k > max) {
311			max = cs->bs256k;
312			sha512->set_fastest(id);
313		}
314	}
315	sha512->setid(id_save);
316
317	/* blake3 */
318	id_save = blake3->getid();
319	for (max = 0, id = 0; id < blake3->getcnt(); id++) {
320		blake3->setid(id);
321		cs = &chksum_stat_data[cbid++];
322		cs->init = abd_checksum_blake3_tmpl_init;
323		cs->func = abd_checksum_blake3_native;
324		cs->free = abd_checksum_blake3_tmpl_free;
325		cs->name = blake3->name;
326		cs->impl = blake3->getname();
327		chksum_benchit(cs);
328		if (cs->bs256k > max) {
329			max = cs->bs256k;
330			blake3->set_fastest(id);
331		}
332	}
333	blake3->setid(id_save);
334}
335
336void
337chksum_init(void)
338{
339#ifdef _KERNEL
340	blake3_per_cpu_ctx_init();
341#endif
342
343	/* Benchmark supported implementations */
344	chksum_benchmark();
345
346	/* Install kstats for all implementations */
347	chksum_kstat = kstat_create("zfs", 0, "chksum_bench", "misc",
348	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
349
350	if (chksum_kstat != NULL) {
351		chksum_kstat->ks_data = NULL;
352		chksum_kstat->ks_ndata = UINT32_MAX;
353		kstat_set_raw_ops(chksum_kstat,
354		    chksum_kstat_headers,
355		    chksum_kstat_data,
356		    chksum_kstat_addr);
357		kstat_install(chksum_kstat);
358	}
359}
360
361void
362chksum_fini(void)
363{
364	if (chksum_kstat != NULL) {
365		kstat_delete(chksum_kstat);
366		chksum_kstat = NULL;
367	}
368
369	if (chksum_stat_cnt) {
370		kmem_free(chksum_stat_data,
371		    sizeof (chksum_stat_t) * chksum_stat_cnt);
372		chksum_stat_cnt = 0;
373		chksum_stat_data = 0;
374	}
375
376#ifdef _KERNEL
377	blake3_per_cpu_ctx_fini();
378#endif
379}
380