1/*
2 * Ones' complement checksum test & benchmark
3 *
4 * Copyright (c) 2016-2020, Arm Limited.
5 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6 */
7
8#define _GNU_SOURCE
9#include <inttypes.h>
10#include <stdbool.h>
11#include <stdint.h>
12#include <stdio.h>
13#include <stdlib.h>
14#include <string.h>
15#include <sys/mman.h>
16#include <time.h>
17#include <unistd.h>
18#include "../include/networking.h"
19
20#if WANT_ASSERT
21#undef NDEBUG
22#include <assert.h>
23#define Assert(exp) assert(exp)
24#else
25#define Assert(exp) (void) (exp)
26#endif
27
28#ifdef __GNUC__
29#define may_alias __attribute__((__may_alias__))
30#else
31#define may_alias
32#endif
33
34#define CACHE_LINE 64
35#define ALIGN(x, y) (((x) + (y) - 1) & ~((y) - 1))
36
37/* Reference implementation - do not modify! */
38static uint16_t
39checksum_simple(const void *ptr, uint32_t nbytes)
40{
41    const uint16_t *may_alias hptr = ptr;
42    uint64_t sum = 0;/* Need 64-bit accumulator when nbytes > 64K */
43
44    /* Sum all halfwords, assume misaligned accesses are handled in HW */
45    for (uint32_t nhalfs = nbytes >> 1; nhalfs != 0; nhalfs--)
46    {
47	sum += *hptr++;
48    }
49
50    /* Add any trailing odd byte */
51    if ((nbytes & 0x01) != 0)
52    {
53	sum += *(uint8_t *) hptr;
54    }
55
56    /* Fold 64-bit sum to 32 bits */
57    sum = (sum & 0xffffffff) + (sum >> 32);
58    sum = (sum & 0xffffffff) + (sum >> 32);
59    Assert(sum == (uint32_t) sum);
60
61    /* Fold 32-bit sum to 16 bits */
62    sum = (sum & 0xffff) + (sum >> 16);
63    sum = (sum & 0xffff) + (sum >> 16);
64    Assert(sum == (uint16_t) sum);
65
66    return (uint16_t) sum;
67}
68
69static struct
70{
71    uint16_t (*cksum_fp)(const void *, uint32_t);
72    const char *name;
73} implementations[] =
74{
75    { checksum_simple, "simple"},
76    { __chksum, "scalar"},
77#if __arm__
78    { __chksum_arm_simd, "simd" },
79#elif __aarch64__
80    { __chksum_aarch64_simd, "simd" },
81#endif
82    { NULL, NULL}
83};
84
85static int
86find_impl(const char *name)
87{
88    for (int i = 0; implementations[i].name != NULL; i++)
89    {
90	if (strcmp(implementations[i].name, name) == 0)
91	{
92	    return i;
93	}
94    }
95    return -1;
96}
97
98static uint16_t (*CKSUM_FP)(const void *, uint32_t);
99static volatile uint16_t SINK;
100
101static bool
102verify(const void *data, uint32_t offset, uint32_t size)
103{
104
105    uint16_t csum_expected = checksum_simple(data, size);
106    uint16_t csum_actual = CKSUM_FP(data, size);
107    if (csum_actual != csum_expected)
108    {
109	fprintf(stderr, "\nInvalid checksum for offset %u size %u: "
110		"actual %04x expected %04x (valid)",
111		offset, size, csum_actual, csum_expected);
112	if (size < 65536)
113	{
114	    /* Fatal error */
115	    exit(EXIT_FAILURE);
116	}
117	/* Else some implementations only support sizes up to 2^16 */
118	return false;
119    }
120    return true;
121}
122
123static uint64_t
124clock_get_ns(void)
125{
126    struct timespec ts;
127    clock_gettime(CLOCK_MONOTONIC, &ts);
128    return ts.tv_sec * (uint64_t) 1000000000 + ts.tv_nsec;
129}
130
131static void
132benchmark(const uint8_t *base,
133	  size_t poolsize,
134	  uint32_t blksize,
135	  uint32_t numops,
136	  uint64_t cpufreq)
137{
138    printf("%11u ", (unsigned int) blksize); fflush(stdout);
139
140    uint64_t start = clock_get_ns();
141    for (uint32_t i = 0; i < numops; i ++)
142    {
143	/* Read a random value from the pool */
144	uint32_t random = ((uint32_t *) base)[i % (poolsize / 4)];
145	/* Generate a random starting address */
146	const void *data = &base[random % (poolsize - blksize)];
147	SINK = CKSUM_FP(data, blksize);
148    }
149    uint64_t end = clock_get_ns();
150
151#define MEGABYTE 1000000 /* Decimal megabyte (MB) */
152    uint64_t elapsed_ns = end - start;
153    uint64_t elapsed_ms = elapsed_ns / 1000000;
154    uint32_t blks_per_s = (uint32_t) ((numops / elapsed_ms) * 1000);
155    uint64_t accbytes = (uint64_t) numops * blksize;
156    printf("%11ju ", (uintmax_t) ((accbytes / elapsed_ms) * 1000) / MEGABYTE);
157    unsigned int cyc_per_blk = cpufreq / blks_per_s;
158    printf("%11u ", cyc_per_blk);
159    if (blksize != 0)
160    {
161	unsigned int cyc_per_byte = 1000 * cyc_per_blk / blksize;
162	printf("%7u.%03u ",
163		cyc_per_byte / 1000, cyc_per_byte % 1000);
164    }
165    printf("\n");
166}
167
168int main(int argc, char *argv[])
169{
170    int c;
171    bool DUMP = false;
172    uint32_t IMPL = 0;/* Simple implementation */
173    uint64_t CPUFREQ = 0;
174    uint32_t BLKSIZE = 0;
175    uint32_t NUMOPS = 1000000;
176    uint32_t POOLSIZE = 512 * 1024;/* Typical ARM L2 cache size */
177
178    setvbuf(stdout, NULL, _IOLBF, 160);
179    while ((c = getopt(argc, argv, "b:df:i:n:p:")) != -1)
180    {
181	switch (c)
182	{
183	    case 'b' :
184		{
185		    int blksize = atoi(optarg);
186		    if (blksize < 1 || blksize > POOLSIZE / 2)
187		    {
188			fprintf(stderr, "Invalid block size %d\n", blksize);
189			exit(EXIT_FAILURE);
190		    }
191		    BLKSIZE = (unsigned) blksize;
192		    break;
193		}
194	    case 'd' :
195		DUMP = true;
196		break;
197	    case 'f' :
198		{
199		    int64_t cpufreq = atoll(optarg);
200		    if (cpufreq < 1)
201		    {
202			fprintf(stderr, "Invalid CPU frequency %"PRId64"\n",
203				cpufreq);
204			exit(EXIT_FAILURE);
205		    }
206		    CPUFREQ = cpufreq;
207		    break;
208		}
209	    case 'i' :
210		{
211		    int impl = find_impl(optarg);
212		    if (impl < 0)
213		    {
214			fprintf(stderr, "Invalid implementation %s\n", optarg);
215			goto usage;
216		    }
217		    IMPL = (unsigned) impl;
218		    break;
219		}
220	    case 'n' :
221		{
222		    int numops = atoi(optarg);
223		    if (numops < 1)
224		    {
225			fprintf(stderr, "Invalid number of operations %d\n", numops);
226			exit(EXIT_FAILURE);
227		    }
228		    NUMOPS = (unsigned) numops;
229		    break;
230		}
231	    case 'p' :
232		{
233		    int poolsize = atoi(optarg);
234		    if (poolsize < 4096)
235		    {
236			fprintf(stderr, "Invalid pool size %d\n", poolsize);
237			exit(EXIT_FAILURE);
238		    }
239		    char c = optarg[strlen(optarg) - 1];
240		    if (c == 'M')
241		    {
242			POOLSIZE = (unsigned) poolsize * 1024 * 1024;
243		    }
244		    else if (c == 'K')
245		    {
246			POOLSIZE = (unsigned) poolsize * 1024;
247		    }
248		    else
249		    {
250			POOLSIZE = (unsigned) poolsize;
251		    }
252		    break;
253		}
254	    default :
255usage :
256		fprintf(stderr, "Usage: checksum <options>\n"
257			"-b <blksize>    Block size\n"
258			"-d              Dump first 96 bytes of data\n"
259			"-f <cpufreq>    CPU frequency (Hz)\n"
260			"-i <impl>       Implementation\n"
261			"-n <numops>     Number of operations\n"
262			"-p <poolsize>   Pool size (K or M suffix)\n"
263		       );
264		printf("Implementations:");
265		for (int i = 0; implementations[i].name != NULL; i++)
266		{
267		    printf(" %s", implementations[i].name);
268		}
269		printf("\n");
270		exit(EXIT_FAILURE);
271	}
272    }
273    if (optind > argc)
274    {
275	goto usage;
276    }
277
278    CKSUM_FP = implementations[IMPL].cksum_fp;
279    POOLSIZE = ALIGN(POOLSIZE, CACHE_LINE);
280    uint8_t *base = mmap(0, POOLSIZE, PROT_READ|PROT_WRITE,
281			MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
282    if (base == MAP_FAILED)
283    {
284	perror("aligned_alloc"), exit(EXIT_FAILURE);
285    }
286    for (size_t i = 0; i < POOLSIZE / 4; i++)
287    {
288	((uint32_t *) base)[i] = rand();
289    }
290
291    printf("Implementation: %s\n", implementations[IMPL].name);
292    printf("numops %u, poolsize ", NUMOPS);
293    if (POOLSIZE % (1024 * 1024) == 0)
294    {
295	printf("%uMiB", POOLSIZE / (1024 * 1024));
296    }
297    else if (POOLSIZE % 1024 == 0)
298    {
299	printf("%uKiB", POOLSIZE / 1024);
300    }
301    else
302    {
303	printf("%uB", POOLSIZE);
304    }
305    printf(", blocksize %u, CPU frequency %juMHz\n",
306	   BLKSIZE, (uintmax_t) (CPUFREQ / 1000000));
307#if WANT_ASSERT
308    printf("Warning: assertions are enabled\n");
309#endif
310
311    if (DUMP)
312    {
313	/* Print out first 96 bytes of data for human debugging */
314	for (int i = 0; i < 96; i++)
315	{
316	    if (i % 8 == 0)
317		printf("%2u:", i);
318	    printf(" %02x", base[i]);
319	    if (i % 8 == 7)
320		printf("\n");
321	}
322    }
323
324    /* Verify that chosen algorithm handles all combinations of offsets and sizes */
325    printf("Verifying..."); fflush(stdout);
326    bool success = true;
327    /* Check all (relevant) combinations of size and offset */
328    for (int size = 0; size <= 256; size++)
329    {
330	for (int offset = 0; offset < 255; offset++)
331	{
332	    /* Check at start of mapped memory */
333	    success &= verify(&base[offset], offset, size);
334	    /* Check at end of mapped memory */
335	    uint8_t *p = base + POOLSIZE - (size + offset);
336	    success &= verify(p, (uintptr_t) p % 64, size);
337	}
338    }
339    /* Check increasingly larger sizes */
340    for (size_t size = 1; size < POOLSIZE; size *= 2)
341    {
342	success &= verify(base, 0, size);
343    }
344    /* Check the full size, this can detect accumulator overflows */
345    success &= verify(base, 0, POOLSIZE);
346    printf("%s\n", success ? "OK" : "failure");
347
348    /* Print throughput in decimal megabyte (1000000B) per second */
349    if (CPUFREQ != 0)
350    {
351	printf("%11s %11s %11s %11s\n",
352	       "block size", "MB/s", "cycles/blk", "cycles/byte");
353    }
354    else
355    {
356	printf("%11s %11s %11s %11s\n",
357	       "block size", "MB/s", "ns/blk", "ns/byte");
358	CPUFREQ = 1000000000;
359    }
360    if (BLKSIZE != 0)
361    {
362	benchmark(base, POOLSIZE, BLKSIZE, NUMOPS, CPUFREQ);
363    }
364    else
365    {
366	static const uint16_t sizes[] =
367	    { 20, 42, 102, 250, 612, 1500, 3674, 9000, 0 };
368	for (int i = 0; sizes[i] != 0; i++)
369	{
370	    uint32_t numops = NUMOPS * 10000 / (40 + sizes[i]);
371	    benchmark(base, POOLSIZE, sizes[i], numops, CPUFREQ);
372	}
373    }
374
375    if (munmap(base, POOLSIZE) != 0)
376    {
377	perror("munmap"), exit(EXIT_FAILURE);
378    }
379
380    return success ? EXIT_SUCCESS : EXIT_FAILURE;
381}
382