/* gcm128.c, revision 279264 */
/* ====================================================================
 * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. All advertising materials mentioning features or use of this
 *    software must display the following acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
 *
 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
 *    endorse or promote products derived from this software without
 *    prior written permission. For written permission, please contact
 *    openssl-core@openssl.org.
 *
 * 5. Products derived from this software may not be called "OpenSSL"
 *    nor may "OpenSSL" appear in their names without prior written
 *    permission of the OpenSSL Project.
 *
 * 6. Redistributions of any form whatsoever must retain the following
 *    acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
 *
 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 * OF THE POSSIBILITY OF SUCH DAMAGE.
 * ====================================================================
 */

#define OPENSSL_FIPSAPI

#include <openssl/crypto.h>
#include "modes_lcl.h"
#include <string.h>

#ifndef MODES_DEBUG
# ifndef NDEBUG
#  define NDEBUG
# endif
#endif
#include <assert.h>

#if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
/* redefine, because alignment is ensured */
#undef	GETU32
#define	GETU32(p)	BSWAP4(*(const u32 *)(p))
#undef	PUTU32
#define	PUTU32(p,v)	*(u32 *)(p) = BSWAP4(v)
#endif

#define	PACK(s)		((size_t)(s)<<(sizeof(size_t)*8-16))
#define REDUCE1BIT(V)	do { \
	if (sizeof(size_t)==8) { \
		u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
		V.lo  = (V.hi<<63)|(V.lo>>1); \
		V.hi  = (V.hi>>1 )^T; \
	} \
	else { \
		u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
		V.lo  = (V.hi<<63)|(V.lo>>1); \
		V.hi  = (V.hi>>1 )^((u64)T<<32); \
	} \
} while(0)
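
#if 0
/*
 * Illustrative sketch (kept out of the build): REDUCE1BIT implements
 * multiplication by x in GCM's bit-reflected representation of
 * GF(2^128).  A right shift of the 128-bit value multiplies by x; when
 * a 1 bit falls off the low end, the reflected field polynomial
 * (0xE1 followed by 120 zero bits) is folded back in.  The constant-time
 * masking above is equivalent to this branching form:
 */
static void gcm_mul_by_x(u128 *V)	/* hypothetical helper, not OpenSSL API */
{
	u64 carry = V->lo & 1;		/* bit shifted out at the bottom */

	V->lo = (V->hi << 63) | (V->lo >> 1);
	V->hi >>= 1;
	if (carry)
		V->hi ^= U64(0xe100000000000000);
}
#endif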

/*
 * Even though the permitted values for TABLE_BITS are 8, 4 and 1, it
 * should never be set to 8: 8 is effectively reserved for testing
 * purposes. TABLE_BITS>1 selects the lookup-table-driven
 * implementations referred to as "Shoup's" in the GCM specification;
 * in other words OpenSSL does not cover the whole spectrum of possible
 * table-driven implementations. Why? In the non-"Shoup's" case the
 * memory access pattern is segmented in such a manner that it's
 * trivial to see that cache-timing information can reveal a fair
 * portion of the intermediate hash value. Given that the ciphertext is
 * always available to an attacker, the attacker can attempt to deduce
 * the secret parameter H and, if successful, tamper with messages
 * [which is nothing but trivial in CTR mode]. In the "Shoup's" case
 * it's not as trivial, but there is no reason to believe it's
 * resistant to cache-timing attacks either. As for the "8-bit"
 * implementation, it consumes 16 (sixteen) times more memory, 4KB per
 * individual key + 1KB shared. On the pro side, it should be about
 * twice as fast as the "4-bit" version, and for gcc-generated x86[_64]
 * code the "8-bit" version was observed to run ~75% faster, closer to
 * 100% for commercial compilers... Yet the "4-bit" procedure is
 * preferred, because it's believed to provide a better
 * security-performance balance and adequate all-round performance.
 * "All-round" refers to things like:
 *
 * - shorter setup time effectively improves overall timing for
 *   handling short messages;
 * - larger table allocation can become unbearable because of VM
 *   subsystem penalties (for example on Windows a large enough free
 *   results in VM working-set trimming, meaning that a subsequent
 *   malloc would immediately incur working-set expansion);
 * - a larger table has a larger cache footprint, which can affect the
 *   performance of other code paths (not necessarily even from the
 *   same thread in a Hyper-Threading world);
 *
 * A value of 1 is not appropriate, for performance reasons.
 */
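/*
 * Concretely, from the declarations below: TABLE_BITS==8 keeps
 * Htable[256], i.e. 256*16 = 4KB per key; TABLE_BITS==4 keeps
 * Htable[16], i.e. 256 bytes per key; TABLE_BITS==1 keeps no table at
 * all.
 */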
#if	TABLE_BITS==8

static void gcm_init_8bit(u128 Htable[256], u64 H[2])
{
	int  i, j;
	u128 V;

	Htable[0].hi = 0;
	Htable[0].lo = 0;
	V.hi = H[0];
	V.lo = H[1];

	for (Htable[128]=V, i=64; i>0; i>>=1) {
		REDUCE1BIT(V);
		Htable[i] = V;
	}

	for (i=2; i<256; i<<=1) {
		u128 *Hi = Htable+i, H0 = *Hi;
		for (j=1; j<i; ++j) {
			Hi[j].hi = H0.hi^Htable[j].hi;
			Hi[j].lo = H0.lo^Htable[j].lo;
		}
	}
}

static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
{
	u128 Z = { 0, 0};
	const u8 *xi = (const u8 *)Xi+15;
	size_t rem, n = *xi;
	const union { long one; char little; } is_endian = {1};
	static const size_t rem_8bit[256] = {
		PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
		PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
		PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
		PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
		PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
		PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
		PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
		PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
		PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
		PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
		PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
		PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
		PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
		PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
		PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
		PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
		PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
		PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
		PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
		PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
		PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
		PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
		PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
		PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
		PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
		PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
		PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
		PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
		PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
		PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
		PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
		PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
		PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
		PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
		PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
		PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
		PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
		PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
		PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
		PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
		PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
		PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
		PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
		PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
		PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
		PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
		PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
		PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
		PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
		PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
		PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
		PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
		PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
		PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
		PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
		PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
		PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
		PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
		PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
		PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
		PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
		PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
		PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
		PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };

	while (1) {
		Z.hi ^= Htable[n].hi;
		Z.lo ^= Htable[n].lo;

		if ((u8 *)Xi==xi)	break;

		n = *(--xi);

		rem  = (size_t)Z.lo&0xff;
		Z.lo = (Z.hi<<56)|(Z.lo>>8);
		Z.hi = (Z.hi>>8);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_8bit[rem];
		else
			Z.hi ^= (u64)rem_8bit[rem]<<32;
	}

	if (is_endian.little) {
#ifdef BSWAP8
		Xi[0] = BSWAP8(Z.hi);
		Xi[1] = BSWAP8(Z.lo);
#else
		u8 *p = (u8 *)Xi;
		u32 v;
		v = (u32)(Z.hi>>32);	PUTU32(p,v);
		v = (u32)(Z.hi);	PUTU32(p+4,v);
		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
		v = (u32)(Z.lo);	PUTU32(p+12,v);
#endif
	}
	else {
		Xi[0] = Z.hi;
		Xi[1] = Z.lo;
	}
}
#define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)

#elif	TABLE_BITS==4

static void gcm_init_4bit(u128 Htable[16], u64 H[2])
{
	u128 V;
#if defined(OPENSSL_SMALL_FOOTPRINT)
	int  i;
#endif

	Htable[0].hi = 0;
	Htable[0].lo = 0;
	V.hi = H[0];
	V.lo = H[1];

#if defined(OPENSSL_SMALL_FOOTPRINT)
	for (Htable[8]=V, i=4; i>0; i>>=1) {
		REDUCE1BIT(V);
		Htable[i] = V;
	}

	for (i=2; i<16; i<<=1) {
		u128 *Hi = Htable+i;
		int   j;
		for (V=*Hi, j=1; j<i; ++j) {
			Hi[j].hi = V.hi^Htable[j].hi;
			Hi[j].lo = V.lo^Htable[j].lo;
		}
	}
#else
	Htable[8] = V;
	REDUCE1BIT(V);
	Htable[4] = V;
	REDUCE1BIT(V);
	Htable[2] = V;
	REDUCE1BIT(V);
	Htable[1] = V;
	Htable[3].hi  = V.hi^Htable[2].hi, Htable[3].lo  = V.lo^Htable[2].lo;
	V=Htable[4];
	Htable[5].hi  = V.hi^Htable[1].hi, Htable[5].lo  = V.lo^Htable[1].lo;
	Htable[6].hi  = V.hi^Htable[2].hi, Htable[6].lo  = V.lo^Htable[2].lo;
	Htable[7].hi  = V.hi^Htable[3].hi, Htable[7].lo  = V.lo^Htable[3].lo;
	V=Htable[8];
	Htable[9].hi  = V.hi^Htable[1].hi, Htable[9].lo  = V.lo^Htable[1].lo;
	Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
	Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
	Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
	Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
	Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
	Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
#endif
#if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
	/*
	 * ARM assembler expects specific dword order in Htable.
	 */
	{
	int j;
	const union { long one; char little; } is_endian = {1};

	if (is_endian.little)
		for (j=0;j<16;++j) {
			V = Htable[j];
			Htable[j].hi = V.lo;
			Htable[j].lo = V.hi;
		}
	else
		for (j=0;j<16;++j) {
			V = Htable[j];
			Htable[j].hi = V.lo<<32|V.lo>>32;
			Htable[j].lo = V.hi<<32|V.hi>>32;
		}
	}
#endif
}
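
/*
 * Note on the table just built: by construction, Htable[i] holds the
 * field product i*H for the 4-bit index i read in reflected bit order.
 * Htable[8] is H itself, Htable[4] is H*x, Htable[2] is H*x^2,
 * Htable[1] is H*x^3, and every composite index is the XOR of the
 * corresponding powers, e.g. Htable[5] = Htable[4] ^ Htable[1].
 */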

#ifndef GHASH_ASM
static const size_t rem_4bit[16] = {
	PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
	PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
	PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
	PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
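
#if 0
/*
 * Illustrative sketch (kept out of the build): each rem_4bit entry is
 * the carry-less product of its 4-bit index with the reduction
 * constant 0xE1, pre-shifted by 5 bits so that, after PACK(), it can
 * be XORed straight into the top of Z.hi.  A hypothetical generator:
 */
static void gen_rem_4bit(unsigned int tab[16])
{
	int i, b;

	for (i = 0; i < 16; ++i) {
		unsigned int r = 0;
		for (b = 0; b < 4; ++b)
			if (i & (1 << b))
				r ^= 0xE1u << b;	/* carry-less i * 0xE1 */
		tab[i] = r << 5;			/* e.g. tab[1] == 0x1C20 */
	}
}
#endif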

static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
{
	u128 Z;
	int cnt = 15;
	size_t rem, nlo, nhi;
	const union { long one; char little; } is_endian = {1};

	nlo  = ((const u8 *)Xi)[15];
	nhi  = nlo>>4;
	nlo &= 0xf;

	Z.hi = Htable[nlo].hi;
	Z.lo = Htable[nlo].lo;

	while (1) {
		rem  = (size_t)Z.lo&0xf;
		Z.lo = (Z.hi<<60)|(Z.lo>>4);
		Z.hi = (Z.hi>>4);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_4bit[rem];
		else
			Z.hi ^= (u64)rem_4bit[rem]<<32;

		Z.hi ^= Htable[nhi].hi;
		Z.lo ^= Htable[nhi].lo;

		if (--cnt<0)		break;

		nlo  = ((const u8 *)Xi)[cnt];
		nhi  = nlo>>4;
		nlo &= 0xf;

		rem  = (size_t)Z.lo&0xf;
		Z.lo = (Z.hi<<60)|(Z.lo>>4);
		Z.hi = (Z.hi>>4);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_4bit[rem];
		else
			Z.hi ^= (u64)rem_4bit[rem]<<32;

		Z.hi ^= Htable[nlo].hi;
		Z.lo ^= Htable[nlo].lo;
	}

	if (is_endian.little) {
#ifdef BSWAP8
		Xi[0] = BSWAP8(Z.hi);
		Xi[1] = BSWAP8(Z.lo);
#else
		u8 *p = (u8 *)Xi;
		u32 v;
		v = (u32)(Z.hi>>32);	PUTU32(p,v);
		v = (u32)(Z.hi);	PUTU32(p+4,v);
		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
		v = (u32)(Z.lo);	PUTU32(p+12,v);
#endif
	}
	else {
		Xi[0] = Z.hi;
		Xi[1] = Z.lo;
	}
}

#if !defined(OPENSSL_SMALL_FOOTPRINT)
/*
 * Streamed variant of gcm_gmult_4bit; see CRYPTO_gcm128_[en|de]crypt
 * for details... The compiler-generated code doesn't seem to give any
 * performance improvement, at least not on x86[_64]. It's here mostly
 * as a reference and a placeholder for possible future non-trivial
 * optimization[s]...
 */
static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)
{
    u128 Z;
    int cnt;
    size_t rem, nlo, nhi;
    const union { long one; char little; } is_endian = {1};

#if 1
    do {
	cnt  = 15;
	nlo  = ((const u8 *)Xi)[15];
	nlo ^= inp[15];
	nhi  = nlo>>4;
	nlo &= 0xf;

	Z.hi = Htable[nlo].hi;
	Z.lo = Htable[nlo].lo;

	while (1) {
		rem  = (size_t)Z.lo&0xf;
		Z.lo = (Z.hi<<60)|(Z.lo>>4);
		Z.hi = (Z.hi>>4);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_4bit[rem];
		else
			Z.hi ^= (u64)rem_4bit[rem]<<32;

		Z.hi ^= Htable[nhi].hi;
		Z.lo ^= Htable[nhi].lo;

		if (--cnt<0)		break;

		nlo  = ((const u8 *)Xi)[cnt];
		nlo ^= inp[cnt];
		nhi  = nlo>>4;
		nlo &= 0xf;

		rem  = (size_t)Z.lo&0xf;
		Z.lo = (Z.hi<<60)|(Z.lo>>4);
		Z.hi = (Z.hi>>4);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_4bit[rem];
		else
			Z.hi ^= (u64)rem_4bit[rem]<<32;

		Z.hi ^= Htable[nlo].hi;
		Z.lo ^= Htable[nlo].lo;
	}
#else
    /*
     * Extra 256+16 bytes per-key plus 512 bytes shared tables
     * [should] give ~50% improvement... One could have PACK()-ed
     * the rem_8bit even here, but the priority is to minimize
     * cache footprint...
     */
    u128 Hshr4[16];	/* Htable shifted right by 4 bits */
    u8   Hshl4[16];	/* Htable shifted left  by 4 bits */
    static const unsigned short rem_8bit[256] = {
	0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
	0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
	0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
	0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
	0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
	0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
	0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
	0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
	0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
	0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
	0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
	0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
	0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
	0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
	0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
	0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
	0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
	0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
	0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
	0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
	0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
	0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
	0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
	0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
	0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
	0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
	0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
	0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
	0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
	0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
	0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
	0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
    /*
     * This pre-processing phase slows the procedure down by roughly as
     * much as it speeds up each loop iteration. In other words,
     * single-block performance is about the same as for the
     * straightforward "4-bit" implementation, and from there it only
     * gets faster...
     */
    for (cnt=0; cnt<16; ++cnt) {
	Z.hi = Htable[cnt].hi;
	Z.lo = Htable[cnt].lo;
	Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
	Hshr4[cnt].hi = (Z.hi>>4);
	Hshl4[cnt]    = (u8)(Z.lo<<4);
    }

    do {
	for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
		nlo  = ((const u8 *)Xi)[cnt];
		nlo ^= inp[cnt];
		nhi  = nlo>>4;
		nlo &= 0xf;

		Z.hi ^= Htable[nlo].hi;
		Z.lo ^= Htable[nlo].lo;

		rem = (size_t)Z.lo&0xff;

		Z.lo = (Z.hi<<56)|(Z.lo>>8);
		Z.hi = (Z.hi>>8);

		Z.hi ^= Hshr4[nhi].hi;
		Z.lo ^= Hshr4[nhi].lo;
		Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
	}

	nlo  = ((const u8 *)Xi)[0];
	nlo ^= inp[0];
	nhi  = nlo>>4;
	nlo &= 0xf;

	Z.hi ^= Htable[nlo].hi;
	Z.lo ^= Htable[nlo].lo;

	rem = (size_t)Z.lo&0xf;

	Z.lo = (Z.hi<<60)|(Z.lo>>4);
	Z.hi = (Z.hi>>4);

	Z.hi ^= Htable[nhi].hi;
	Z.lo ^= Htable[nhi].lo;
	Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
#endif

	if (is_endian.little) {
#ifdef BSWAP8
		Xi[0] = BSWAP8(Z.hi);
		Xi[1] = BSWAP8(Z.lo);
#else
		u8 *p = (u8 *)Xi;
		u32 v;
		v = (u32)(Z.hi>>32);	PUTU32(p,v);
		v = (u32)(Z.hi);	PUTU32(p+4,v);
		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
		v = (u32)(Z.lo);	PUTU32(p+12,v);
#endif
	}
	else {
		Xi[0] = Z.hi;
		Xi[1] = Z.lo;
	}
    } while (inp+=16, len-=16);
}
#endif
#else
void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#endif

#define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
#if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
#define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
/* GHASH_CHUNK is a "stride parameter" meant to mitigate cache
 * thrashing. In other words, the idea is to hash data while it's
 * still in the L1 cache after the encryption pass... */
#define GHASH_CHUNK       (3*1024)
#endif
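
/*
 * Sketch of how the bulk loops below use GHASH_CHUNK (see
 * CRYPTO_gcm128_encrypt): encrypt a 3KB chunk with CTR, then GHASH the
 * same 3KB while the ciphertext is still hot in L1, and repeat.
 * ctr_encrypt here is shorthand, not a real function in this file:
 *
 *	while (len >= GHASH_CHUNK) {
 *		ctr_encrypt(out, in, GHASH_CHUNK);	(ciphertext lands in L1)
 *		GHASH(ctx, out, GHASH_CHUNK);		(hash it while hot)
 *		in  += GHASH_CHUNK;
 *		out += GHASH_CHUNK;
 *		len -= GHASH_CHUNK;
 *	}
 */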

#else	/* TABLE_BITS */

static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
{
	u128 V,Z = { 0,0 };
	long X;
	int  i,j;
	const long *xi = (const long *)Xi;
	const union { long one; char little; } is_endian = {1};

	V.hi = H[0];	/* H is in host byte order, no byte swapping */
	V.lo = H[1];

	for (j=0; j<16/sizeof(long); ++j) {
		if (is_endian.little) {
			if (sizeof(long)==8) {
#ifdef BSWAP8
				X = (long)(BSWAP8(xi[j]));
#else
				const u8 *p = (const u8 *)(xi+j);
				X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
#endif
			}
			else {
				const u8 *p = (const u8 *)(xi+j);
				X = (long)GETU32(p);
			}
		}
		else
			X = xi[j];

		for (i=0; i<8*sizeof(long); ++i, X<<=1) {
			u64 M = (u64)(X>>(8*sizeof(long)-1));
			Z.hi ^= V.hi&M;
			Z.lo ^= V.lo&M;

			REDUCE1BIT(V);
		}
	}

	if (is_endian.little) {
#ifdef BSWAP8
		Xi[0] = BSWAP8(Z.hi);
		Xi[1] = BSWAP8(Z.lo);
#else
		u8 *p = (u8 *)Xi;
		u32 v;
		v = (u32)(Z.hi>>32);	PUTU32(p,v);
		v = (u32)(Z.hi);	PUTU32(p+4,v);
		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
		v = (u32)(Z.lo);	PUTU32(p+12,v);
#endif
	}
	else {
		Xi[0] = Z.hi;
		Xi[1] = Z.lo;
	}
}
#define GCM_MUL(ctx,Xi)	  gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)

#endif

#if	TABLE_BITS==4 && defined(GHASH_ASM)
# if	!defined(I386_ONLY) && \
	(defined(__i386)	|| defined(__i386__)	|| \
	 defined(__x86_64)	|| defined(__x86_64__)	|| \
	 defined(_M_IX86)	|| defined(_M_AMD64)	|| defined(_M_X64))
#  define GHASH_ASM_X86_OR_64
#  define GCM_FUNCREF_4BIT
extern unsigned int OPENSSL_ia32cap_P[2];

void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);

#  if	defined(__i386) || defined(__i386__) || defined(_M_IX86)
#   define GHASH_ASM_X86
void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);

void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#  endif
# elif defined(__arm__) || defined(__arm)
#  include "arm_arch.h"
#  if __ARM_ARCH__>=7
#   define GHASH_ASM_ARM
#   define GCM_FUNCREF_4BIT
void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#  endif
# endif
#endif

#ifdef GCM_FUNCREF_4BIT
# undef  GCM_MUL
# define GCM_MUL(ctx,Xi)	(*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
# ifdef GHASH
#  undef  GHASH
#  define GHASH(ctx,in,len)	(*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
# endif
#endif

void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
{
	const union { long one; char little; } is_endian = {1};

	memset(ctx,0,sizeof(*ctx));
	ctx->block = block;
	ctx->key   = key;

	(*block)(ctx->H.c,ctx->H.c,key);

	if (is_endian.little) {
		/* H is stored in host byte order */
#ifdef BSWAP8
		ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
		ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
#else
		u8 *p = ctx->H.c;
		u64 hi,lo;
		hi = (u64)GETU32(p)  <<32|GETU32(p+4);
		lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
		ctx->H.u[0] = hi;
		ctx->H.u[1] = lo;
#endif
	}

#if	TABLE_BITS==8
	gcm_init_8bit(ctx->Htable,ctx->H.u);
#elif	TABLE_BITS==4
# if	defined(GHASH_ASM_X86_OR_64)
#  if	!defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
	if (OPENSSL_ia32cap_P[0]&(1<<24) &&	/* check FXSR bit */
	    OPENSSL_ia32cap_P[1]&(1<<1) ) {	/* check PCLMULQDQ bit */
		gcm_init_clmul(ctx->Htable,ctx->H.u);
		ctx->gmult = gcm_gmult_clmul;
		ctx->ghash = gcm_ghash_clmul;
		return;
	}
#  endif
	gcm_init_4bit(ctx->Htable,ctx->H.u);
#  if	defined(GHASH_ASM_X86)			/* x86 only */
#   if	defined(OPENSSL_IA32_SSE2)
	if (OPENSSL_ia32cap_P[0]&(1<<25)) {	/* check SSE bit */
#   else
	if (OPENSSL_ia32cap_P[0]&(1<<23)) {	/* check MMX bit */
#   endif
		ctx->gmult = gcm_gmult_4bit_mmx;
		ctx->ghash = gcm_ghash_4bit_mmx;
	} else {
		ctx->gmult = gcm_gmult_4bit_x86;
		ctx->ghash = gcm_ghash_4bit_x86;
	}
#  else
	ctx->gmult = gcm_gmult_4bit;
	ctx->ghash = gcm_ghash_4bit;
#  endif
# elif	defined(GHASH_ASM_ARM)
	if (OPENSSL_armcap_P & ARMV7_NEON) {
		ctx->gmult = gcm_gmult_neon;
		ctx->ghash = gcm_ghash_neon;
	} else {
		gcm_init_4bit(ctx->Htable,ctx->H.u);
		ctx->gmult = gcm_gmult_4bit;
		ctx->ghash = gcm_ghash_4bit;
	}
# else
	gcm_init_4bit(ctx->Htable,ctx->H.u);
# endif
#endif
}
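
#if 0
/*
 * Illustrative usage sketch (kept out of the build): one-shot AES-GCM
 * with this API, mirroring the SELFTEST code at the bottom of this
 * file.  The identifiers key_bytes/iv/aad/pt/ct/tag and their lengths
 * are hypothetical placeholders.
 */
{
	AES_KEY aes;		/* from <openssl/aes.h> */
	GCM128_CONTEXT gcm;

	AES_set_encrypt_key(key_bytes, 128, &aes);
	CRYPTO_gcm128_init(&gcm, &aes, (block128_f)AES_encrypt);
	CRYPTO_gcm128_setiv(&gcm, iv, iv_len);
	CRYPTO_gcm128_aad(&gcm, aad, aad_len);
	CRYPTO_gcm128_encrypt(&gcm, pt, ct, pt_len);
	CRYPTO_gcm128_tag(&gcm, tag, 16);
	/* Decryption is symmetric: CRYPTO_gcm128_decrypt(), then
	 * CRYPTO_gcm128_finish() to compare against the expected tag
	 * (returns 0 on match). */
}
#endif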
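/*
 * Per the GCM specification: a 96-bit IV is used directly as
 * Y0 = IV || 0^31 || 1 (the fast path below); any other IV length is
 * hashed instead, Y0 = GHASH(IV padded with zeros to a 16-byte
 * boundary || [0]64 || [len(IV) in bits]64), which is what the len0
 * handling in the else branch implements.
 */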
void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int ctr;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
#endif

	ctx->Yi.u[0]  = 0;
	ctx->Yi.u[1]  = 0;
	ctx->Xi.u[0]  = 0;
	ctx->Xi.u[1]  = 0;
	ctx->len.u[0] = 0;	/* AAD length */
	ctx->len.u[1] = 0;	/* message length */
	ctx->ares = 0;
	ctx->mres = 0;

	if (len==12) {
		memcpy(ctx->Yi.c,iv,12);
		ctx->Yi.c[15]=1;
		ctr=1;
	}
	else {
		size_t i;
		u64 len0 = len;

		while (len>=16) {
			for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
			GCM_MUL(ctx,Yi);
			iv += 16;
			len -= 16;
		}
		if (len) {
			for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
			GCM_MUL(ctx,Yi);
		}
		len0 <<= 3;
		if (is_endian.little) {
#ifdef BSWAP8
			ctx->Yi.u[1]  ^= BSWAP8(len0);
#else
			ctx->Yi.c[8]  ^= (u8)(len0>>56);
			ctx->Yi.c[9]  ^= (u8)(len0>>48);
			ctx->Yi.c[10] ^= (u8)(len0>>40);
			ctx->Yi.c[11] ^= (u8)(len0>>32);
			ctx->Yi.c[12] ^= (u8)(len0>>24);
			ctx->Yi.c[13] ^= (u8)(len0>>16);
			ctx->Yi.c[14] ^= (u8)(len0>>8);
			ctx->Yi.c[15] ^= (u8)(len0);
#endif
		}
		else
			ctx->Yi.u[1]  ^= len0;

		GCM_MUL(ctx,Yi);

		if (is_endian.little)
#ifdef BSWAP4
			ctr = BSWAP4(ctx->Yi.d[3]);
#else
			ctr = GETU32(ctx->Yi.c+12);
#endif
		else
			ctr = ctx->Yi.d[3];
	}

	(*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
	++ctr;
	if (is_endian.little)
#ifdef BSWAP4
		ctx->Yi.d[3] = BSWAP4(ctr);
#else
		PUTU32(ctx->Yi.c+12,ctr);
#endif
	else
		ctx->Yi.d[3] = ctr;
}

int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
{
	size_t i;
	unsigned int n;
	u64 alen = ctx->len.u[0];
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	if (ctx->len.u[1]) return -2;

	alen += len;
	if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
		return -1;
	ctx->len.u[0] = alen;

	n = ctx->ares;
	if (n) {
		while (n && len) {
			ctx->Xi.c[n] ^= *(aad++);
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL(ctx,Xi);
		else {
			ctx->ares = n;
			return 0;
		}
	}

#ifdef GHASH
	if ((i = (len&(size_t)-16))) {
		GHASH(ctx,aad,i);
		aad += i;
		len -= i;
	}
#else
	while (len>=16) {
		for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
		GCM_MUL(ctx,Xi);
		aad += 16;
		len -= 16;
	}
#endif
	if (len) {
		n = (unsigned int)len;
		for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
	}

	ctx->ares = n;
	return 0;
}

int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64        mlen  = ctx->len.u[1];
	block128_f block = ctx->block;
	void      *key   = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

#if 0
	n = (unsigned int)mlen%16; /* alternative to ctx->mres */
#endif
	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to encrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	if (is_endian.little)
#ifdef BSWAP4
		ctr = BSWAP4(ctx->Yi.d[3]);
#else
		ctr = GETU32(ctx->Yi.c+12);
#endif
	else
		ctr = ctx->Yi.d[3];

	n = ctx->mres;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
	if (16%sizeof(size_t) == 0) do {	/* always true actually */
		if (n) {
			while (n && len) {
				ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
				--len;
				n = (n+1)%16;
			}
			if (n==0) GCM_MUL(ctx,Xi);
			else {
				ctx->mres = n;
				return 0;
			}
		}
#if defined(STRICT_ALIGNMENT)
		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
			break;
#endif
#if defined(GHASH) && defined(GHASH_CHUNK)
		while (len>=GHASH_CHUNK) {
		    size_t j=GHASH_CHUNK;

		    while (j) {
		    	size_t *out_t=(size_t *)out;
		    	const size_t *in_t=(const size_t *)in;

			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16/sizeof(size_t); ++i)
				out_t[i] = in_t[i] ^ ctx->EKi.t[i];
			out += 16;
			in  += 16;
			j   -= 16;
		    }
		    GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
		    len -= GHASH_CHUNK;
		}
		if ((i = (len&(size_t)-16))) {
		    size_t j=i;

		    while (len>=16) {
		    	size_t *out_t=(size_t *)out;
		    	const size_t *in_t=(const size_t *)in;

			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16/sizeof(size_t); ++i)
				out_t[i] = in_t[i] ^ ctx->EKi.t[i];
			out += 16;
			in  += 16;
			len -= 16;
		    }
		    GHASH(ctx,out-j,j);
		}
#else
		while (len>=16) {
		    	size_t *out_t=(size_t *)out;
		    	const size_t *in_t=(const size_t *)in;

			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16/sizeof(size_t); ++i)
				ctx->Xi.t[i] ^=
				out_t[i] = in_t[i]^ctx->EKi.t[i];
			GCM_MUL(ctx,Xi);
			out += 16;
			in  += 16;
			len -= 16;
		}
#endif
		if (len) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
			while (len--) {
				ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
				++n;
			}
		}

		ctx->mres = n;
		return 0;
	} while(0);
#endif
	for (i=0;i<len;++i) {
		if (n==0) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
		}
		ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
		n = (n+1)%16;
		if (n==0)
			GCM_MUL(ctx,Xi);
	}

	ctx->mres = n;
	return 0;
}

int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64        mlen  = ctx->len.u[1];
	block128_f block = ctx->block;
	void      *key   = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to decrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	if (is_endian.little)
#ifdef BSWAP4
		ctr = BSWAP4(ctx->Yi.d[3]);
#else
		ctr = GETU32(ctx->Yi.c+12);
#endif
	else
		ctr = ctx->Yi.d[3];

	n = ctx->mres;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
	if (16%sizeof(size_t) == 0) do {	/* always true actually */
		if (n) {
			while (n && len) {
				u8 c = *(in++);
				*(out++) = c^ctx->EKi.c[n];
				ctx->Xi.c[n] ^= c;
				--len;
				n = (n+1)%16;
			}
			if (n==0) GCM_MUL (ctx,Xi);
			else {
				ctx->mres = n;
				return 0;
			}
		}
#if defined(STRICT_ALIGNMENT)
		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
			break;
#endif
#if defined(GHASH) && defined(GHASH_CHUNK)
		while (len>=GHASH_CHUNK) {
		    size_t j=GHASH_CHUNK;

		    GHASH(ctx,in,GHASH_CHUNK);
		    while (j) {
		    	size_t *out_t=(size_t *)out;
		    	const size_t *in_t=(const size_t *)in;

			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16/sizeof(size_t); ++i)
				out_t[i] = in_t[i]^ctx->EKi.t[i];
			out += 16;
			in  += 16;
			j   -= 16;
		    }
		    len -= GHASH_CHUNK;
		}
		if ((i = (len&(size_t)-16))) {
		    GHASH(ctx,in,i);
		    while (len>=16) {
		    	size_t *out_t=(size_t *)out;
		    	const size_t *in_t=(const size_t *)in;

			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16/sizeof(size_t); ++i)
				out_t[i] = in_t[i]^ctx->EKi.t[i];
			out += 16;
			in  += 16;
			len -= 16;
		    }
		}
#else
		while (len>=16) {
		    	size_t *out_t=(size_t *)out;
		    	const size_t *in_t=(const size_t *)in;

			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16/sizeof(size_t); ++i) {
				size_t c = in[i];
				out[i] = c^ctx->EKi.t[i];
				ctx->Xi.t[i] ^= c;
			}
			GCM_MUL(ctx,Xi);
			out += 16;
			in  += 16;
			len -= 16;
		}
#endif
		if (len) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
			while (len--) {
				u8 c = in[n];
				ctx->Xi.c[n] ^= c;
				out[n] = c^ctx->EKi.c[n];
				++n;
			}
		}

		ctx->mres = n;
		return 0;
	} while(0);
#endif
	for (i=0;i<len;++i) {
		u8 c;
		if (n==0) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
		}
		c = in[i];
		out[i] = c^ctx->EKi.c[n];
		ctx->Xi.c[n] ^= c;
		n = (n+1)%16;
		if (n==0)
			GCM_MUL(ctx,Xi);
	}

	ctx->mres = n;
	return 0;
}

int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len, ctr128_f stream)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64   mlen = ctx->len.u[1];
	void *key  = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to encrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	if (is_endian.little)
#ifdef BSWAP4
		ctr = BSWAP4(ctx->Yi.d[3]);
#else
		ctr = GETU32(ctx->Yi.c+12);
#endif
	else
		ctr = ctx->Yi.d[3];

	n = ctx->mres;
	if (n) {
		while (n && len) {
			ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL(ctx,Xi);
		else {
			ctx->mres = n;
			return 0;
		}
	}
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
	while (len>=GHASH_CHUNK) {
		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
		ctr += GHASH_CHUNK/16;
		if (is_endian.little)
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
		else
			ctx->Yi.d[3] = ctr;
		GHASH(ctx,out,GHASH_CHUNK);
		out += GHASH_CHUNK;
		in  += GHASH_CHUNK;
		len -= GHASH_CHUNK;
	}
#endif
	if ((i = (len&(size_t)-16))) {
		size_t j=i/16;

		(*stream)(in,out,j,key,ctx->Yi.c);
		ctr += (unsigned int)j;
		if (is_endian.little)
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
		else
			ctx->Yi.d[3] = ctr;
		in  += i;
		len -= i;
#if defined(GHASH)
		GHASH(ctx,out,i);
		out += i;
#else
		while (j--) {
			for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
			GCM_MUL(ctx,Xi);
			out += 16;
		}
#endif
	}
	if (len) {
		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
		++ctr;
		if (is_endian.little)
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
		else
			ctx->Yi.d[3] = ctr;
		while (len--) {
			ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
			++n;
		}
	}

	ctx->mres = n;
	return 0;
}

int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len,ctr128_f stream)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64   mlen = ctx->len.u[1];
	void *key  = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to decrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	if (is_endian.little)
#ifdef BSWAP4
		ctr = BSWAP4(ctx->Yi.d[3]);
#else
		ctr = GETU32(ctx->Yi.c+12);
#endif
	else
		ctr = ctx->Yi.d[3];

	n = ctx->mres;
	if (n) {
		while (n && len) {
			u8 c = *(in++);
			*(out++) = c^ctx->EKi.c[n];
			ctx->Xi.c[n] ^= c;
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL (ctx,Xi);
		else {
			ctx->mres = n;
			return 0;
		}
	}
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
	while (len>=GHASH_CHUNK) {
		GHASH(ctx,in,GHASH_CHUNK);
		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
		ctr += GHASH_CHUNK/16;
		if (is_endian.little)
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
		else
			ctx->Yi.d[3] = ctr;
		out += GHASH_CHUNK;
		in  += GHASH_CHUNK;
		len -= GHASH_CHUNK;
	}
#endif
	if ((i = (len&(size_t)-16))) {
		size_t j=i/16;

#if defined(GHASH)
		GHASH(ctx,in,i);
#else
		while (j--) {
			size_t k;
			for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
			GCM_MUL(ctx,Xi);
			in += 16;
		}
		j   = i/16;
		in -= i;
#endif
		(*stream)(in,out,j,key,ctx->Yi.c);
		ctr += (unsigned int)j;
		if (is_endian.little)
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
		else
			ctx->Yi.d[3] = ctr;
		out += i;
		in  += i;
		len -= i;
	}
	if (len) {
		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
		++ctr;
		if (is_endian.little)
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
		else
			ctx->Yi.d[3] = ctr;
		while (len--) {
			u8 c = in[n];
			ctx->Xi.c[n] ^= c;
			out[n] = c^ctx->EKi.c[n];
			++n;
		}
	}

	ctx->mres = n;
	return 0;
}

int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
			size_t len)
{
	const union { long one; char little; } is_endian = {1};
	u64 alen = ctx->len.u[0]<<3;
	u64 clen = ctx->len.u[1]<<3;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
#endif

	if (ctx->mres || ctx->ares)
		GCM_MUL(ctx,Xi);

	if (is_endian.little) {
#ifdef BSWAP8
		alen = BSWAP8(alen);
		clen = BSWAP8(clen);
#else
		u8 *p = ctx->len.c;

		ctx->len.u[0] = alen;
		ctx->len.u[1] = clen;

		alen = (u64)GETU32(p)  <<32|GETU32(p+4);
		clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
#endif
	}

	ctx->Xi.u[0] ^= alen;
	ctx->Xi.u[1] ^= clen;
	GCM_MUL(ctx,Xi);

	ctx->Xi.u[0] ^= ctx->EK0.u[0];
	ctx->Xi.u[1] ^= ctx->EK0.u[1];

	if (tag && len<=sizeof(ctx->Xi))
		return memcmp(ctx->Xi.c,tag,len);
	else
		return -1;
}

void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
{
	CRYPTO_gcm128_finish(ctx, NULL, 0);
	memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
}

GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
{
	GCM128_CONTEXT *ret;

	if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT))))
		CRYPTO_gcm128_init(ret,key,block);

	return ret;
}

void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
{
	if (ctx) {
		OPENSSL_cleanse(ctx,sizeof(*ctx));
		OPENSSL_free(ctx);
	}
}

#if defined(SELFTEST)
#include <stdio.h>
#include <openssl/aes.h>

/* Test Case 1 */
static const u8	K1[16],
		*P1=NULL,
		*A1=NULL,
		IV1[12],
		*C1=NULL,
		T1[]=  {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};

/* Test Case 2 */
#define K2 K1
#define A2 A1
#define IV2 IV1
static const u8	P2[16],
		C2[]=  {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
		T2[]=  {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};

/* Test Case 3 */
#define A3 A2
static const u8	K3[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
		P3[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
		IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
		C3[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
			0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
			0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
			0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
		T3[]=  {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4};

/* Test Case 4 */
#define K4 K3
#define IV4 IV3
static const u8	P4[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
		A4[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
			0xab,0xad,0xda,0xd2},
		C4[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
			0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
			0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
			0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
		T4[]=  {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};

/* Test Case 5 */
#define K5 K4
#define P5 P4
#define A5 A4
static const u8	IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
		C5[]=  {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
			0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
			0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
			0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
		T5[]=  {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};

/* Test Case 6 */
#define K6 K5
#define P6 P5
#define A6 A5
static const u8	IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
			0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
			0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
			0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
		C6[]=  {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
			0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
			0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
			0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
		T6[]=  {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};

/* Test Case 7 */
static const u8 K7[24],
		*P7=NULL,
		*A7=NULL,
		IV7[12],
		*C7=NULL,
		T7[]=  {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};

/* Test Case 8 */
#define K8 K7
#define IV8 IV7
#define A8 A7
static const u8	P8[16],
		C8[]=  {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
		T8[]=  {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};

/* Test Case 9 */
#define A9 A8
static const u8	K9[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
			0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
		P9[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
		IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
		C9[]=  {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
			0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
			0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
			0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
		T9[]=  {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};

/* Test Case 10 */
#define K10 K9
#define IV10 IV9
static const u8	P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
		A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
			0xab,0xad,0xda,0xd2},
		C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
			0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
			0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
			0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
		T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};

/* Test Case 11 */
#define K11 K10
#define P11 P10
#define A11 A10
static const u8	IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
		C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
			0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
			0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
			0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
		T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};

/* Test Case 12 */
#define K12 K11
#define P12 P11
#define A12 A11
static const u8	IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
			0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
			0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
			0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
		C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
			0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
			0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
			0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
		T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};

/* Test Case 13 */
static const u8	K13[32],
		*P13=NULL,
		*A13=NULL,
		IV13[12],
		*C13=NULL,
		T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};

/* Test Case 14 */
#define K14 K13
#define A14 A13
static const u8	P14[16],
		IV14[12],
		C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
		T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};

/* Test Case 15 */
#define A15 A14
static const u8	K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
			0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
		P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
		IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
		C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
			0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
			0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
			0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
		T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};

/* Test Case 16 */
#define K16 K15
#define IV16 IV15
static const u8	P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
		A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
			0xab,0xad,0xda,0xd2},
		C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
			0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
			0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
			0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
		T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};

/* Test Case 17 */
#define K17 K16
#define P17 P16
#define A17 A16
static const u8	IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
		C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
			0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
			0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
			0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
		T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};

/* Test Case 18 */
#define K18 K17
#define P18 P17
#define A18 A17
static const u8	IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
			0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
			0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
			0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
		C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
			0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
			0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
			0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
		T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};

/* Test Case 19 */
#define K19 K1
#define P19 P1
#define IV19 IV1
#define C19 C1
static const u8 A19[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55,
			0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
			0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
			0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
			0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
		T19[]= {0x5f,0xea,0x79,0x3a,0x2d,0x6f,0x97,0x4d,0x37,0xe6,0x8e,0x0c,0xb8,0xff,0x94,0x92};

/* Test Case 20 */
#define K20 K1
#define A20 A1
static const u8 IV20[64]={0xff,0xff,0xff,0xff},	/* this results in 0xff in counter LSB */
		P20[288],
		C20[]= {0x56,0xb3,0x37,0x3c,0xa9,0xef,0x6e,0x4a,0x2b,0x64,0xfe,0x1e,0x9a,0x17,0xb6,0x14,
			0x25,0xf1,0x0d,0x47,0xa7,0x5a,0x5f,0xce,0x13,0xef,0xc6,0xbc,0x78,0x4a,0xf2,0x4f,
			0x41,0x41,0xbd,0xd4,0x8c,0xf7,0xc7,0x70,0x88,0x7a,0xfd,0x57,0x3c,0xca,0x54,0x18,
			0xa9,0xae,0xff,0xcd,0x7c,0x5c,0xed,0xdf,0xc6,0xa7,0x83,0x97,0xb9,0xa8,0x5b,0x49,
			0x9d,0xa5,0x58,0x25,0x72,0x67,0xca,0xab,0x2a,0xd0,0xb2,0x3c,0xa4,0x76,0xa5,0x3c,
			0xb1,0x7f,0xb4,0x1c,0x4b,0x8b,0x47,0x5c,0xb4,0xf3,0xf7,0x16,0x50,0x94,0xc2,0x29,
			0xc9,0xe8,0xc4,0xdc,0x0a,0x2a,0x5f,0xf1,0x90,0x3e,0x50,0x15,0x11,0x22,0x13,0x76,
			0xa1,0xcd,0xb8,0x36,0x4c,0x50,0x61,0xa2,0x0c,0xae,0x74,0xbc,0x4a,0xcd,0x76,0xce,
			0xb0,0xab,0xc9,0xfd,0x32,0x17,0xef,0x9f,0x8c,0x90,0xbe,0x40,0x2d,0xdf,0x6d,0x86,
			0x97,0xf4,0xf8,0x80,0xdf,0xf1,0x5b,0xfb,0x7a,0x6b,0x28,0x24,0x1e,0xc8,0xfe,0x18,
			0x3c,0x2d,0x59,0xe3,0xf9,0xdf,0xff,0x65,0x3c,0x71,0x26,0xf0,0xac,0xb9,0xe6,0x42,
			0x11,0xf4,0x2b,0xae,0x12,0xaf,0x46,0x2b,0x10,0x70,0xbe,0xf1,0xab,0x5e,0x36,0x06,
			0x87,0x2c,0xa1,0x0d,0xee,0x15,0xb3,0x24,0x9b,0x1a,0x1b,0x95,0x8f,0x23,0x13,0x4c,
			0x4b,0xcc,0xb7,0xd0,0x32,0x00,0xbc,0xe4,0x20,0xa2,0xf8,0xeb,0x66,0xdc,0xf3,0x64,
			0x4d,0x14,0x23,0xc1,0xb5,0x69,0x90,0x03,0xc1,0x3e,0xce,0xf4,0xbf,0x38,0xa3,0xb6,
			0x0e,0xed,0xc3,0x40,0x33,0xba,0xc1,0x90,0x27,0x83,0xdc,0x6d,0x89,0xe2,0xe7,0x74,
			0x18,0x8a,0x43,0x9c,0x7e,0xbc,0xc0,0x67,0x2d,0xbd,0xa4,0xdd,0xcf,0xb2,0x79,0x46,
			0x13,0xb0,0xbe,0x41,0x31,0x5e,0xf7,0x78,0x70,0x8a,0x70,0xee,0x7d,0x75,0x16,0x5c},
		T20[]= {0x8b,0x30,0x7f,0x6b,0x33,0x28,0x6d,0x0a,0xb0,0x26,0xa9,0xed,0x3f,0xe1,0xe8,0x5f};

#define TEST_CASE(n)	do {					\
	u8 out[sizeof(P##n)];					\
	AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key);		\
	CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);	\
	CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));		\
	memset(out,0,sizeof(out));				\
	if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));	\
	if (P##n) CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out));	\
	if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||		\
	    (C##n && memcmp(out,C##n,sizeof(out))))		\
		ret++, printf ("encrypt test#%d failed.\n",n);	\
	CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));		\
	memset(out,0,sizeof(out));				\
	if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));	\
	if (C##n) CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out));	\
	if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||		\
	    (P##n && memcmp(out,P##n,sizeof(out))))		\
		ret++, printf ("decrypt test#%d failed.\n",n);	\
	} while(0)

int main()
{
	GCM128_CONTEXT ctx;
	AES_KEY key;
	int ret=0;

	TEST_CASE(1);
	TEST_CASE(2);
	TEST_CASE(3);
	TEST_CASE(4);
	TEST_CASE(5);
	TEST_CASE(6);
	TEST_CASE(7);
	TEST_CASE(8);
	TEST_CASE(9);
	TEST_CASE(10);
	TEST_CASE(11);
	TEST_CASE(12);
	TEST_CASE(13);
	TEST_CASE(14);
	TEST_CASE(15);
	TEST_CASE(16);
	TEST_CASE(17);
	TEST_CASE(18);
	TEST_CASE(19);
	TEST_CASE(20);

#ifdef OPENSSL_CPUID_OBJ
	{
	size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc();
	union { u64 u; u8 c[1024]; } buf;
	int i;

	AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
	CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
	CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1));

	CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
	start = OPENSSL_rdtsc();
	CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
	gcm_t = OPENSSL_rdtsc() - start;

	CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
			&key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
			(block128_f)AES_encrypt);
	start = OPENSSL_rdtsc();
	CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
			&key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
			(block128_f)AES_encrypt);
	ctr_t = OPENSSL_rdtsc() - start;

	printf("%.2f-%.2f=%.2f\n",
			gcm_t/(double)sizeof(buf),
			ctr_t/(double)sizeof(buf),
			(gcm_t-ctr_t)/(double)sizeof(buf));
#ifdef GHASH
	{
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx.ghash;

	GHASH((&ctx),buf.c,sizeof(buf));
	start = OPENSSL_rdtsc();
	for (i=0;i<100;++i) GHASH((&ctx),buf.c,sizeof(buf));
	gcm_t = OPENSSL_rdtsc() - start;
	printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i);
	}
#endif
	}
#endif

	return ret;
}
#endif