gcm128.c revision 296341
1/* ====================================================================
2 * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in
13 *    the documentation and/or other materials provided with the
14 *    distribution.
15 *
16 * 3. All advertising materials mentioning features or use of this
17 *    software must display the following acknowledgment:
18 *    "This product includes software developed by the OpenSSL Project
19 *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
20 *
21 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
22 *    endorse or promote products derived from this software without
23 *    prior written permission. For written permission, please contact
24 *    openssl-core@openssl.org.
25 *
26 * 5. Products derived from this software may not be called "OpenSSL"
27 *    nor may "OpenSSL" appear in their names without prior written
28 *    permission of the OpenSSL Project.
29 *
30 * 6. Redistributions of any form whatsoever must retain the following
31 *    acknowledgment:
32 *    "This product includes software developed by the OpenSSL Project
33 *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
34 *
35 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
36 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
39 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
41 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
42 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
43 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
44 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
45 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
46 * OF THE POSSIBILITY OF SUCH DAMAGE.
47 * ====================================================================
48 */
49
50#define OPENSSL_FIPSAPI
51
52#include <openssl/crypto.h>
53#include "modes_lcl.h"
54#include <string.h>
55
56#ifndef MODES_DEBUG
57# ifndef NDEBUG
58#  define NDEBUG
59# endif
60#endif
61#include <assert.h>
62
#if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
/*
 * Redefine GETU32/PUTU32 as a direct 32-bit load/store combined with
 * BSWAP4, because alignment is ensured.
 */
# undef  GETU32
# define GETU32(p)       BSWAP4(*(const u32 *)(p))
# undef  PUTU32
/* Parenthesized so the store expands as a single expression everywhere. */
# define PUTU32(p,v)     (*(u32 *)(p) = BSWAP4(v))
#endif
70
/* Place the 16-bit constant s in the 16 most significant bits of a size_t. */
#define PACK(s)         ((size_t)(s)<<(sizeof(size_t)*8-16))
/*
 * Shift the 128-bit value V (an lvalue with u64 members .hi and .lo) right
 * by one bit.  When the bit shifted out of V.lo was set, 0xe1 is XORed into
 * the most significant byte of V.hi.  The mask is computed as u64 when
 * size_t is 8 bytes wide and as u32 otherwise; both branches produce the
 * same result.
 *
 * V is evaluated several times, so it must not have side effects.  Every
 * use is parenthesized so that expressions such as *ptr or arr[i] can be
 * passed safely.
 */
#define REDUCE1BIT(V)   do { \
        if (sizeof(size_t)==8) { \
                u64 T = U64(0xe100000000000000) & (0-((V).lo&1)); \
                (V).lo  = ((V).hi<<63)|((V).lo>>1); \
                (V).hi  = ((V).hi>>1 )^T; \
        } \
        else { \
                u32 T = 0xe1000000U & (0-(u32)((V).lo&1)); \
                (V).lo  = ((V).hi<<63)|((V).lo>>1); \
                (V).hi  = ((V).hi>>1 )^((u64)T<<32); \
        } \
} while(0)
84
85/*-
86 * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
87 * never be set to 8. 8 is effectively reserved for testing purposes.
88 * TABLE_BITS>1 are lookup-table-driven implementations referred to as
89 * "Shoup's" in GCM specification. In other words OpenSSL does not cover
90 * whole spectrum of possible table driven implementations. Why? In
91 * non-"Shoup's" case memory access pattern is segmented in such manner,
92 * that it's trivial to see that cache timing information can reveal
93 * fair portion of intermediate hash value. Given that ciphertext is
94 * always available to attacker, it's possible for him to attempt to
95 * deduce secret parameter H and if successful, tamper with messages
96 * [which is nothing but trivial in CTR mode]. In "Shoup's" case it's
97 * not as trivial, but there is no reason to believe that it's resistant
98 * to cache-timing attack. And the thing about "8-bit" implementation is
99 * that it consumes 16 (sixteen) times more memory, 4KB per individual
100 * key + 1KB shared. Well, on pros side it should be twice as fast as
101 * "4-bit" version. And for gcc-generated x86[_64] code, "8-bit" version
102 * was observed to run ~75% faster, closer to 100% for commercial
103 * compilers... Yet "4-bit" procedure is preferred, because it's
104 * believed to provide better security-performance balance and adequate
105 * all-round performance. "All-round" refers to things like:
106 *
107 * - shorter setup time effectively improves overall timing for
108 *   handling short messages;
 * - larger table allocation can become unbearable because of VM
 *   subsystem penalties (for example, on Windows a large enough free
 *   results in VM working set trimming, meaning that a subsequent
 *   malloc would immediately incur working set expansion);
113 * - larger table has larger cache footprint, which can affect
114 *   performance of other code paths (not necessarily even from same
115 *   thread in Hyper-Threading world);
116 *
117 * Value of 1 is not appropriate for performance reasons.
118 */
119#if     TABLE_BITS==8
120
121static void gcm_init_8bit(u128 Htable[256], u64 H[2])
122{
123    int i, j;
124    u128 V;
125
126    Htable[0].hi = 0;
127    Htable[0].lo = 0;
128    V.hi = H[0];
129    V.lo = H[1];
130
131    for (Htable[128] = V, i = 64; i > 0; i >>= 1) {
132        REDUCE1BIT(V);
133        Htable[i] = V;
134    }
135
136    for (i = 2; i < 256; i <<= 1) {
137        u128 *Hi = Htable + i, H0 = *Hi;
138        for (j = 1; j < i; ++j) {
139            Hi[j].hi = H0.hi ^ Htable[j].hi;
140            Hi[j].lo = H0.lo ^ Htable[j].lo;
141        }
142    }
143}
144
145static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
146{
147    u128 Z = { 0, 0 };
148    const u8 *xi = (const u8 *)Xi + 15;
149    size_t rem, n = *xi;
150    const union {
151        long one;
152        char little;
153    } is_endian = {
154        1
155    };
156    static const size_t rem_8bit[256] = {
157        PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
158        PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
159        PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
160        PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
161        PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
162        PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
163        PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
164        PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
165        PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
166        PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
167        PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
168        PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
169        PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
170        PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
171        PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
172        PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
173        PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
174        PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
175        PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
176        PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
177        PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
178        PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
179        PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
180        PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
181        PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
182        PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
183        PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
184        PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
185        PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
186        PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
187        PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
188        PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
189        PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
190        PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
191        PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
192        PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
193        PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
194        PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
195        PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
196        PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
197        PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
198        PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
199        PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
200        PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
201        PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
202        PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
203        PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
204        PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
205        PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
206        PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
207        PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
208        PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
209        PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
210        PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
211        PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
212        PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
213        PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
214        PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
215        PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
216        PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
217        PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
218        PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
219        PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
220        PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE)
221    };
222
223    while (1) {
224        Z.hi ^= Htable[n].hi;
225        Z.lo ^= Htable[n].lo;
226
227        if ((u8 *)Xi == xi)
228            break;
229
230        n = *(--xi);
231
232        rem = (size_t)Z.lo & 0xff;
233        Z.lo = (Z.hi << 56) | (Z.lo >> 8);
234        Z.hi = (Z.hi >> 8);
235        if (sizeof(size_t) == 8)
236            Z.hi ^= rem_8bit[rem];
237        else
238            Z.hi ^= (u64)rem_8bit[rem] << 32;
239    }
240
241    if (is_endian.little) {
242# ifdef BSWAP8
243        Xi[0] = BSWAP8(Z.hi);
244        Xi[1] = BSWAP8(Z.lo);
245# else
246        u8 *p = (u8 *)Xi;
247        u32 v;
248        v = (u32)(Z.hi >> 32);
249        PUTU32(p, v);
250        v = (u32)(Z.hi);
251        PUTU32(p + 4, v);
252        v = (u32)(Z.lo >> 32);
253        PUTU32(p + 8, v);
254        v = (u32)(Z.lo);
255        PUTU32(p + 12, v);
256# endif
257    } else {
258        Xi[0] = Z.hi;
259        Xi[1] = Z.lo;
260    }
261}
262
/*
 * Multiply one 128-bit field of the context by the hash key, in place.
 * The second macro parameter is substituted as the member name, so
 * GCM_MUL(ctx, Yi) operates on ctx->Yi and GCM_MUL(ctx, Xi) on ctx->Xi.
 * ctx is parenthesized so that any pointer expression may be passed.
 */
# define GCM_MUL(ctx,Xi)   gcm_gmult_8bit((ctx)->Xi.u,(ctx)->Htable)
264
265#elif   TABLE_BITS==4
266
267static void gcm_init_4bit(u128 Htable[16], u64 H[2])
268{
269    u128 V;
270# if defined(OPENSSL_SMALL_FOOTPRINT)
271    int i;
272# endif
273
274    Htable[0].hi = 0;
275    Htable[0].lo = 0;
276    V.hi = H[0];
277    V.lo = H[1];
278
279# if defined(OPENSSL_SMALL_FOOTPRINT)
280    for (Htable[8] = V, i = 4; i > 0; i >>= 1) {
281        REDUCE1BIT(V);
282        Htable[i] = V;
283    }
284
285    for (i = 2; i < 16; i <<= 1) {
286        u128 *Hi = Htable + i;
287        int j;
288        for (V = *Hi, j = 1; j < i; ++j) {
289            Hi[j].hi = V.hi ^ Htable[j].hi;
290            Hi[j].lo = V.lo ^ Htable[j].lo;
291        }
292    }
293# else
294    Htable[8] = V;
295    REDUCE1BIT(V);
296    Htable[4] = V;
297    REDUCE1BIT(V);
298    Htable[2] = V;
299    REDUCE1BIT(V);
300    Htable[1] = V;
301    Htable[3].hi = V.hi ^ Htable[2].hi, Htable[3].lo = V.lo ^ Htable[2].lo;
302    V = Htable[4];
303    Htable[5].hi = V.hi ^ Htable[1].hi, Htable[5].lo = V.lo ^ Htable[1].lo;
304    Htable[6].hi = V.hi ^ Htable[2].hi, Htable[6].lo = V.lo ^ Htable[2].lo;
305    Htable[7].hi = V.hi ^ Htable[3].hi, Htable[7].lo = V.lo ^ Htable[3].lo;
306    V = Htable[8];
307    Htable[9].hi = V.hi ^ Htable[1].hi, Htable[9].lo = V.lo ^ Htable[1].lo;
308    Htable[10].hi = V.hi ^ Htable[2].hi, Htable[10].lo = V.lo ^ Htable[2].lo;
309    Htable[11].hi = V.hi ^ Htable[3].hi, Htable[11].lo = V.lo ^ Htable[3].lo;
310    Htable[12].hi = V.hi ^ Htable[4].hi, Htable[12].lo = V.lo ^ Htable[4].lo;
311    Htable[13].hi = V.hi ^ Htable[5].hi, Htable[13].lo = V.lo ^ Htable[5].lo;
312    Htable[14].hi = V.hi ^ Htable[6].hi, Htable[14].lo = V.lo ^ Htable[6].lo;
313    Htable[15].hi = V.hi ^ Htable[7].hi, Htable[15].lo = V.lo ^ Htable[7].lo;
314# endif
315# if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
316    /*
317     * ARM assembler expects specific dword order in Htable.
318     */
319    {
320        int j;
321        const union {
322            long one;
323            char little;
324        } is_endian = {
325            1
326        };
327
328        if (is_endian.little)
329            for (j = 0; j < 16; ++j) {
330                V = Htable[j];
331                Htable[j].hi = V.lo;
332                Htable[j].lo = V.hi;
333        } else
334            for (j = 0; j < 16; ++j) {
335                V = Htable[j];
336                Htable[j].hi = V.lo << 32 | V.lo >> 32;
337                Htable[j].lo = V.hi << 32 | V.hi >> 32;
338            }
339    }
340# endif
341}
342
343# ifndef GHASH_ASM
344static const size_t rem_4bit[16] = {
345    PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
346    PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
347    PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
348    PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0)
349};
350
351static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
352{
353    u128 Z;
354    int cnt = 15;
355    size_t rem, nlo, nhi;
356    const union {
357        long one;
358        char little;
359    } is_endian = {
360        1
361    };
362
363    nlo = ((const u8 *)Xi)[15];
364    nhi = nlo >> 4;
365    nlo &= 0xf;
366
367    Z.hi = Htable[nlo].hi;
368    Z.lo = Htable[nlo].lo;
369
370    while (1) {
371        rem = (size_t)Z.lo & 0xf;
372        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
373        Z.hi = (Z.hi >> 4);
374        if (sizeof(size_t) == 8)
375            Z.hi ^= rem_4bit[rem];
376        else
377            Z.hi ^= (u64)rem_4bit[rem] << 32;
378
379        Z.hi ^= Htable[nhi].hi;
380        Z.lo ^= Htable[nhi].lo;
381
382        if (--cnt < 0)
383            break;
384
385        nlo = ((const u8 *)Xi)[cnt];
386        nhi = nlo >> 4;
387        nlo &= 0xf;
388
389        rem = (size_t)Z.lo & 0xf;
390        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
391        Z.hi = (Z.hi >> 4);
392        if (sizeof(size_t) == 8)
393            Z.hi ^= rem_4bit[rem];
394        else
395            Z.hi ^= (u64)rem_4bit[rem] << 32;
396
397        Z.hi ^= Htable[nlo].hi;
398        Z.lo ^= Htable[nlo].lo;
399    }
400
401    if (is_endian.little) {
402#  ifdef BSWAP8
403        Xi[0] = BSWAP8(Z.hi);
404        Xi[1] = BSWAP8(Z.lo);
405#  else
406        u8 *p = (u8 *)Xi;
407        u32 v;
408        v = (u32)(Z.hi >> 32);
409        PUTU32(p, v);
410        v = (u32)(Z.hi);
411        PUTU32(p + 4, v);
412        v = (u32)(Z.lo >> 32);
413        PUTU32(p + 8, v);
414        v = (u32)(Z.lo);
415        PUTU32(p + 12, v);
416#  endif
417    } else {
418        Xi[0] = Z.hi;
419        Xi[1] = Z.lo;
420    }
421}
422
423#  if !defined(OPENSSL_SMALL_FOOTPRINT)
424/*
425 * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for
426 * details... Compiler-generated code doesn't seem to give any
427 * performance improvement, at least not on x86[_64]. It's here
428 * mostly as reference and a placeholder for possible future
429 * non-trivial optimization[s]...
430 */
431static void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
432                           const u8 *inp, size_t len)
433{
434    u128 Z;
435    int cnt;
436    size_t rem, nlo, nhi;
437    const union {
438        long one;
439        char little;
440    } is_endian = {
441        1
442    };
443
444#   if 1
445    do {
446        cnt = 15;
447        nlo = ((const u8 *)Xi)[15];
448        nlo ^= inp[15];
449        nhi = nlo >> 4;
450        nlo &= 0xf;
451
452        Z.hi = Htable[nlo].hi;
453        Z.lo = Htable[nlo].lo;
454
455        while (1) {
456            rem = (size_t)Z.lo & 0xf;
457            Z.lo = (Z.hi << 60) | (Z.lo >> 4);
458            Z.hi = (Z.hi >> 4);
459            if (sizeof(size_t) == 8)
460                Z.hi ^= rem_4bit[rem];
461            else
462                Z.hi ^= (u64)rem_4bit[rem] << 32;
463
464            Z.hi ^= Htable[nhi].hi;
465            Z.lo ^= Htable[nhi].lo;
466
467            if (--cnt < 0)
468                break;
469
470            nlo = ((const u8 *)Xi)[cnt];
471            nlo ^= inp[cnt];
472            nhi = nlo >> 4;
473            nlo &= 0xf;
474
475            rem = (size_t)Z.lo & 0xf;
476            Z.lo = (Z.hi << 60) | (Z.lo >> 4);
477            Z.hi = (Z.hi >> 4);
478            if (sizeof(size_t) == 8)
479                Z.hi ^= rem_4bit[rem];
480            else
481                Z.hi ^= (u64)rem_4bit[rem] << 32;
482
483            Z.hi ^= Htable[nlo].hi;
484            Z.lo ^= Htable[nlo].lo;
485        }
486#   else
487    /*
488     * Extra 256+16 bytes per-key plus 512 bytes shared tables
489     * [should] give ~50% improvement... One could have PACK()-ed
490     * the rem_8bit even here, but the priority is to minimize
491     * cache footprint...
492     */
493    u128 Hshr4[16];             /* Htable shifted right by 4 bits */
494    u8 Hshl4[16];               /* Htable shifted left by 4 bits */
495    static const unsigned short rem_8bit[256] = {
496        0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
497        0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
498        0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
499        0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
500        0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
501        0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
502        0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
503        0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
504        0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
505        0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
506        0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
507        0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
508        0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
509        0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
510        0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
511        0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
512        0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
513        0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
514        0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
515        0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
516        0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
517        0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
518        0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
519        0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
520        0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
521        0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
522        0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
523        0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
524        0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
525        0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
526        0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
527        0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE
528    };
529    /*
530     * This pre-processing phase slows down procedure by approximately
531     * same time as it makes each loop spin faster. In other words
532     * single block performance is approximately same as straightforward
533     * "4-bit" implementation, and then it goes only faster...
534     */
535    for (cnt = 0; cnt < 16; ++cnt) {
536        Z.hi = Htable[cnt].hi;
537        Z.lo = Htable[cnt].lo;
538        Hshr4[cnt].lo = (Z.hi << 60) | (Z.lo >> 4);
539        Hshr4[cnt].hi = (Z.hi >> 4);
540        Hshl4[cnt] = (u8)(Z.lo << 4);
541    }
542
543    do {
544        for (Z.lo = 0, Z.hi = 0, cnt = 15; cnt; --cnt) {
545            nlo = ((const u8 *)Xi)[cnt];
546            nlo ^= inp[cnt];
547            nhi = nlo >> 4;
548            nlo &= 0xf;
549
550            Z.hi ^= Htable[nlo].hi;
551            Z.lo ^= Htable[nlo].lo;
552
553            rem = (size_t)Z.lo & 0xff;
554
555            Z.lo = (Z.hi << 56) | (Z.lo >> 8);
556            Z.hi = (Z.hi >> 8);
557
558            Z.hi ^= Hshr4[nhi].hi;
559            Z.lo ^= Hshr4[nhi].lo;
560            Z.hi ^= (u64)rem_8bit[rem ^ Hshl4[nhi]] << 48;
561        }
562
563        nlo = ((const u8 *)Xi)[0];
564        nlo ^= inp[0];
565        nhi = nlo >> 4;
566        nlo &= 0xf;
567
568        Z.hi ^= Htable[nlo].hi;
569        Z.lo ^= Htable[nlo].lo;
570
571        rem = (size_t)Z.lo & 0xf;
572
573        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
574        Z.hi = (Z.hi >> 4);
575
576        Z.hi ^= Htable[nhi].hi;
577        Z.lo ^= Htable[nhi].lo;
578        Z.hi ^= ((u64)rem_8bit[rem << 4]) << 48;
579#   endif
580
581        if (is_endian.little) {
582#   ifdef BSWAP8
583            Xi[0] = BSWAP8(Z.hi);
584            Xi[1] = BSWAP8(Z.lo);
585#   else
586            u8 *p = (u8 *)Xi;
587            u32 v;
588            v = (u32)(Z.hi >> 32);
589            PUTU32(p, v);
590            v = (u32)(Z.hi);
591            PUTU32(p + 4, v);
592            v = (u32)(Z.lo >> 32);
593            PUTU32(p + 8, v);
594            v = (u32)(Z.lo);
595            PUTU32(p + 12, v);
596#   endif
597        } else {
598            Xi[0] = Z.hi;
599            Xi[1] = Z.lo;
600        }
601    } while (inp += 16, len -= 16);
602}
603#  endif
604# else
605void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]);
606void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16], const u8 *inp,
607                    size_t len);
608# endif
609
/*
 * Multiply one 128-bit field of the context by the hash key, in place.
 * The second macro parameter is substituted as the member name, so
 * GCM_MUL(ctx, Yi) operates on ctx->Yi and GCM_MUL(ctx, Xi) on ctx->Xi.
 * ctx is parenthesized, as in GHASH below, so that any pointer expression
 * may be passed.
 */
# define GCM_MUL(ctx,Xi)   gcm_gmult_4bit((ctx)->Xi.u,(ctx)->Htable)
# if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
/* Hash len bytes starting at in into ctx->Xi with the streamed routine. */
#  define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
/*
 * GHASH_CHUNK is a "stride parameter" meant to mitigate the cache thrashing
 * effect. In other words, the idea is to hash data while it is still in L1
 * cache after the encryption pass...
 */
#  define GHASH_CHUNK       (3*1024)
# endif
620
621#else                           /* TABLE_BITS */
622
623static void gcm_gmult_1bit(u64 Xi[2], const u64 H[2])
624{
625    u128 V, Z = { 0, 0 };
626    long X;
627    int i, j;
628    const long *xi = (const long *)Xi;
629    const union {
630        long one;
631        char little;
632    } is_endian = {
633        1
634    };
635
636    V.hi = H[0];                /* H is in host byte order, no byte swapping */
637    V.lo = H[1];
638
639    for (j = 0; j < 16 / sizeof(long); ++j) {
640        if (is_endian.little) {
641            if (sizeof(long) == 8) {
642# ifdef BSWAP8
643                X = (long)(BSWAP8(xi[j]));
644# else
645                const u8 *p = (const u8 *)(xi + j);
646                X = (long)((u64)GETU32(p) << 32 | GETU32(p + 4));
647# endif
648            } else {
649                const u8 *p = (const u8 *)(xi + j);
650                X = (long)GETU32(p);
651            }
652        } else
653            X = xi[j];
654
655        for (i = 0; i < 8 * sizeof(long); ++i, X <<= 1) {
656            u64 M = (u64)(X >> (8 * sizeof(long) - 1));
657            Z.hi ^= V.hi & M;
658            Z.lo ^= V.lo & M;
659
660            REDUCE1BIT(V);
661        }
662    }
663
664    if (is_endian.little) {
665# ifdef BSWAP8
666        Xi[0] = BSWAP8(Z.hi);
667        Xi[1] = BSWAP8(Z.lo);
668# else
669        u8 *p = (u8 *)Xi;
670        u32 v;
671        v = (u32)(Z.hi >> 32);
672        PUTU32(p, v);
673        v = (u32)(Z.hi);
674        PUTU32(p + 4, v);
675        v = (u32)(Z.lo >> 32);
676        PUTU32(p + 8, v);
677        v = (u32)(Z.lo);
678        PUTU32(p + 12, v);
679# endif
680    } else {
681        Xi[0] = Z.hi;
682        Xi[1] = Z.lo;
683    }
684}
685
/*
 * Multiply one 128-bit field of the context by the hash key, in place.
 * The second macro parameter is substituted as the member name, so
 * GCM_MUL(ctx, Yi) operates on ctx->Yi and GCM_MUL(ctx, Xi) on ctx->Xi.
 * ctx is parenthesized so that any pointer expression may be passed.
 */
# define GCM_MUL(ctx,Xi)   gcm_gmult_1bit((ctx)->Xi.u,(ctx)->H.u)
687
688#endif
689
/*
 * Declarations for the externally provided GHASH routines available when
 * the 4-bit code is built with GHASH_ASM.  Whenever one of these back ends
 * is enabled, GCM_FUNCREF_4BIT is defined as well.
 */
#if     TABLE_BITS==4 && defined(GHASH_ASM)
# if    !defined(I386_ONLY) && \
        (defined(__i386)   || defined(__i386__)   || \
         defined(__x86_64) || defined(__x86_64__) || \
         defined(_M_IX86)  || defined(_M_AMD64)   || defined(_M_X64))
/* x86 and x86_64 */
#  define GHASH_ASM_X86_OR_64
#  define GCM_FUNCREF_4BIT
extern unsigned int OPENSSL_ia32cap_P[2];

/* "clmul" variants */
void gcm_init_clmul(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_clmul(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_clmul(u64 Xi[2], const u128 Htable[16],
                     const u8 *inp, size_t len);

#  if   defined(__i386) || defined(__i386__) || defined(_M_IX86)
/* 32-bit x86 only: "mmx" and plain "x86" variants */
#   define GHASH_ASM_X86
void gcm_gmult_4bit_mmx(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit_mmx(u64 Xi[2], const u128 Htable[16],
                        const u8 *inp, size_t len);

void gcm_gmult_4bit_x86(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit_x86(u64 Xi[2], const u128 Htable[16],
                        const u8 *inp, size_t len);
#  endif
# elif defined(__arm__) || defined(__arm)
#  include "arm_arch.h"
#  if __ARM_ARCH__>=7
/* ARMv7 and later: "neon" variants */
#   define GHASH_ASM_ARM
#   define GCM_FUNCREF_4BIT
void gcm_gmult_neon(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_neon(u64 Xi[2], const u128 Htable[16],
                    const u8 *inp, size_t len);
#  endif
# endif
#endif
725
#ifdef GCM_FUNCREF_4BIT
/*
 * Dispatch GCM_MUL and GHASH through the function pointers gcm_gmult_p and
 * gcm_ghash_p.  These are not globals: every function using the macros
 * must declare them locally and load them from the context first.
 * ctx is parenthesized so that any pointer expression may be passed.
 */
# undef  GCM_MUL
# define GCM_MUL(ctx,Xi)        (*gcm_gmult_p)((ctx)->Xi.u,(ctx)->Htable)
# ifdef GHASH
#  undef  GHASH
#  define GHASH(ctx,in,len)     (*gcm_ghash_p)((ctx)->Xi.u,(ctx)->Htable,in,len)
# endif
#endif
734
735void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, void *key, block128_f block)
736{
737    const union {
738        long one;
739        char little;
740    } is_endian = {
741        1
742    };
743
744    memset(ctx, 0, sizeof(*ctx));
745    ctx->block = block;
746    ctx->key = key;
747
748    (*block) (ctx->H.c, ctx->H.c, key);
749
750    if (is_endian.little) {
751        /* H is stored in host byte order */
752#ifdef BSWAP8
753        ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
754        ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
755#else
756        u8 *p = ctx->H.c;
757        u64 hi, lo;
758        hi = (u64)GETU32(p) << 32 | GETU32(p + 4);
759        lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
760        ctx->H.u[0] = hi;
761        ctx->H.u[1] = lo;
762#endif
763    }
764#if     TABLE_BITS==8
765    gcm_init_8bit(ctx->Htable, ctx->H.u);
766#elif   TABLE_BITS==4
767# if    defined(GHASH_ASM_X86_OR_64)
768#  if   !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
769    if (OPENSSL_ia32cap_P[0] & (1 << 24) && /* check FXSR bit */
770        OPENSSL_ia32cap_P[1] & (1 << 1)) { /* check PCLMULQDQ bit */
771        gcm_init_clmul(ctx->Htable, ctx->H.u);
772        ctx->gmult = gcm_gmult_clmul;
773        ctx->ghash = gcm_ghash_clmul;
774        return;
775    }
776#  endif
777    gcm_init_4bit(ctx->Htable, ctx->H.u);
778#  if   defined(GHASH_ASM_X86)  /* x86 only */
779#   if  defined(OPENSSL_IA32_SSE2)
780    if (OPENSSL_ia32cap_P[0] & (1 << 25)) { /* check SSE bit */
781#   else
782    if (OPENSSL_ia32cap_P[0] & (1 << 23)) { /* check MMX bit */
783#   endif
784        ctx->gmult = gcm_gmult_4bit_mmx;
785        ctx->ghash = gcm_ghash_4bit_mmx;
786    } else {
787        ctx->gmult = gcm_gmult_4bit_x86;
788        ctx->ghash = gcm_ghash_4bit_x86;
789    }
790#  else
791    ctx->gmult = gcm_gmult_4bit;
792    ctx->ghash = gcm_ghash_4bit;
793#  endif
794# elif  defined(GHASH_ASM_ARM)
795    if (OPENSSL_armcap_P & ARMV7_NEON) {
796        ctx->gmult = gcm_gmult_neon;
797        ctx->ghash = gcm_ghash_neon;
798    } else {
799        gcm_init_4bit(ctx->Htable, ctx->H.u);
800        ctx->gmult = gcm_gmult_4bit;
801        ctx->ghash = gcm_ghash_4bit;
802    }
803# else
804    gcm_init_4bit(ctx->Htable, ctx->H.u);
805# endif
806#endif
807}
808
809void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const unsigned char *iv,
810                         size_t len)
811{
812    const union {
813        long one;
814        char little;
815    } is_endian = {
816        1
817    };
818    unsigned int ctr;
819#ifdef GCM_FUNCREF_4BIT
820    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
821#endif
822
823    ctx->Yi.u[0] = 0;
824    ctx->Yi.u[1] = 0;
825    ctx->Xi.u[0] = 0;
826    ctx->Xi.u[1] = 0;
827    ctx->len.u[0] = 0;          /* AAD length */
828    ctx->len.u[1] = 0;          /* message length */
829    ctx->ares = 0;
830    ctx->mres = 0;
831
832    if (len == 12) {
833        memcpy(ctx->Yi.c, iv, 12);
834        ctx->Yi.c[15] = 1;
835        ctr = 1;
836    } else {
837        size_t i;
838        u64 len0 = len;
839
840        while (len >= 16) {
841            for (i = 0; i < 16; ++i)
842                ctx->Yi.c[i] ^= iv[i];
843            GCM_MUL(ctx, Yi);
844            iv += 16;
845            len -= 16;
846        }
847        if (len) {
848            for (i = 0; i < len; ++i)
849                ctx->Yi.c[i] ^= iv[i];
850            GCM_MUL(ctx, Yi);
851        }
852        len0 <<= 3;
853        if (is_endian.little) {
854#ifdef BSWAP8
855            ctx->Yi.u[1] ^= BSWAP8(len0);
856#else
857            ctx->Yi.c[8] ^= (u8)(len0 >> 56);
858            ctx->Yi.c[9] ^= (u8)(len0 >> 48);
859            ctx->Yi.c[10] ^= (u8)(len0 >> 40);
860            ctx->Yi.c[11] ^= (u8)(len0 >> 32);
861            ctx->Yi.c[12] ^= (u8)(len0 >> 24);
862            ctx->Yi.c[13] ^= (u8)(len0 >> 16);
863            ctx->Yi.c[14] ^= (u8)(len0 >> 8);
864            ctx->Yi.c[15] ^= (u8)(len0);
865#endif
866        } else
867            ctx->Yi.u[1] ^= len0;
868
869        GCM_MUL(ctx, Yi);
870
871        if (is_endian.little)
872#ifdef BSWAP4
873            ctr = BSWAP4(ctx->Yi.d[3]);
874#else
875            ctr = GETU32(ctx->Yi.c + 12);
876#endif
877        else
878            ctr = ctx->Yi.d[3];
879    }
880
881    (*ctx->block) (ctx->Yi.c, ctx->EK0.c, ctx->key);
882    ++ctr;
883    if (is_endian.little)
884#ifdef BSWAP4
885        ctx->Yi.d[3] = BSWAP4(ctr);
886#else
887        PUTU32(ctx->Yi.c + 12, ctr);
888#endif
889    else
890        ctx->Yi.d[3] = ctr;
891}
892
893int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const unsigned char *aad,
894                      size_t len)
895{
896    size_t i;
897    unsigned int n;
898    u64 alen = ctx->len.u[0];
899#ifdef GCM_FUNCREF_4BIT
900    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
901# ifdef GHASH
902    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
903                         const u8 *inp, size_t len) = ctx->ghash;
904# endif
905#endif
906
907    if (ctx->len.u[1])
908        return -2;
909
910    alen += len;
911    if (alen > (U64(1) << 61) || (sizeof(len) == 8 && alen < len))
912        return -1;
913    ctx->len.u[0] = alen;
914
915    n = ctx->ares;
916    if (n) {
917        while (n && len) {
918            ctx->Xi.c[n] ^= *(aad++);
919            --len;
920            n = (n + 1) % 16;
921        }
922        if (n == 0)
923            GCM_MUL(ctx, Xi);
924        else {
925            ctx->ares = n;
926            return 0;
927        }
928    }
929#ifdef GHASH
930    if ((i = (len & (size_t)-16))) {
931        GHASH(ctx, aad, i);
932        aad += i;
933        len -= i;
934    }
935#else
936    while (len >= 16) {
937        for (i = 0; i < 16; ++i)
938            ctx->Xi.c[i] ^= aad[i];
939        GCM_MUL(ctx, Xi);
940        aad += 16;
941        len -= 16;
942    }
943#endif
944    if (len) {
945        n = (unsigned int)len;
946        for (i = 0; i < len; ++i)
947            ctx->Xi.c[i] ^= aad[i];
948    }
949
950    ctx->ares = n;
951    return 0;
952}
953
954int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
955                          const unsigned char *in, unsigned char *out,
956                          size_t len)
957{
958    const union {
959        long one;
960        char little;
961    } is_endian = {
962        1
963    };
964    unsigned int n, ctr;
965    size_t i;
966    u64 mlen = ctx->len.u[1];
967    block128_f block = ctx->block;
968    void *key = ctx->key;
969#ifdef GCM_FUNCREF_4BIT
970    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
971# ifdef GHASH
972    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
973                         const u8 *inp, size_t len) = ctx->ghash;
974# endif
975#endif
976
977#if 0
978    n = (unsigned int)mlen % 16; /* alternative to ctx->mres */
979#endif
980    mlen += len;
981    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
982        return -1;
983    ctx->len.u[1] = mlen;
984
985    if (ctx->ares) {
986        /* First call to encrypt finalizes GHASH(AAD) */
987        GCM_MUL(ctx, Xi);
988        ctx->ares = 0;
989    }
990
991    if (is_endian.little)
992#ifdef BSWAP4
993        ctr = BSWAP4(ctx->Yi.d[3]);
994#else
995        ctr = GETU32(ctx->Yi.c + 12);
996#endif
997    else
998        ctr = ctx->Yi.d[3];
999
1000    n = ctx->mres;
1001#if !defined(OPENSSL_SMALL_FOOTPRINT)
1002    if (16 % sizeof(size_t) == 0) { /* always true actually */
1003        do {
1004            if (n) {
1005                while (n && len) {
1006                    ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
1007                    --len;
1008                    n = (n + 1) % 16;
1009                }
1010                if (n == 0)
1011                    GCM_MUL(ctx, Xi);
1012                else {
1013                    ctx->mres = n;
1014                    return 0;
1015                }
1016            }
1017# if defined(STRICT_ALIGNMENT)
1018            if (((size_t)in | (size_t)out) % sizeof(size_t) != 0)
1019                break;
1020# endif
1021# if defined(GHASH) && defined(GHASH_CHUNK)
1022            while (len >= GHASH_CHUNK) {
1023                size_t j = GHASH_CHUNK;
1024
1025                while (j) {
1026                    size_t *out_t = (size_t *)out;
1027                    const size_t *in_t = (const size_t *)in;
1028
1029                    (*block) (ctx->Yi.c, ctx->EKi.c, key);
1030                    ++ctr;
1031                    if (is_endian.little)
1032#  ifdef BSWAP4
1033                        ctx->Yi.d[3] = BSWAP4(ctr);
1034#  else
1035                        PUTU32(ctx->Yi.c + 12, ctr);
1036#  endif
1037                    else
1038                        ctx->Yi.d[3] = ctr;
1039                    for (i = 0; i < 16 / sizeof(size_t); ++i)
1040                        out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1041                    out += 16;
1042                    in += 16;
1043                    j -= 16;
1044                }
1045                GHASH(ctx, out - GHASH_CHUNK, GHASH_CHUNK);
1046                len -= GHASH_CHUNK;
1047            }
1048            if ((i = (len & (size_t)-16))) {
1049                size_t j = i;
1050
1051                while (len >= 16) {
1052                    size_t *out_t = (size_t *)out;
1053                    const size_t *in_t = (const size_t *)in;
1054
1055                    (*block) (ctx->Yi.c, ctx->EKi.c, key);
1056                    ++ctr;
1057                    if (is_endian.little)
1058#  ifdef BSWAP4
1059                        ctx->Yi.d[3] = BSWAP4(ctr);
1060#  else
1061                        PUTU32(ctx->Yi.c + 12, ctr);
1062#  endif
1063                    else
1064                        ctx->Yi.d[3] = ctr;
1065                    for (i = 0; i < 16 / sizeof(size_t); ++i)
1066                        out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1067                    out += 16;
1068                    in += 16;
1069                    len -= 16;
1070                }
1071                GHASH(ctx, out - j, j);
1072            }
1073# else
1074            while (len >= 16) {
1075                size_t *out_t = (size_t *)out;
1076                const size_t *in_t = (const size_t *)in;
1077
1078                (*block) (ctx->Yi.c, ctx->EKi.c, key);
1079                ++ctr;
1080                if (is_endian.little)
1081#  ifdef BSWAP4
1082                    ctx->Yi.d[3] = BSWAP4(ctr);
1083#  else
1084                    PUTU32(ctx->Yi.c + 12, ctr);
1085#  endif
1086                else
1087                    ctx->Yi.d[3] = ctr;
1088                for (i = 0; i < 16 / sizeof(size_t); ++i)
1089                    ctx->Xi.t[i] ^= out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1090                GCM_MUL(ctx, Xi);
1091                out += 16;
1092                in += 16;
1093                len -= 16;
1094            }
1095# endif
1096            if (len) {
1097                (*block) (ctx->Yi.c, ctx->EKi.c, key);
1098                ++ctr;
1099                if (is_endian.little)
1100# ifdef BSWAP4
1101                    ctx->Yi.d[3] = BSWAP4(ctr);
1102# else
1103                    PUTU32(ctx->Yi.c + 12, ctr);
1104# endif
1105                else
1106                    ctx->Yi.d[3] = ctr;
1107                while (len--) {
1108                    ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n];
1109                    ++n;
1110                }
1111            }
1112
1113            ctx->mres = n;
1114            return 0;
1115        } while (0);
1116    }
1117#endif
1118    for (i = 0; i < len; ++i) {
1119        if (n == 0) {
1120            (*block) (ctx->Yi.c, ctx->EKi.c, key);
1121            ++ctr;
1122            if (is_endian.little)
1123#ifdef BSWAP4
1124                ctx->Yi.d[3] = BSWAP4(ctr);
1125#else
1126                PUTU32(ctx->Yi.c + 12, ctr);
1127#endif
1128            else
1129                ctx->Yi.d[3] = ctr;
1130        }
1131        ctx->Xi.c[n] ^= out[i] = in[i] ^ ctx->EKi.c[n];
1132        n = (n + 1) % 16;
1133        if (n == 0)
1134            GCM_MUL(ctx, Xi);
1135    }
1136
1137    ctx->mres = n;
1138    return 0;
1139}
1140
1141int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
1142                          const unsigned char *in, unsigned char *out,
1143                          size_t len)
1144{
1145    const union {
1146        long one;
1147        char little;
1148    } is_endian = {
1149        1
1150    };
1151    unsigned int n, ctr;
1152    size_t i;
1153    u64 mlen = ctx->len.u[1];
1154    block128_f block = ctx->block;
1155    void *key = ctx->key;
1156#ifdef GCM_FUNCREF_4BIT
1157    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
1158# ifdef GHASH
1159    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
1160                         const u8 *inp, size_t len) = ctx->ghash;
1161# endif
1162#endif
1163
1164    mlen += len;
1165    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
1166        return -1;
1167    ctx->len.u[1] = mlen;
1168
1169    if (ctx->ares) {
1170        /* First call to decrypt finalizes GHASH(AAD) */
1171        GCM_MUL(ctx, Xi);
1172        ctx->ares = 0;
1173    }
1174
1175    if (is_endian.little)
1176#ifdef BSWAP4
1177        ctr = BSWAP4(ctx->Yi.d[3]);
1178#else
1179        ctr = GETU32(ctx->Yi.c + 12);
1180#endif
1181    else
1182        ctr = ctx->Yi.d[3];
1183
1184    n = ctx->mres;
1185#if !defined(OPENSSL_SMALL_FOOTPRINT)
1186    if (16 % sizeof(size_t) == 0) { /* always true actually */
1187        do {
1188            if (n) {
1189                while (n && len) {
1190                    u8 c = *(in++);
1191                    *(out++) = c ^ ctx->EKi.c[n];
1192                    ctx->Xi.c[n] ^= c;
1193                    --len;
1194                    n = (n + 1) % 16;
1195                }
1196                if (n == 0)
1197                    GCM_MUL(ctx, Xi);
1198                else {
1199                    ctx->mres = n;
1200                    return 0;
1201                }
1202            }
1203# if defined(STRICT_ALIGNMENT)
1204            if (((size_t)in | (size_t)out) % sizeof(size_t) != 0)
1205                break;
1206# endif
1207# if defined(GHASH) && defined(GHASH_CHUNK)
1208            while (len >= GHASH_CHUNK) {
1209                size_t j = GHASH_CHUNK;
1210
1211                GHASH(ctx, in, GHASH_CHUNK);
1212                while (j) {
1213                    size_t *out_t = (size_t *)out;
1214                    const size_t *in_t = (const size_t *)in;
1215
1216                    (*block) (ctx->Yi.c, ctx->EKi.c, key);
1217                    ++ctr;
1218                    if (is_endian.little)
1219#  ifdef BSWAP4
1220                        ctx->Yi.d[3] = BSWAP4(ctr);
1221#  else
1222                        PUTU32(ctx->Yi.c + 12, ctr);
1223#  endif
1224                    else
1225                        ctx->Yi.d[3] = ctr;
1226                    for (i = 0; i < 16 / sizeof(size_t); ++i)
1227                        out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1228                    out += 16;
1229                    in += 16;
1230                    j -= 16;
1231                }
1232                len -= GHASH_CHUNK;
1233            }
1234            if ((i = (len & (size_t)-16))) {
1235                GHASH(ctx, in, i);
1236                while (len >= 16) {
1237                    size_t *out_t = (size_t *)out;
1238                    const size_t *in_t = (const size_t *)in;
1239
1240                    (*block) (ctx->Yi.c, ctx->EKi.c, key);
1241                    ++ctr;
1242                    if (is_endian.little)
1243#  ifdef BSWAP4
1244                        ctx->Yi.d[3] = BSWAP4(ctr);
1245#  else
1246                        PUTU32(ctx->Yi.c + 12, ctr);
1247#  endif
1248                    else
1249                        ctx->Yi.d[3] = ctr;
1250                    for (i = 0; i < 16 / sizeof(size_t); ++i)
1251                        out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1252                    out += 16;
1253                    in += 16;
1254                    len -= 16;
1255                }
1256            }
1257# else
1258            while (len >= 16) {
1259                size_t *out_t = (size_t *)out;
1260                const size_t *in_t = (const size_t *)in;
1261
1262                (*block) (ctx->Yi.c, ctx->EKi.c, key);
1263                ++ctr;
1264                if (is_endian.little)
1265#  ifdef BSWAP4
1266                    ctx->Yi.d[3] = BSWAP4(ctr);
1267#  else
1268                    PUTU32(ctx->Yi.c + 12, ctr);
1269#  endif
1270                else
1271                    ctx->Yi.d[3] = ctr;
1272                for (i = 0; i < 16 / sizeof(size_t); ++i) {
1273                    size_t c = in[i];
1274                    out[i] = c ^ ctx->EKi.t[i];
1275                    ctx->Xi.t[i] ^= c;
1276                }
1277                GCM_MUL(ctx, Xi);
1278                out += 16;
1279                in += 16;
1280                len -= 16;
1281            }
1282# endif
1283            if (len) {
1284                (*block) (ctx->Yi.c, ctx->EKi.c, key);
1285                ++ctr;
1286                if (is_endian.little)
1287# ifdef BSWAP4
1288                    ctx->Yi.d[3] = BSWAP4(ctr);
1289# else
1290                    PUTU32(ctx->Yi.c + 12, ctr);
1291# endif
1292                else
1293                    ctx->Yi.d[3] = ctr;
1294                while (len--) {
1295                    u8 c = in[n];
1296                    ctx->Xi.c[n] ^= c;
1297                    out[n] = c ^ ctx->EKi.c[n];
1298                    ++n;
1299                }
1300            }
1301
1302            ctx->mres = n;
1303            return 0;
1304        } while (0);
1305    }
1306#endif
1307    for (i = 0; i < len; ++i) {
1308        u8 c;
1309        if (n == 0) {
1310            (*block) (ctx->Yi.c, ctx->EKi.c, key);
1311            ++ctr;
1312            if (is_endian.little)
1313#ifdef BSWAP4
1314                ctx->Yi.d[3] = BSWAP4(ctr);
1315#else
1316                PUTU32(ctx->Yi.c + 12, ctr);
1317#endif
1318            else
1319                ctx->Yi.d[3] = ctr;
1320        }
1321        c = in[i];
1322        out[i] = c ^ ctx->EKi.c[n];
1323        ctx->Xi.c[n] ^= c;
1324        n = (n + 1) % 16;
1325        if (n == 0)
1326            GCM_MUL(ctx, Xi);
1327    }
1328
1329    ctx->mres = n;
1330    return 0;
1331}
1332
1333int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
1334                                const unsigned char *in, unsigned char *out,
1335                                size_t len, ctr128_f stream)
1336{
1337    const union {
1338        long one;
1339        char little;
1340    } is_endian = {
1341        1
1342    };
1343    unsigned int n, ctr;
1344    size_t i;
1345    u64 mlen = ctx->len.u[1];
1346    void *key = ctx->key;
1347#ifdef GCM_FUNCREF_4BIT
1348    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
1349# ifdef GHASH
1350    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
1351                         const u8 *inp, size_t len) = ctx->ghash;
1352# endif
1353#endif
1354
1355    mlen += len;
1356    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
1357        return -1;
1358    ctx->len.u[1] = mlen;
1359
1360    if (ctx->ares) {
1361        /* First call to encrypt finalizes GHASH(AAD) */
1362        GCM_MUL(ctx, Xi);
1363        ctx->ares = 0;
1364    }
1365
1366    if (is_endian.little)
1367#ifdef BSWAP4
1368        ctr = BSWAP4(ctx->Yi.d[3]);
1369#else
1370        ctr = GETU32(ctx->Yi.c + 12);
1371#endif
1372    else
1373        ctr = ctx->Yi.d[3];
1374
1375    n = ctx->mres;
1376    if (n) {
1377        while (n && len) {
1378            ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
1379            --len;
1380            n = (n + 1) % 16;
1381        }
1382        if (n == 0)
1383            GCM_MUL(ctx, Xi);
1384        else {
1385            ctx->mres = n;
1386            return 0;
1387        }
1388    }
1389#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1390    while (len >= GHASH_CHUNK) {
1391        (*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
1392        ctr += GHASH_CHUNK / 16;
1393        if (is_endian.little)
1394# ifdef BSWAP4
1395            ctx->Yi.d[3] = BSWAP4(ctr);
1396# else
1397            PUTU32(ctx->Yi.c + 12, ctr);
1398# endif
1399        else
1400            ctx->Yi.d[3] = ctr;
1401        GHASH(ctx, out, GHASH_CHUNK);
1402        out += GHASH_CHUNK;
1403        in += GHASH_CHUNK;
1404        len -= GHASH_CHUNK;
1405    }
1406#endif
1407    if ((i = (len & (size_t)-16))) {
1408        size_t j = i / 16;
1409
1410        (*stream) (in, out, j, key, ctx->Yi.c);
1411        ctr += (unsigned int)j;
1412        if (is_endian.little)
1413#ifdef BSWAP4
1414            ctx->Yi.d[3] = BSWAP4(ctr);
1415#else
1416            PUTU32(ctx->Yi.c + 12, ctr);
1417#endif
1418        else
1419            ctx->Yi.d[3] = ctr;
1420        in += i;
1421        len -= i;
1422#if defined(GHASH)
1423        GHASH(ctx, out, i);
1424        out += i;
1425#else
1426        while (j--) {
1427            for (i = 0; i < 16; ++i)
1428                ctx->Xi.c[i] ^= out[i];
1429            GCM_MUL(ctx, Xi);
1430            out += 16;
1431        }
1432#endif
1433    }
1434    if (len) {
1435        (*ctx->block) (ctx->Yi.c, ctx->EKi.c, key);
1436        ++ctr;
1437        if (is_endian.little)
1438#ifdef BSWAP4
1439            ctx->Yi.d[3] = BSWAP4(ctr);
1440#else
1441            PUTU32(ctx->Yi.c + 12, ctr);
1442#endif
1443        else
1444            ctx->Yi.d[3] = ctr;
1445        while (len--) {
1446            ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n];
1447            ++n;
1448        }
1449    }
1450
1451    ctx->mres = n;
1452    return 0;
1453}
1454
1455int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
1456                                const unsigned char *in, unsigned char *out,
1457                                size_t len, ctr128_f stream)
1458{
1459    const union {
1460        long one;
1461        char little;
1462    } is_endian = {
1463        1
1464    };
1465    unsigned int n, ctr;
1466    size_t i;
1467    u64 mlen = ctx->len.u[1];
1468    void *key = ctx->key;
1469#ifdef GCM_FUNCREF_4BIT
1470    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
1471# ifdef GHASH
1472    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
1473                         const u8 *inp, size_t len) = ctx->ghash;
1474# endif
1475#endif
1476
1477    mlen += len;
1478    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
1479        return -1;
1480    ctx->len.u[1] = mlen;
1481
1482    if (ctx->ares) {
1483        /* First call to decrypt finalizes GHASH(AAD) */
1484        GCM_MUL(ctx, Xi);
1485        ctx->ares = 0;
1486    }
1487
1488    if (is_endian.little)
1489#ifdef BSWAP4
1490        ctr = BSWAP4(ctx->Yi.d[3]);
1491#else
1492        ctr = GETU32(ctx->Yi.c + 12);
1493#endif
1494    else
1495        ctr = ctx->Yi.d[3];
1496
1497    n = ctx->mres;
1498    if (n) {
1499        while (n && len) {
1500            u8 c = *(in++);
1501            *(out++) = c ^ ctx->EKi.c[n];
1502            ctx->Xi.c[n] ^= c;
1503            --len;
1504            n = (n + 1) % 16;
1505        }
1506        if (n == 0)
1507            GCM_MUL(ctx, Xi);
1508        else {
1509            ctx->mres = n;
1510            return 0;
1511        }
1512    }
1513#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1514    while (len >= GHASH_CHUNK) {
1515        GHASH(ctx, in, GHASH_CHUNK);
1516        (*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
1517        ctr += GHASH_CHUNK / 16;
1518        if (is_endian.little)
1519# ifdef BSWAP4
1520            ctx->Yi.d[3] = BSWAP4(ctr);
1521# else
1522            PUTU32(ctx->Yi.c + 12, ctr);
1523# endif
1524        else
1525            ctx->Yi.d[3] = ctr;
1526        out += GHASH_CHUNK;
1527        in += GHASH_CHUNK;
1528        len -= GHASH_CHUNK;
1529    }
1530#endif
1531    if ((i = (len & (size_t)-16))) {
1532        size_t j = i / 16;
1533
1534#if defined(GHASH)
1535        GHASH(ctx, in, i);
1536#else
1537        while (j--) {
1538            size_t k;
1539            for (k = 0; k < 16; ++k)
1540                ctx->Xi.c[k] ^= in[k];
1541            GCM_MUL(ctx, Xi);
1542            in += 16;
1543        }
1544        j = i / 16;
1545        in -= i;
1546#endif
1547        (*stream) (in, out, j, key, ctx->Yi.c);
1548        ctr += (unsigned int)j;
1549        if (is_endian.little)
1550#ifdef BSWAP4
1551            ctx->Yi.d[3] = BSWAP4(ctr);
1552#else
1553            PUTU32(ctx->Yi.c + 12, ctr);
1554#endif
1555        else
1556            ctx->Yi.d[3] = ctr;
1557        out += i;
1558        in += i;
1559        len -= i;
1560    }
1561    if (len) {
1562        (*ctx->block) (ctx->Yi.c, ctx->EKi.c, key);
1563        ++ctr;
1564        if (is_endian.little)
1565#ifdef BSWAP4
1566            ctx->Yi.d[3] = BSWAP4(ctr);
1567#else
1568            PUTU32(ctx->Yi.c + 12, ctr);
1569#endif
1570        else
1571            ctx->Yi.d[3] = ctr;
1572        while (len--) {
1573            u8 c = in[n];
1574            ctx->Xi.c[n] ^= c;
1575            out[n] = c ^ ctx->EKi.c[n];
1576            ++n;
1577        }
1578    }
1579
1580    ctx->mres = n;
1581    return 0;
1582}
1583
1584int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const unsigned char *tag,
1585                         size_t len)
1586{
1587    const union {
1588        long one;
1589        char little;
1590    } is_endian = {
1591        1
1592    };
1593    u64 alen = ctx->len.u[0] << 3;
1594    u64 clen = ctx->len.u[1] << 3;
1595#ifdef GCM_FUNCREF_4BIT
1596    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
1597#endif
1598
1599    if (ctx->mres || ctx->ares)
1600        GCM_MUL(ctx, Xi);
1601
1602    if (is_endian.little) {
1603#ifdef BSWAP8
1604        alen = BSWAP8(alen);
1605        clen = BSWAP8(clen);
1606#else
1607        u8 *p = ctx->len.c;
1608
1609        ctx->len.u[0] = alen;
1610        ctx->len.u[1] = clen;
1611
1612        alen = (u64)GETU32(p) << 32 | GETU32(p + 4);
1613        clen = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
1614#endif
1615    }
1616
1617    ctx->Xi.u[0] ^= alen;
1618    ctx->Xi.u[1] ^= clen;
1619    GCM_MUL(ctx, Xi);
1620
1621    ctx->Xi.u[0] ^= ctx->EK0.u[0];
1622    ctx->Xi.u[1] ^= ctx->EK0.u[1];
1623
1624    if (tag && len <= sizeof(ctx->Xi))
1625        return CRYPTO_memcmp(ctx->Xi.c, tag, len);
1626    else
1627        return -1;
1628}
1629
1630void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1631{
1632    CRYPTO_gcm128_finish(ctx, NULL, 0);
1633    memcpy(tag, ctx->Xi.c,
1634           len <= sizeof(ctx->Xi.c) ? len : sizeof(ctx->Xi.c));
1635}
1636
1637GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
1638{
1639    GCM128_CONTEXT *ret;
1640
1641    if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT))))
1642        CRYPTO_gcm128_init(ret, key, block);
1643
1644    return ret;
1645}
1646
1647void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1648{
1649    if (ctx) {
1650        OPENSSL_cleanse(ctx, sizeof(*ctx));
1651        OPENSSL_free(ctx);
1652    }
1653}
1654
1655#if defined(SELFTEST)
1656# include <stdio.h>
1657# include <openssl/aes.h>
1658
1659/* Test Case 1 */
1660static const u8 K1[16], *P1 = NULL, *A1 = NULL, IV1[12], *C1 = NULL;
1661static const u8 T1[] = {
1662    0x58, 0xe2, 0xfc, 0xce, 0xfa, 0x7e, 0x30, 0x61,
1663    0x36, 0x7f, 0x1d, 0x57, 0xa4, 0xe7, 0x45, 0x5a
1664};
1665
1666/* Test Case 2 */
1667# define K2 K1
1668# define A2 A1
1669# define IV2 IV1
1670static const u8 P2[16];
1671static const u8 C2[] = {
1672    0x03, 0x88, 0xda, 0xce, 0x60, 0xb6, 0xa3, 0x92,
1673    0xf3, 0x28, 0xc2, 0xb9, 0x71, 0xb2, 0xfe, 0x78
1674};
1675
1676static const u8 T2[] = {
1677    0xab, 0x6e, 0x47, 0xd4, 0x2c, 0xec, 0x13, 0xbd,
1678    0xf5, 0x3a, 0x67, 0xb2, 0x12, 0x57, 0xbd, 0xdf
1679};
1680
1681/* Test Case 3 */
1682# define A3 A2
1683static const u8 K3[] = {
1684    0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c,
1685    0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08
1686};
1687
1688static const u8 P3[] = {
1689    0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5,
1690    0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a,
1691    0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda,
1692    0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72,
1693    0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53,
1694    0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25,
1695    0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57,
1696    0xba, 0x63, 0x7b, 0x39, 0x1a, 0xaf, 0xd2, 0x55
1697};
1698
1699static const u8 IV3[] = {
1700    0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad,
1701    0xde, 0xca, 0xf8, 0x88
1702};
1703
1704static const u8 C3[] = {
1705    0x42, 0x83, 0x1e, 0xc2, 0x21, 0x77, 0x74, 0x24,
1706    0x4b, 0x72, 0x21, 0xb7, 0x84, 0xd0, 0xd4, 0x9c,
1707    0xe3, 0xaa, 0x21, 0x2f, 0x2c, 0x02, 0xa4, 0xe0,
1708    0x35, 0xc1, 0x7e, 0x23, 0x29, 0xac, 0xa1, 0x2e,
1709    0x21, 0xd5, 0x14, 0xb2, 0x54, 0x66, 0x93, 0x1c,
1710    0x7d, 0x8f, 0x6a, 0x5a, 0xac, 0x84, 0xaa, 0x05,
1711    0x1b, 0xa3, 0x0b, 0x39, 0x6a, 0x0a, 0xac, 0x97,
1712    0x3d, 0x58, 0xe0, 0x91, 0x47, 0x3f, 0x59, 0x85
1713};
1714
1715static const u8 T3[] = {
1716    0x4d, 0x5c, 0x2a, 0xf3, 0x27, 0xcd, 0x64, 0xa6,
1717    0x2c, 0xf3, 0x5a, 0xbd, 0x2b, 0xa6, 0xfa, 0xb4
1718};
1719
1720/* Test Case 4 */
1721# define K4 K3
1722# define IV4 IV3
1723static const u8 P4[] = {
1724    0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5,
1725    0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a,
1726    0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda,
1727    0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72,
1728    0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53,
1729    0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25,
1730    0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57,
1731    0xba, 0x63, 0x7b, 0x39
1732};
1733
1734static const u8 A4[] = {
1735    0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef,
1736    0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef,
1737    0xab, 0xad, 0xda, 0xd2
1738};
1739
1740static const u8 C4[] = {
1741    0x42, 0x83, 0x1e, 0xc2, 0x21, 0x77, 0x74, 0x24,
1742    0x4b, 0x72, 0x21, 0xb7, 0x84, 0xd0, 0xd4, 0x9c,
1743    0xe3, 0xaa, 0x21, 0x2f, 0x2c, 0x02, 0xa4, 0xe0,
1744    0x35, 0xc1, 0x7e, 0x23, 0x29, 0xac, 0xa1, 0x2e,
1745    0x21, 0xd5, 0x14, 0xb2, 0x54, 0x66, 0x93, 0x1c,
1746    0x7d, 0x8f, 0x6a, 0x5a, 0xac, 0x84, 0xaa, 0x05,
1747    0x1b, 0xa3, 0x0b, 0x39, 0x6a, 0x0a, 0xac, 0x97,
1748    0x3d, 0x58, 0xe0, 0x91
1749};
1750
1751static const u8 T4[] = {
1752    0x5b, 0xc9, 0x4f, 0xbc, 0x32, 0x21, 0xa5, 0xdb,
1753    0x94, 0xfa, 0xe9, 0x5a, 0xe7, 0x12, 0x1a, 0x47
1754};
1755
1756/* Test Case 5 */
1757# define K5 K4
1758# define P5 P4
1759# define A5 A4
1760static const u8 IV5[] = {
1761    0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad
1762};
1763
1764static const u8 C5[] = {
1765    0x61, 0x35, 0x3b, 0x4c, 0x28, 0x06, 0x93, 0x4a,
1766    0x77, 0x7f, 0xf5, 0x1f, 0xa2, 0x2a, 0x47, 0x55,
1767    0x69, 0x9b, 0x2a, 0x71, 0x4f, 0xcd, 0xc6, 0xf8,
1768    0x37, 0x66, 0xe5, 0xf9, 0x7b, 0x6c, 0x74, 0x23,
1769    0x73, 0x80, 0x69, 0x00, 0xe4, 0x9f, 0x24, 0xb2,
1770    0x2b, 0x09, 0x75, 0x44, 0xd4, 0x89, 0x6b, 0x42,
1771    0x49, 0x89, 0xb5, 0xe1, 0xeb, 0xac, 0x0f, 0x07,
1772    0xc2, 0x3f, 0x45, 0x98
1773};
1774
1775static const u8 T5[] = {
1776    0x36, 0x12, 0xd2, 0xe7, 0x9e, 0x3b, 0x07, 0x85,
1777    0x56, 0x1b, 0xe1, 0x4a, 0xac, 0xa2, 0xfc, 0xcb
1778};
1779
1780/* Test Case 6 */
1781# define K6 K5
1782# define P6 P5
1783# define A6 A5
1784static const u8 IV6[] = {
1785    0x93, 0x13, 0x22, 0x5d, 0xf8, 0x84, 0x06, 0xe5,
1786    0x55, 0x90, 0x9c, 0x5a, 0xff, 0x52, 0x69, 0xaa,
1787    0x6a, 0x7a, 0x95, 0x38, 0x53, 0x4f, 0x7d, 0xa1,
1788    0xe4, 0xc3, 0x03, 0xd2, 0xa3, 0x18, 0xa7, 0x28,
1789    0xc3, 0xc0, 0xc9, 0x51, 0x56, 0x80, 0x95, 0x39,
1790    0xfc, 0xf0, 0xe2, 0x42, 0x9a, 0x6b, 0x52, 0x54,
1791    0x16, 0xae, 0xdb, 0xf5, 0xa0, 0xde, 0x6a, 0x57,
1792    0xa6, 0x37, 0xb3, 0x9b
1793};
1794
1795static const u8 C6[] = {
1796    0x8c, 0xe2, 0x49, 0x98, 0x62, 0x56, 0x15, 0xb6,
1797    0x03, 0xa0, 0x33, 0xac, 0xa1, 0x3f, 0xb8, 0x94,
1798    0xbe, 0x91, 0x12, 0xa5, 0xc3, 0xa2, 0x11, 0xa8,
1799    0xba, 0x26, 0x2a, 0x3c, 0xca, 0x7e, 0x2c, 0xa7,
1800    0x01, 0xe4, 0xa9, 0xa4, 0xfb, 0xa4, 0x3c, 0x90,
1801    0xcc, 0xdc, 0xb2, 0x81, 0xd4, 0x8c, 0x7c, 0x6f,
1802    0xd6, 0x28, 0x75, 0xd2, 0xac, 0xa4, 0x17, 0x03,
1803    0x4c, 0x34, 0xae, 0xe5
1804};
1805
1806static const u8 T6[] = {
1807    0x61, 0x9c, 0xc5, 0xae, 0xff, 0xfe, 0x0b, 0xfa,
1808    0x46, 0x2a, 0xf4, 0x3c, 0x16, 0x99, 0xd0, 0x50
1809};
1810
1811/* Test Case 7 */
1812static const u8 K7[24], *P7 = NULL, *A7 = NULL, IV7[12], *C7 = NULL;
1813static const u8 T7[] = {
1814    0xcd, 0x33, 0xb2, 0x8a, 0xc7, 0x73, 0xf7, 0x4b,
1815    0xa0, 0x0e, 0xd1, 0xf3, 0x12, 0x57, 0x24, 0x35
1816};
1817
1818/* Test Case 8 */
1819# define K8 K7
1820# define IV8 IV7
1821# define A8 A7
1822static const u8 P8[16];
1823static const u8 C8[] = {
1824    0x98, 0xe7, 0x24, 0x7c, 0x07, 0xf0, 0xfe, 0x41,
1825    0x1c, 0x26, 0x7e, 0x43, 0x84, 0xb0, 0xf6, 0x00
1826};
1827
1828static const u8 T8[] = {
1829    0x2f, 0xf5, 0x8d, 0x80, 0x03, 0x39, 0x27, 0xab,
1830    0x8e, 0xf4, 0xd4, 0x58, 0x75, 0x14, 0xf0, 0xfb
1831};
1832
1833/* Test Case 9 */
1834# define A9 A8
1835static const u8 K9[] = {
1836    0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c,
1837    0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08,
1838    0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c
1839};
1840
1841static const u8 P9[] = {
1842    0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5,
1843    0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a,
1844    0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda,
1845    0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72,
1846    0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53,
1847    0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25,
1848    0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57,
1849    0xba, 0x63, 0x7b, 0x39, 0x1a, 0xaf, 0xd2, 0x55
1850};
1851
1852static const u8 IV9[] = {
1853    0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad,
1854    0xde, 0xca, 0xf8, 0x88
1855};
1856
1857static const u8 C9[] = {
1858    0x39, 0x80, 0xca, 0x0b, 0x3c, 0x00, 0xe8, 0x41,
1859    0xeb, 0x06, 0xfa, 0xc4, 0x87, 0x2a, 0x27, 0x57,
1860    0x85, 0x9e, 0x1c, 0xea, 0xa6, 0xef, 0xd9, 0x84,
1861    0x62, 0x85, 0x93, 0xb4, 0x0c, 0xa1, 0xe1, 0x9c,
1862    0x7d, 0x77, 0x3d, 0x00, 0xc1, 0x44, 0xc5, 0x25,
1863    0xac, 0x61, 0x9d, 0x18, 0xc8, 0x4a, 0x3f, 0x47,
1864    0x18, 0xe2, 0x44, 0x8b, 0x2f, 0xe3, 0x24, 0xd9,
1865    0xcc, 0xda, 0x27, 0x10, 0xac, 0xad, 0xe2, 0x56
1866};
1867
1868static const u8 T9[] = {
1869    0x99, 0x24, 0xa7, 0xc8, 0x58, 0x73, 0x36, 0xbf,
1870    0xb1, 0x18, 0x02, 0x4d, 0xb8, 0x67, 0x4a, 0x14
1871};
1872
1873/* Test Case 10 */
1874# define K10 K9
1875# define IV10 IV9
1876static const u8 P10[] = {
1877    0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5,
1878    0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a,
1879    0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda,
1880    0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72,
1881    0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53,
1882    0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25,
1883    0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57,
1884    0xba, 0x63, 0x7b, 0x39
1885};
1886
1887static const u8 A10[] = {
1888    0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef,
1889    0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef,
1890    0xab, 0xad, 0xda, 0xd2
1891};
1892
1893static const u8 C10[] = {
1894    0x39, 0x80, 0xca, 0x0b, 0x3c, 0x00, 0xe8, 0x41,
1895    0xeb, 0x06, 0xfa, 0xc4, 0x87, 0x2a, 0x27, 0x57,
1896    0x85, 0x9e, 0x1c, 0xea, 0xa6, 0xef, 0xd9, 0x84,
1897    0x62, 0x85, 0x93, 0xb4, 0x0c, 0xa1, 0xe1, 0x9c,
1898    0x7d, 0x77, 0x3d, 0x00, 0xc1, 0x44, 0xc5, 0x25,
1899    0xac, 0x61, 0x9d, 0x18, 0xc8, 0x4a, 0x3f, 0x47,
1900    0x18, 0xe2, 0x44, 0x8b, 0x2f, 0xe3, 0x24, 0xd9,
1901    0xcc, 0xda, 0x27, 0x10
1902};
1903
1904static const u8 T10[] = {
1905    0x25, 0x19, 0x49, 0x8e, 0x80, 0xf1, 0x47, 0x8f,
1906    0x37, 0xba, 0x55, 0xbd, 0x6d, 0x27, 0x61, 0x8c
1907};
1908
1909/* Test Case 11 */
1910# define K11 K10
1911# define P11 P10
1912# define A11 A10
1913static const u8 IV11[] = { 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad };
1914
1915static const u8 C11[] = {
1916    0x0f, 0x10, 0xf5, 0x99, 0xae, 0x14, 0xa1, 0x54,
1917    0xed, 0x24, 0xb3, 0x6e, 0x25, 0x32, 0x4d, 0xb8,
1918    0xc5, 0x66, 0x63, 0x2e, 0xf2, 0xbb, 0xb3, 0x4f,
1919    0x83, 0x47, 0x28, 0x0f, 0xc4, 0x50, 0x70, 0x57,
1920    0xfd, 0xdc, 0x29, 0xdf, 0x9a, 0x47, 0x1f, 0x75,
1921    0xc6, 0x65, 0x41, 0xd4, 0xd4, 0xda, 0xd1, 0xc9,
1922    0xe9, 0x3a, 0x19, 0xa5, 0x8e, 0x8b, 0x47, 0x3f,
1923    0xa0, 0xf0, 0x62, 0xf7
1924};
1925
1926static const u8 T11[] = {
1927    0x65, 0xdc, 0xc5, 0x7f, 0xcf, 0x62, 0x3a, 0x24,
1928    0x09, 0x4f, 0xcc, 0xa4, 0x0d, 0x35, 0x33, 0xf8
1929};
1930
1931/* Test Case 12 */
1932# define K12 K11
1933# define P12 P11
1934# define A12 A11
1935static const u8 IV12[] = {
1936    0x93, 0x13, 0x22, 0x5d, 0xf8, 0x84, 0x06, 0xe5,
1937    0x55, 0x90, 0x9c, 0x5a, 0xff, 0x52, 0x69, 0xaa,
1938    0x6a, 0x7a, 0x95, 0x38, 0x53, 0x4f, 0x7d, 0xa1,
1939    0xe4, 0xc3, 0x03, 0xd2, 0xa3, 0x18, 0xa7, 0x28,
1940    0xc3, 0xc0, 0xc9, 0x51, 0x56, 0x80, 0x95, 0x39,
1941    0xfc, 0xf0, 0xe2, 0x42, 0x9a, 0x6b, 0x52, 0x54,
1942    0x16, 0xae, 0xdb, 0xf5, 0xa0, 0xde, 0x6a, 0x57,
1943    0xa6, 0x37, 0xb3, 0x9b
1944};
1945
1946static const u8 C12[] = {
1947    0xd2, 0x7e, 0x88, 0x68, 0x1c, 0xe3, 0x24, 0x3c,
1948    0x48, 0x30, 0x16, 0x5a, 0x8f, 0xdc, 0xf9, 0xff,
1949    0x1d, 0xe9, 0xa1, 0xd8, 0xe6, 0xb4, 0x47, 0xef,
1950    0x6e, 0xf7, 0xb7, 0x98, 0x28, 0x66, 0x6e, 0x45,
1951    0x81, 0xe7, 0x90, 0x12, 0xaf, 0x34, 0xdd, 0xd9,
1952    0xe2, 0xf0, 0x37, 0x58, 0x9b, 0x29, 0x2d, 0xb3,
1953    0xe6, 0x7c, 0x03, 0x67, 0x45, 0xfa, 0x22, 0xe7,
1954    0xe9, 0xb7, 0x37, 0x3b
1955};
1956
1957static const u8 T12[] = {
1958    0xdc, 0xf5, 0x66, 0xff, 0x29, 0x1c, 0x25, 0xbb,
1959    0xb8, 0x56, 0x8f, 0xc3, 0xd3, 0x76, 0xa6, 0xd9
1960};
1961
1962/* Test Case 13 */
1963static const u8 K13[32], *P13 = NULL, *A13 = NULL, IV13[12], *C13 = NULL;
1964static const u8 T13[] = {
1965    0x53, 0x0f, 0x8a, 0xfb, 0xc7, 0x45, 0x36, 0xb9,
1966    0xa9, 0x63, 0xb4, 0xf1, 0xc4, 0xcb, 0x73, 0x8b
1967};
1968
1969/* Test Case 14 */
1970# define K14 K13
1971# define A14 A13
1972static const u8 P14[16], IV14[12];
1973static const u8 C14[] = {
1974    0xce, 0xa7, 0x40, 0x3d, 0x4d, 0x60, 0x6b, 0x6e,
1975    0x07, 0x4e, 0xc5, 0xd3, 0xba, 0xf3, 0x9d, 0x18
1976};
1977
1978static const u8 T14[] = {
1979    0xd0, 0xd1, 0xc8, 0xa7, 0x99, 0x99, 0x6b, 0xf0,
1980    0x26, 0x5b, 0x98, 0xb5, 0xd4, 0x8a, 0xb9, 0x19
1981};
1982
1983/* Test Case 15 */
1984# define A15 A14
1985static const u8 K15[] = {
1986    0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c,
1987    0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08,
1988    0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c,
1989    0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08
1990};
1991
1992static const u8 P15[] = {
1993    0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5,
1994    0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a,
1995    0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda,
1996    0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72,
1997    0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53,
1998    0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25,
1999    0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57,
2000    0xba, 0x63, 0x7b, 0x39, 0x1a, 0xaf, 0xd2, 0x55
2001};
2002
2003static const u8 IV15[] = {
2004    0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad,
2005    0xde, 0xca, 0xf8, 0x88
2006};
2007
2008static const u8 C15[] = {
2009    0x52, 0x2d, 0xc1, 0xf0, 0x99, 0x56, 0x7d, 0x07,
2010    0xf4, 0x7f, 0x37, 0xa3, 0x2a, 0x84, 0x42, 0x7d,
2011    0x64, 0x3a, 0x8c, 0xdc, 0xbf, 0xe5, 0xc0, 0xc9,
2012    0x75, 0x98, 0xa2, 0xbd, 0x25, 0x55, 0xd1, 0xaa,
2013    0x8c, 0xb0, 0x8e, 0x48, 0x59, 0x0d, 0xbb, 0x3d,
2014    0xa7, 0xb0, 0x8b, 0x10, 0x56, 0x82, 0x88, 0x38,
2015    0xc5, 0xf6, 0x1e, 0x63, 0x93, 0xba, 0x7a, 0x0a,
2016    0xbc, 0xc9, 0xf6, 0x62, 0x89, 0x80, 0x15, 0xad
2017};
2018
2019static const u8 T15[] = {
2020    0xb0, 0x94, 0xda, 0xc5, 0xd9, 0x34, 0x71, 0xbd,
2021    0xec, 0x1a, 0x50, 0x22, 0x70, 0xe3, 0xcc, 0x6c
2022};
2023
2024/* Test Case 16 */
2025# define K16 K15
2026# define IV16 IV15
2027static const u8 P16[] = {
2028    0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5,
2029    0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a,
2030    0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda,
2031    0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72,
2032    0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53,
2033    0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25,
2034    0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57,
2035    0xba, 0x63, 0x7b, 0x39
2036};
2037
2038static const u8 A16[] = {
2039    0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef,
2040    0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef,
2041    0xab, 0xad, 0xda, 0xd2
2042};
2043
2044static const u8 C16[] = {
2045    0x52, 0x2d, 0xc1, 0xf0, 0x99, 0x56, 0x7d, 0x07,
2046    0xf4, 0x7f, 0x37, 0xa3, 0x2a, 0x84, 0x42, 0x7d,
2047    0x64, 0x3a, 0x8c, 0xdc, 0xbf, 0xe5, 0xc0, 0xc9,
2048    0x75, 0x98, 0xa2, 0xbd, 0x25, 0x55, 0xd1, 0xaa,
2049    0x8c, 0xb0, 0x8e, 0x48, 0x59, 0x0d, 0xbb, 0x3d,
2050    0xa7, 0xb0, 0x8b, 0x10, 0x56, 0x82, 0x88, 0x38,
2051    0xc5, 0xf6, 0x1e, 0x63, 0x93, 0xba, 0x7a, 0x0a,
2052    0xbc, 0xc9, 0xf6, 0x62
2053};
2054
2055static const u8 T16[] = {
2056    0x76, 0xfc, 0x6e, 0xce, 0x0f, 0x4e, 0x17, 0x68,
2057    0xcd, 0xdf, 0x88, 0x53, 0xbb, 0x2d, 0x55, 0x1b
2058};
2059
2060/* Test Case 17 */
2061# define K17 K16
2062# define P17 P16
2063# define A17 A16
2064static const u8 IV17[] = { 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad };
2065
2066static const u8 C17[] = {
2067    0xc3, 0x76, 0x2d, 0xf1, 0xca, 0x78, 0x7d, 0x32,
2068    0xae, 0x47, 0xc1, 0x3b, 0xf1, 0x98, 0x44, 0xcb,
2069    0xaf, 0x1a, 0xe1, 0x4d, 0x0b, 0x97, 0x6a, 0xfa,
2070    0xc5, 0x2f, 0xf7, 0xd7, 0x9b, 0xba, 0x9d, 0xe0,
2071    0xfe, 0xb5, 0x82, 0xd3, 0x39, 0x34, 0xa4, 0xf0,
2072    0x95, 0x4c, 0xc2, 0x36, 0x3b, 0xc7, 0x3f, 0x78,
2073    0x62, 0xac, 0x43, 0x0e, 0x64, 0xab, 0xe4, 0x99,
2074    0xf4, 0x7c, 0x9b, 0x1f
2075};
2076
2077static const u8 T17[] = {
2078    0x3a, 0x33, 0x7d, 0xbf, 0x46, 0xa7, 0x92, 0xc4,
2079    0x5e, 0x45, 0x49, 0x13, 0xfe, 0x2e, 0xa8, 0xf2
2080};
2081
2082/* Test Case 18 */
2083# define K18 K17
2084# define P18 P17
2085# define A18 A17
2086static const u8 IV18[] = {
2087    0x93, 0x13, 0x22, 0x5d, 0xf8, 0x84, 0x06, 0xe5,
2088    0x55, 0x90, 0x9c, 0x5a, 0xff, 0x52, 0x69, 0xaa,
2089    0x6a, 0x7a, 0x95, 0x38, 0x53, 0x4f, 0x7d, 0xa1,
2090    0xe4, 0xc3, 0x03, 0xd2, 0xa3, 0x18, 0xa7, 0x28,
2091    0xc3, 0xc0, 0xc9, 0x51, 0x56, 0x80, 0x95, 0x39,
2092    0xfc, 0xf0, 0xe2, 0x42, 0x9a, 0x6b, 0x52, 0x54,
2093    0x16, 0xae, 0xdb, 0xf5, 0xa0, 0xde, 0x6a, 0x57,
2094    0xa6, 0x37, 0xb3, 0x9b
2095};
2096
2097static const u8 C18[] = {
2098    0x5a, 0x8d, 0xef, 0x2f, 0x0c, 0x9e, 0x53, 0xf1,
2099    0xf7, 0x5d, 0x78, 0x53, 0x65, 0x9e, 0x2a, 0x20,
2100    0xee, 0xb2, 0xb2, 0x2a, 0xaf, 0xde, 0x64, 0x19,
2101    0xa0, 0x58, 0xab, 0x4f, 0x6f, 0x74, 0x6b, 0xf4,
2102    0x0f, 0xc0, 0xc3, 0xb7, 0x80, 0xf2, 0x44, 0x45,
2103    0x2d, 0xa3, 0xeb, 0xf1, 0xc5, 0xd8, 0x2c, 0xde,
2104    0xa2, 0x41, 0x89, 0x97, 0x20, 0x0e, 0xf8, 0x2e,
2105    0x44, 0xae, 0x7e, 0x3f
2106};
2107
2108static const u8 T18[] = {
2109    0xa4, 0x4a, 0x82, 0x66, 0xee, 0x1c, 0x8e, 0xb0,
2110    0xc8, 0xb5, 0xd4, 0xcf, 0x5a, 0xe9, 0xf1, 0x9a
2111};
2112
2113/* Test Case 19 */
2114# define K19 K1
2115# define P19 P1
2116# define IV19 IV1
2117# define C19 C1
2118static const u8 A19[] = {
2119    0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5,
2120    0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a,
2121    0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda,
2122    0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72,
2123    0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53,
2124    0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25,
2125    0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57,
2126    0xba, 0x63, 0x7b, 0x39, 0x1a, 0xaf, 0xd2, 0x55,
2127    0x52, 0x2d, 0xc1, 0xf0, 0x99, 0x56, 0x7d, 0x07,
2128    0xf4, 0x7f, 0x37, 0xa3, 0x2a, 0x84, 0x42, 0x7d,
2129    0x64, 0x3a, 0x8c, 0xdc, 0xbf, 0xe5, 0xc0, 0xc9,
2130    0x75, 0x98, 0xa2, 0xbd, 0x25, 0x55, 0xd1, 0xaa,
2131    0x8c, 0xb0, 0x8e, 0x48, 0x59, 0x0d, 0xbb, 0x3d,
2132    0xa7, 0xb0, 0x8b, 0x10, 0x56, 0x82, 0x88, 0x38,
2133    0xc5, 0xf6, 0x1e, 0x63, 0x93, 0xba, 0x7a, 0x0a,
2134    0xbc, 0xc9, 0xf6, 0x62, 0x89, 0x80, 0x15, 0xad
2135};
2136
2137static const u8 T19[] = {
2138    0x5f, 0xea, 0x79, 0x3a, 0x2d, 0x6f, 0x97, 0x4d,
2139    0x37, 0xe6, 0x8e, 0x0c, 0xb8, 0xff, 0x94, 0x92
2140};
2141
2142/* Test Case 20 */
2143# define K20 K1
2144# define A20 A1
2145/* this results in 0xff in counter LSB */
2146static const u8 IV20[64] = { 0xff, 0xff, 0xff, 0xff };
2147
2148static const u8 P20[288];
2149static const u8 C20[] = {
2150    0x56, 0xb3, 0x37, 0x3c, 0xa9, 0xef, 0x6e, 0x4a,
2151    0x2b, 0x64, 0xfe, 0x1e, 0x9a, 0x17, 0xb6, 0x14,
2152    0x25, 0xf1, 0x0d, 0x47, 0xa7, 0x5a, 0x5f, 0xce,
2153    0x13, 0xef, 0xc6, 0xbc, 0x78, 0x4a, 0xf2, 0x4f,
2154    0x41, 0x41, 0xbd, 0xd4, 0x8c, 0xf7, 0xc7, 0x70,
2155    0x88, 0x7a, 0xfd, 0x57, 0x3c, 0xca, 0x54, 0x18,
2156    0xa9, 0xae, 0xff, 0xcd, 0x7c, 0x5c, 0xed, 0xdf,
2157    0xc6, 0xa7, 0x83, 0x97, 0xb9, 0xa8, 0x5b, 0x49,
2158    0x9d, 0xa5, 0x58, 0x25, 0x72, 0x67, 0xca, 0xab,
2159    0x2a, 0xd0, 0xb2, 0x3c, 0xa4, 0x76, 0xa5, 0x3c,
2160    0xb1, 0x7f, 0xb4, 0x1c, 0x4b, 0x8b, 0x47, 0x5c,
2161    0xb4, 0xf3, 0xf7, 0x16, 0x50, 0x94, 0xc2, 0x29,
2162    0xc9, 0xe8, 0xc4, 0xdc, 0x0a, 0x2a, 0x5f, 0xf1,
2163    0x90, 0x3e, 0x50, 0x15, 0x11, 0x22, 0x13, 0x76,
2164    0xa1, 0xcd, 0xb8, 0x36, 0x4c, 0x50, 0x61, 0xa2,
2165    0x0c, 0xae, 0x74, 0xbc, 0x4a, 0xcd, 0x76, 0xce,
2166    0xb0, 0xab, 0xc9, 0xfd, 0x32, 0x17, 0xef, 0x9f,
2167    0x8c, 0x90, 0xbe, 0x40, 0x2d, 0xdf, 0x6d, 0x86,
2168    0x97, 0xf4, 0xf8, 0x80, 0xdf, 0xf1, 0x5b, 0xfb,
2169    0x7a, 0x6b, 0x28, 0x24, 0x1e, 0xc8, 0xfe, 0x18,
2170    0x3c, 0x2d, 0x59, 0xe3, 0xf9, 0xdf, 0xff, 0x65,
2171    0x3c, 0x71, 0x26, 0xf0, 0xac, 0xb9, 0xe6, 0x42,
2172    0x11, 0xf4, 0x2b, 0xae, 0x12, 0xaf, 0x46, 0x2b,
2173    0x10, 0x70, 0xbe, 0xf1, 0xab, 0x5e, 0x36, 0x06,
2174    0x87, 0x2c, 0xa1, 0x0d, 0xee, 0x15, 0xb3, 0x24,
2175    0x9b, 0x1a, 0x1b, 0x95, 0x8f, 0x23, 0x13, 0x4c,
2176    0x4b, 0xcc, 0xb7, 0xd0, 0x32, 0x00, 0xbc, 0xe4,
2177    0x20, 0xa2, 0xf8, 0xeb, 0x66, 0xdc, 0xf3, 0x64,
2178    0x4d, 0x14, 0x23, 0xc1, 0xb5, 0x69, 0x90, 0x03,
2179    0xc1, 0x3e, 0xce, 0xf4, 0xbf, 0x38, 0xa3, 0xb6,
2180    0x0e, 0xed, 0xc3, 0x40, 0x33, 0xba, 0xc1, 0x90,
2181    0x27, 0x83, 0xdc, 0x6d, 0x89, 0xe2, 0xe7, 0x74,
2182    0x18, 0x8a, 0x43, 0x9c, 0x7e, 0xbc, 0xc0, 0x67,
2183    0x2d, 0xbd, 0xa4, 0xdd, 0xcf, 0xb2, 0x79, 0x46,
2184    0x13, 0xb0, 0xbe, 0x41, 0x31, 0x5e, 0xf7, 0x78,
2185    0x70, 0x8a, 0x70, 0xee, 0x7d, 0x75, 0x16, 0x5c
2186};
2187
2188static const u8 T20[] = {
2189    0x8b, 0x30, 0x7f, 0x6b, 0x33, 0x28, 0x6d, 0x0a,
2190    0xb0, 0x26, 0xa9, 0xed, 0x3f, 0xe1, 0xe8, 0x5f
2191};
2192
/*
 * TEST_CASE(n) - run known-answer test number n in both directions.
 *
 * Expects K<n>, IV<n>, A<n>, P<n>, C<n> and T<n> to be defined, and uses
 * the variables `ctx` (GCM128_CONTEXT), `key` (AES_KEY) and `ret` (int)
 * from the invoking scope.
 *
 * Encrypt pass: key schedule + GCM init, set IV, feed AAD (if A<n> is
 * non-NULL), encrypt P<n> into a zeroed scratch buffer (if P<n> is
 * non-NULL), then check the tag against T<n> and the output against C<n>.
 * Decrypt pass: set the IV again on the same context and do the mirror
 * image, comparing the output against P<n>.
 *
 * Each failing pass increments `ret` once and prints a message.  A
 * non-zero return from the key setup, AAD, encrypt or decrypt call counts
 * as a failure of the pass, in addition to a tag or data mismatch.
 *
 * Note: the scratch buffer is sized with sizeof(P<n>).  For cases where
 * P<n> is a NULL pointer rather than an array this is the pointer size,
 * which is harmless because the P<n>/C<n> guards then skip every use of
 * the buffer's contents.
 */
# define TEST_CASE(n)    do {                                    \
        u8 out[sizeof(P##n)];                                   \
        int keyerr, bad;                                        \
        keyerr = (AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key) != 0);  \
        CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);  \
        CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));          \
        memset(out,0,sizeof(out));                              \
        bad = keyerr;                                           \
        if (A##n && CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n)))  \
                bad = 1;                                        \
        if (P##n && CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out)))   \
                bad = 1;                                        \
        if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||               \
            (C##n && memcmp(out,C##n,sizeof(out))))             \
                bad = 1;                                        \
        if (bad)                                                \
                ret++, printf ("encrypt test#%d failed.\n",n);  \
        CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));          \
        memset(out,0,sizeof(out));                              \
        bad = keyerr;                                           \
        if (A##n && CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n)))  \
                bad = 1;                                        \
        if (C##n && CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out)))   \
                bad = 1;                                        \
        if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||               \
            (P##n && memcmp(out,P##n,sizeof(out))))             \
                bad = 1;                                        \
        if (bad)                                                \
                ret++, printf ("decrypt test#%d failed.\n",n);  \
        } while(0)
2212
2213int main()
2214{
2215    GCM128_CONTEXT ctx;
2216    AES_KEY key;
2217    int ret = 0;
2218
2219    TEST_CASE(1);
2220    TEST_CASE(2);
2221    TEST_CASE(3);
2222    TEST_CASE(4);
2223    TEST_CASE(5);
2224    TEST_CASE(6);
2225    TEST_CASE(7);
2226    TEST_CASE(8);
2227    TEST_CASE(9);
2228    TEST_CASE(10);
2229    TEST_CASE(11);
2230    TEST_CASE(12);
2231    TEST_CASE(13);
2232    TEST_CASE(14);
2233    TEST_CASE(15);
2234    TEST_CASE(16);
2235    TEST_CASE(17);
2236    TEST_CASE(18);
2237    TEST_CASE(19);
2238    TEST_CASE(20);
2239
2240# ifdef OPENSSL_CPUID_OBJ
2241    {
2242        size_t start, stop, gcm_t, ctr_t, OPENSSL_rdtsc();
2243        union {
2244            u64 u;
2245            u8 c[1024];
2246        } buf;
2247        int i;
2248
2249        AES_set_encrypt_key(K1, sizeof(K1) * 8, &key);
2250        CRYPTO_gcm128_init(&ctx, &key, (block128_f) AES_encrypt);
2251        CRYPTO_gcm128_setiv(&ctx, IV1, sizeof(IV1));
2252
2253        CRYPTO_gcm128_encrypt(&ctx, buf.c, buf.c, sizeof(buf));
2254        start = OPENSSL_rdtsc();
2255        CRYPTO_gcm128_encrypt(&ctx, buf.c, buf.c, sizeof(buf));
2256        gcm_t = OPENSSL_rdtsc() - start;
2257
2258        CRYPTO_ctr128_encrypt(buf.c, buf.c, sizeof(buf),
2259                              &key, ctx.Yi.c, ctx.EKi.c, &ctx.mres,
2260                              (block128_f) AES_encrypt);
2261        start = OPENSSL_rdtsc();
2262        CRYPTO_ctr128_encrypt(buf.c, buf.c, sizeof(buf),
2263                              &key, ctx.Yi.c, ctx.EKi.c, &ctx.mres,
2264                              (block128_f) AES_encrypt);
2265        ctr_t = OPENSSL_rdtsc() - start;
2266
2267        printf("%.2f-%.2f=%.2f\n",
2268               gcm_t / (double)sizeof(buf),
2269               ctr_t / (double)sizeof(buf),
2270               (gcm_t - ctr_t) / (double)sizeof(buf));
2271#  ifdef GHASH
2272        {
2273            void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
2274                                 const u8 *inp, size_t len) = ctx.ghash;
2275
2276            GHASH((&ctx), buf.c, sizeof(buf));
2277            start = OPENSSL_rdtsc();
2278            for (i = 0; i < 100; ++i)
2279                GHASH((&ctx), buf.c, sizeof(buf));
2280            gcm_t = OPENSSL_rdtsc() - start;
2281            printf("%.2f\n", gcm_t / (double)sizeof(buf) / (double)i);
2282        }
2283#  endif
2284    }
2285# endif
2286
2287    return ret;
2288}
2289#endif
2290