1/*
2   BLAKE2 reference source code package - optimized C implementations
3
4   Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
5
6   To the extent possible under law, the author(s) have dedicated all copyright
7   and related and neighboring rights to this software to the public domain
8   worldwide. This software is distributed without any warranty.
9
10   You should have received a copy of the CC0 Public Domain Dedication along with
11   this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
12*/
13
14#include <stdint.h>
15#include <string.h>
16#include <stdio.h>
17
18#include "blake2.h"
19#include "blake2-impl.h"
20
21#include "blake2-config.h"
22
23#if defined(_MSC_VER)
24#include <intrin.h>
25#endif
26
27#if defined(HAVE_SSE2)
28#include <emmintrin.h>
29// MSVC only defines  _mm_set_epi64x for x86_64...
30#if defined(_MSC_VER) && !defined(_M_X64)
31static inline __m128i _mm_set_epi64x( const uint64_t u1, const uint64_t u0 )
32{
33  return _mm_set_epi32( u1 >> 32, u1, u0 >> 32, u0 );
34}
35#endif
36#endif
37
38#if defined(HAVE_SSSE3)
39#include <tmmintrin.h>
40#endif
41#if defined(HAVE_SSE4_1)
42#include <smmintrin.h>
43#endif
44#if defined(HAVE_AVX)
45#include <immintrin.h>
46#endif
47#if defined(HAVE_XOP) && !defined(_MSC_VER)
48#include <x86intrin.h>
49#endif
50
51
52
53#include "blake2b-round.h"
54
/* BLAKE2b initialization vector: eight 64-bit words.
   Within this file, blake2b_init0 copies the words into S->h,
   blake2b_init_param XORs them byte-wise with the parameter block, and
   blake2b_compress loads them into rows 3 and 4 of its working state. */
static const uint64_t blake2b_IV[8] =
{
  0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
  0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
  0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
  0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
};
62
/* Message-word permutation table: twelve rows, each a permutation of the
   indices 0..15.  Rows 10 and 11 repeat rows 0 and 1.
   NOTE(review): nothing in the visible part of this file references this
   table by name; presumably it mirrors the schedule used by the ROUND macro
   from blake2b-round.h -- confirm before removing or changing it. */
static const uint8_t blake2b_sigma[12][16] =
{
  {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 } ,
  { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 } ,
  { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 } ,
  {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 } ,
  {  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 } ,
  {  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 } ,
  { 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 } ,
  { 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 } ,
  {  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 } ,
  { 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13 , 0 } ,
  {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 } ,
  { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 }
};
78
79
80/* Some helper functions, not necessarily useful */
81static inline int blake2b_set_lastnode( blake2b_state *S )
82{
83  S->f[1] = ~0ULL;
84  return 0;
85}
86
87static inline int blake2b_clear_lastnode( blake2b_state *S )
88{
89  S->f[1] = 0ULL;
90  return 0;
91}
92
93static inline int blake2b_set_lastblock( blake2b_state *S )
94{
95  if( S->last_node ) blake2b_set_lastnode( S );
96
97  S->f[0] = ~0ULL;
98  return 0;
99}
100
101static inline int blake2b_clear_lastblock( blake2b_state *S )
102{
103  if( S->last_node ) blake2b_clear_lastnode( S );
104
105  S->f[0] = 0ULL;
106  return 0;
107}
108
109
110static inline int blake2b_increment_counter( blake2b_state *S, const uint64_t inc )
111{
112#if defined(__x86_64__) && (defined(__GNUC__) || defined(__clang__))
113  // ADD/ADC chain
114  __uint128_t t = ( ( __uint128_t )S->t[1] << 64 ) | S->t[0];
115  t += inc;
116  S->t[0] = ( uint64_t )( t >>  0 );
117  S->t[1] = ( uint64_t )( t >> 64 );
118#else
119  S->t[0] += inc;
120  S->t[1] += ( S->t[0] < inc );
121#endif
122  return 0;
123}
124
125
126// Parameter-related functions
127static inline int blake2b_param_set_digest_length( blake2b_param *P, const uint8_t digest_length )
128{
129  P->digest_length = digest_length;
130  return 0;
131}
132
133static inline int blake2b_param_set_fanout( blake2b_param *P, const uint8_t fanout )
134{
135  P->fanout = fanout;
136  return 0;
137}
138
139static inline int blake2b_param_set_max_depth( blake2b_param *P, const uint8_t depth )
140{
141  P->depth = depth;
142  return 0;
143}
144
145static inline int blake2b_param_set_leaf_length( blake2b_param *P, const uint32_t leaf_length )
146{
147  P->leaf_length = leaf_length;
148  return 0;
149}
150
151static inline int blake2b_param_set_node_offset( blake2b_param *P, const uint64_t node_offset )
152{
153  P->node_offset = node_offset;
154  return 0;
155}
156
157static inline int blake2b_param_set_node_depth( blake2b_param *P, const uint8_t node_depth )
158{
159  P->node_depth = node_depth;
160  return 0;
161}
162
163static inline int blake2b_param_set_inner_length( blake2b_param *P, const uint8_t inner_length )
164{
165  P->inner_length = inner_length;
166  return 0;
167}
168
169static inline int blake2b_param_set_salt( blake2b_param *P, const uint8_t salt[BLAKE2B_SALTBYTES] )
170{
171  memcpy( P->salt, salt, BLAKE2B_SALTBYTES );
172  return 0;
173}
174
175static inline int blake2b_param_set_personal( blake2b_param *P, const uint8_t personal[BLAKE2B_PERSONALBYTES] )
176{
177  memcpy( P->personal, personal, BLAKE2B_PERSONALBYTES );
178  return 0;
179}
180
181static inline int blake2b_init0( blake2b_state *S )
182{
183  memset( S, 0, sizeof( blake2b_state ) );
184
185  for( int i = 0; i < 8; ++i ) S->h[i] = blake2b_IV[i];
186
187  return 0;
188}
189
190
191
/* Route every public entry point through BLAKE2_IMPL_NAME so the symbols
   defined below get implementation-specific names.  BLAKE2_IMPL_NAME is not
   defined in the visible part of this file; presumably it comes from one of
   the blake2 headers included above -- confirm there how names are formed.
   These defines must stay ahead of the prototypes and definitions that
   follow, since both are rewritten by them. */
#define blake2b_init BLAKE2_IMPL_NAME(blake2b_init)
#define blake2b_init_param BLAKE2_IMPL_NAME(blake2b_init_param)
#define blake2b_init_key BLAKE2_IMPL_NAME(blake2b_init_key)
#define blake2b_update BLAKE2_IMPL_NAME(blake2b_update)
#define blake2b_final BLAKE2_IMPL_NAME(blake2b_final)
#define blake2b BLAKE2_IMPL_NAME(blake2b)

/* Prototypes for the (renamed) entry points, given C linkage when this file
   is compiled as C++. */
#if defined(__cplusplus)
extern "C" {
#endif
  int blake2b_init( blake2b_state *S, size_t outlen );
  int blake2b_init_param( blake2b_state *S, const blake2b_param *P );
  int blake2b_init_key( blake2b_state *S, size_t outlen, const void *key, size_t keylen );
  int blake2b_update( blake2b_state *S, const uint8_t *in, size_t inlen );
  int blake2b_final( blake2b_state *S, uint8_t *out, size_t outlen );
  int blake2b( uint8_t *out, const void *in, const void *key, size_t outlen, size_t inlen, size_t keylen );
#if defined(__cplusplus)
}
#endif
211
212/* init xors IV with input parameter block */
213int blake2b_init_param( blake2b_state *S, const blake2b_param *P )
214{
215  uint8_t *p, *h, *v;
216  //blake2b_init0( S );
217  v = ( uint8_t * )( blake2b_IV );
218  h = ( uint8_t * )( S->h );
219  p = ( uint8_t * )( P );
220  /* IV XOR ParamBlock */
221  memset( S, 0, sizeof( blake2b_state ) );
222
223  for( int i = 0; i < BLAKE2B_OUTBYTES; ++i ) h[i] = v[i] ^ p[i];
224
225  S->outlen = P->digest_length;
226  return 0;
227}
228
229
230/* Some sort of default parameter block initialization, for sequential blake2b */
231
232int blake2b_init( blake2b_state *S, size_t outlen )
233{
234  if ( ( !outlen ) || ( outlen > BLAKE2B_OUTBYTES ) ) return -1;
235
236  const blake2b_param P =
237  {
238    ( uint8_t ) outlen,
239    0,
240    1,
241    1,
242    0,
243    0,
244    0,
245    0,
246    {0},
247    {0},
248    {0}
249  };
250  return blake2b_init_param( S, &P );
251}
252
253int blake2b_init_key( blake2b_state *S, size_t outlen, const void *key, size_t keylen )
254{
255  if ( ( !outlen ) || ( outlen > BLAKE2B_OUTBYTES ) ) return -1;
256
257  if ( ( !keylen ) || keylen > BLAKE2B_KEYBYTES ) return -1;
258
259  const blake2b_param P =
260  {
261    ( uint8_t ) outlen,
262    ( uint8_t ) keylen,
263    1,
264    1,
265    0,
266    0,
267    0,
268    0,
269    {0},
270    {0},
271    {0}
272  };
273
274  if( blake2b_init_param( S, &P ) < 0 )
275    return 0;
276
277  {
278    uint8_t block[BLAKE2B_BLOCKBYTES];
279    memset( block, 0, BLAKE2B_BLOCKBYTES );
280    memcpy( block, key, keylen );
281    blake2b_update( S, block, BLAKE2B_BLOCKBYTES );
282    secure_zero_memory( block, BLAKE2B_BLOCKBYTES ); /* Burn the key from stack */
283  }
284  return 0;
285}
286
/* Compresses one message block into the chaining value S->h.

   S     - state; h, t and f are read, and h is updated in place.
   block - BLAKE2B_BLOCKBYTES bytes of message input.

   Always returns 0.

   LOADU, STOREU and ROUND are macros defined outside this file (ROUND
   presumably in blake2b-round.h).  The local names declared below are not
   all referenced directly in this function; presumably the ROUND expansion
   uses them by name, so they must not be renamed or removed -- confirm
   against the macro definitions. */
static inline int blake2b_compress( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] )
{
  /* Working state: four rows, each held as a low and a high 128-bit half. */
  __m128i row1l, row1h;
  __m128i row2l, row2h;
  __m128i row3l, row3h;
  __m128i row4l, row4h;
  /* Not referenced directly below; presumably scratch registers for ROUND. */
  __m128i b0, b1;
  __m128i t0, t1;
#if defined(HAVE_SSSE3) && !defined(HAVE_XOP)
  /* Byte-shuffle masks; presumably consumed by the rotation helpers inside
     ROUND (names suggest 16- and 24-bit rotations) -- confirm. */
  const __m128i r16 = _mm_setr_epi8( 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9 );
  const __m128i r24 = _mm_setr_epi8( 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10 );
#endif
#if defined(HAVE_SSE4_1)
  /* SSE4.1 build: the message is kept as eight 16-byte vectors. */
  const __m128i m0 = LOADU( block + 00 );
  const __m128i m1 = LOADU( block + 16 );
  const __m128i m2 = LOADU( block + 32 );
  const __m128i m3 = LOADU( block + 48 );
  const __m128i m4 = LOADU( block + 64 );
  const __m128i m5 = LOADU( block + 80 );
  const __m128i m6 = LOADU( block + 96 );
  const __m128i m7 = LOADU( block + 112 );
#else
  /* Otherwise the message is kept as sixteen 64-bit words.
     NOTE(review): block is read through a uint64_t pointer cast, which
     relies on the target tolerating this access (alignment and aliasing). */
  const uint64_t  m0 = ( ( uint64_t * )block )[ 0];
  const uint64_t  m1 = ( ( uint64_t * )block )[ 1];
  const uint64_t  m2 = ( ( uint64_t * )block )[ 2];
  const uint64_t  m3 = ( ( uint64_t * )block )[ 3];
  const uint64_t  m4 = ( ( uint64_t * )block )[ 4];
  const uint64_t  m5 = ( ( uint64_t * )block )[ 5];
  const uint64_t  m6 = ( ( uint64_t * )block )[ 6];
  const uint64_t  m7 = ( ( uint64_t * )block )[ 7];
  const uint64_t  m8 = ( ( uint64_t * )block )[ 8];
  const uint64_t  m9 = ( ( uint64_t * )block )[ 9];
  const uint64_t m10 = ( ( uint64_t * )block )[10];
  const uint64_t m11 = ( ( uint64_t * )block )[11];
  const uint64_t m12 = ( ( uint64_t * )block )[12];
  const uint64_t m13 = ( ( uint64_t * )block )[13];
  const uint64_t m14 = ( ( uint64_t * )block )[14];
  const uint64_t m15 = ( ( uint64_t * )block )[15];
#endif
  /* Rows 1-2 start as the chaining value h[0..7]. */
  row1l = LOADU( &S->h[0] );
  row1h = LOADU( &S->h[2] );
  row2l = LOADU( &S->h[4] );
  row2h = LOADU( &S->h[6] );
  /* Row 3 starts as IV[0..3]; row 4 as IV[4..7] XORed with the counter
     words t[0..1] and the finalization words f[0..1]. */
  row3l = LOADU( &blake2b_IV[0] );
  row3h = LOADU( &blake2b_IV[2] );
  row4l = _mm_xor_si128( LOADU( &blake2b_IV[4] ), LOADU( &S->t[0] ) );
  row4h = _mm_xor_si128( LOADU( &blake2b_IV[6] ), LOADU( &S->f[0] ) );
  /* Twelve rounds. */
  ROUND( 0 );
  ROUND( 1 );
  ROUND( 2 );
  ROUND( 3 );
  ROUND( 4 );
  ROUND( 5 );
  ROUND( 6 );
  ROUND( 7 );
  ROUND( 8 );
  ROUND( 9 );
  ROUND( 10 );
  ROUND( 11 );
  /* Feed-forward: h[0..3] ^= row1 ^ row3 and h[4..7] ^= row2 ^ row4. */
  row1l = _mm_xor_si128( row3l, row1l );
  row1h = _mm_xor_si128( row3h, row1h );
  STOREU( &S->h[0], _mm_xor_si128( LOADU( &S->h[0] ), row1l ) );
  STOREU( &S->h[2], _mm_xor_si128( LOADU( &S->h[2] ), row1h ) );
  row2l = _mm_xor_si128( row4l, row2l );
  row2h = _mm_xor_si128( row4h, row2h );
  STOREU( &S->h[4], _mm_xor_si128( LOADU( &S->h[4] ), row2l ) );
  STOREU( &S->h[6], _mm_xor_si128( LOADU( &S->h[6] ), row2h ) );
  return 0;
}
356
357
358int blake2b_update( blake2b_state *S, const uint8_t *in, size_t inlen )
359{
360  while( inlen > 0 )
361  {
362    uint32_t left = S->buflen;
363    uint32_t fill = 2 * BLAKE2B_BLOCKBYTES - left;
364
365    if( inlen > fill )
366    {
367      memcpy( S->buf + left, in, fill ); // Fill buffer
368      S->buflen += fill;
369      blake2b_increment_counter( S, BLAKE2B_BLOCKBYTES );
370      blake2b_compress( S, S->buf ); // Compress
371      memcpy( S->buf, S->buf + BLAKE2B_BLOCKBYTES, BLAKE2B_BLOCKBYTES ); // Shift buffer left
372      S->buflen -= BLAKE2B_BLOCKBYTES;
373      in += fill;
374      inlen -= fill;
375    }
376    else // inlen <= fill
377    {
378      memcpy( S->buf + left, in, inlen );
379      S->buflen += ( uint32_t ) inlen; // Be lazy, do not compress
380      in += inlen;
381      inlen -= inlen;
382    }
383  }
384
385  return 0;
386}
387
388
389int blake2b_final( blake2b_state *S, uint8_t *out, size_t outlen )
390{
391  if(S->outlen != outlen) return -1;
392
393  if( S->buflen > BLAKE2B_BLOCKBYTES )
394  {
395    blake2b_increment_counter( S, BLAKE2B_BLOCKBYTES );
396    blake2b_compress( S, S->buf );
397    S->buflen -= BLAKE2B_BLOCKBYTES;
398    memcpy( S->buf, S->buf + BLAKE2B_BLOCKBYTES, S->buflen );
399  }
400
401  blake2b_increment_counter( S, S->buflen );
402  blake2b_set_lastblock( S );
403  memset( S->buf + S->buflen, 0, 2 * BLAKE2B_BLOCKBYTES - S->buflen ); /* Padding */
404  blake2b_compress( S, S->buf );
405  memcpy( out, &S->h[0], outlen );
406  return 0;
407}
408
409
410int blake2b( uint8_t *out, const void *in, const void *key, size_t outlen, size_t inlen, size_t keylen )
411{
412  blake2b_state S[1];
413
414  /* Verify parameters */
415  if ( NULL == in && inlen > 0 ) return -1;
416
417  if ( NULL == out ) return -1;
418
419  if( NULL == key && keylen > 0 ) return -1;
420
421  if( !outlen || outlen > BLAKE2B_OUTBYTES ) return -1;
422
423  if( keylen > BLAKE2B_KEYBYTES ) return -1;
424
425  if( keylen )
426  {
427    if( blake2b_init_key( S, outlen, key, keylen ) < 0 ) return -1;
428  }
429  else
430  {
431    if( blake2b_init( S, outlen ) < 0 ) return -1;
432  }
433
434  if( blake2b_update( S, ( uint8_t * )in, inlen ) < 0) return -1;
435  return blake2b_final( S, out, outlen );
436}
437
438#if defined(SUPERCOP)
439int crypto_hash( unsigned char *out, unsigned char *in, unsigned long long inlen )
440{
441  return blake2b( out, in, NULL, BLAKE2B_OUTBYTES, inlen, 0 );
442}
443#endif
444