1 2#include <stdint.h> 3#include <string.h> 4 5#include "blake2.h" 6#include "private/common.h" 7#include "private/sse2_64_32.h" 8 9#if defined(HAVE_EMMINTRIN_H) && defined(HAVE_TMMINTRIN_H) 10 11# ifdef __GNUC__ 12# pragma GCC target("sse2") 13# pragma GCC target("ssse3") 14# endif 15 16# include <emmintrin.h> 17# include <tmmintrin.h> 18 19# include "blake2b-compress-ssse3.h" 20 21CRYPTO_ALIGN(64) 22static const uint64_t blake2b_IV[8] = { 23 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, 0x3c6ef372fe94f82bULL, 24 0xa54ff53a5f1d36f1ULL, 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, 25 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL 26}; 27 28int 29blake2b_compress_ssse3(blake2b_state *S, 30 const uint8_t block[BLAKE2B_BLOCKBYTES]) 31{ 32 __m128i row1l, row1h; 33 __m128i row2l, row2h; 34 __m128i row3l, row3h; 35 __m128i row4l, row4h; 36 __m128i b0, b1; 37 __m128i t0, t1; 38 const __m128i r16 = 39 _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9); 40 const __m128i r24 = 41 _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10); 42 const uint64_t m0 = ((uint64_t *) block)[0]; 43 const uint64_t m1 = ((uint64_t *) block)[1]; 44 const uint64_t m2 = ((uint64_t *) block)[2]; 45 const uint64_t m3 = ((uint64_t *) block)[3]; 46 const uint64_t m4 = ((uint64_t *) block)[4]; 47 const uint64_t m5 = ((uint64_t *) block)[5]; 48 const uint64_t m6 = ((uint64_t *) block)[6]; 49 const uint64_t m7 = ((uint64_t *) block)[7]; 50 const uint64_t m8 = ((uint64_t *) block)[8]; 51 const uint64_t m9 = ((uint64_t *) block)[9]; 52 const uint64_t m10 = ((uint64_t *) block)[10]; 53 const uint64_t m11 = ((uint64_t *) block)[11]; 54 const uint64_t m12 = ((uint64_t *) block)[12]; 55 const uint64_t m13 = ((uint64_t *) block)[13]; 56 const uint64_t m14 = ((uint64_t *) block)[14]; 57 const uint64_t m15 = ((uint64_t *) block)[15]; 58 59 row1l = LOADU(&S->h[0]); 60 row1h = LOADU(&S->h[2]); 61 row2l = LOADU(&S->h[4]); 62 row2h = LOADU(&S->h[6]); 63 row3l = LOADU(&blake2b_IV[0]); 64 row3h = LOADU(&blake2b_IV[2]); 65 row4l = _mm_xor_si128(LOADU(&blake2b_IV[4]), LOADU(&S->t[0])); 66 row4h = _mm_xor_si128(LOADU(&blake2b_IV[6]), LOADU(&S->f[0])); 67 ROUND(0); 68 ROUND(1); 69 ROUND(2); 70 ROUND(3); 71 ROUND(4); 72 ROUND(5); 73 ROUND(6); 74 ROUND(7); 75 ROUND(8); 76 ROUND(9); 77 ROUND(10); 78 ROUND(11); 79 row1l = _mm_xor_si128(row3l, row1l); 80 row1h = _mm_xor_si128(row3h, row1h); 81 STOREU(&S->h[0], _mm_xor_si128(LOADU(&S->h[0]), row1l)); 82 STOREU(&S->h[2], _mm_xor_si128(LOADU(&S->h[2]), row1h)); 83 row2l = _mm_xor_si128(row4l, row2l); 84 row2h = _mm_xor_si128(row4h, row2h); 85 STOREU(&S->h[4], _mm_xor_si128(LOADU(&S->h[4]), row2l)); 86 STOREU(&S->h[6], _mm_xor_si128(LOADU(&S->h[6]), row2h)); 87 return 0; 88} 89 90#endif 91