1
2#include <stdint.h>
3#include <string.h>
4
5#include "blake2.h"
6#include "private/common.h"
7#include "private/sse2_64_32.h"
8
9#if defined(HAVE_EMMINTRIN_H) && defined(HAVE_TMMINTRIN_H)
10
11# ifdef __GNUC__
12#  pragma GCC target("sse2")
13#  pragma GCC target("ssse3")
14# endif
15
16# include <emmintrin.h>
17# include <tmmintrin.h>
18
19# include "blake2b-compress-ssse3.h"
20
21CRYPTO_ALIGN(64)
22static const uint64_t blake2b_IV[8] = {
23    0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, 0x3c6ef372fe94f82bULL,
24    0xa54ff53a5f1d36f1ULL, 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
25    0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
26};
27
28int
29blake2b_compress_ssse3(blake2b_state *S,
30                       const uint8_t  block[BLAKE2B_BLOCKBYTES])
31{
32    __m128i       row1l, row1h;
33    __m128i       row2l, row2h;
34    __m128i       row3l, row3h;
35    __m128i       row4l, row4h;
36    __m128i       b0, b1;
37    __m128i       t0, t1;
38    const __m128i r16 =
39        _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
40    const __m128i r24 =
41        _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);
42    const uint64_t m0  = ((uint64_t *) block)[0];
43    const uint64_t m1  = ((uint64_t *) block)[1];
44    const uint64_t m2  = ((uint64_t *) block)[2];
45    const uint64_t m3  = ((uint64_t *) block)[3];
46    const uint64_t m4  = ((uint64_t *) block)[4];
47    const uint64_t m5  = ((uint64_t *) block)[5];
48    const uint64_t m6  = ((uint64_t *) block)[6];
49    const uint64_t m7  = ((uint64_t *) block)[7];
50    const uint64_t m8  = ((uint64_t *) block)[8];
51    const uint64_t m9  = ((uint64_t *) block)[9];
52    const uint64_t m10 = ((uint64_t *) block)[10];
53    const uint64_t m11 = ((uint64_t *) block)[11];
54    const uint64_t m12 = ((uint64_t *) block)[12];
55    const uint64_t m13 = ((uint64_t *) block)[13];
56    const uint64_t m14 = ((uint64_t *) block)[14];
57    const uint64_t m15 = ((uint64_t *) block)[15];
58
59    row1l = LOADU(&S->h[0]);
60    row1h = LOADU(&S->h[2]);
61    row2l = LOADU(&S->h[4]);
62    row2h = LOADU(&S->h[6]);
63    row3l = LOADU(&blake2b_IV[0]);
64    row3h = LOADU(&blake2b_IV[2]);
65    row4l = _mm_xor_si128(LOADU(&blake2b_IV[4]), LOADU(&S->t[0]));
66    row4h = _mm_xor_si128(LOADU(&blake2b_IV[6]), LOADU(&S->f[0]));
67    ROUND(0);
68    ROUND(1);
69    ROUND(2);
70    ROUND(3);
71    ROUND(4);
72    ROUND(5);
73    ROUND(6);
74    ROUND(7);
75    ROUND(8);
76    ROUND(9);
77    ROUND(10);
78    ROUND(11);
79    row1l = _mm_xor_si128(row3l, row1l);
80    row1h = _mm_xor_si128(row3h, row1h);
81    STOREU(&S->h[0], _mm_xor_si128(LOADU(&S->h[0]), row1l));
82    STOREU(&S->h[2], _mm_xor_si128(LOADU(&S->h[2]), row1h));
83    row2l = _mm_xor_si128(row4l, row2l);
84    row2h = _mm_xor_si128(row4h, row2h);
85    STOREU(&S->h[4], _mm_xor_si128(LOADU(&S->h[4]), row2l));
86    STOREU(&S->h[6], _mm_xor_si128(LOADU(&S->h[6]), row2h));
87    return 0;
88}
89
90#endif
91