1#ifdef IN_SANDY2X
2
3/*
4   This file is adapted from amd64-51/fe25519_square.s:
5   Adding loop to perform n squares.
6*/
7#include "fe51_namespace.h"
8#include "consts_namespace.h"
9.p2align 5
10
11#ifdef ASM_HIDE_SYMBOL
12ASM_HIDE_SYMBOL fe51_nsquare
13ASM_HIDE_SYMBOL _fe51_nsquare
14#endif
15.globl fe51_nsquare
16.globl _fe51_nsquare
17#ifdef __ELF__
18.type  fe51_nsquare, @function
19.type _fe51_nsquare, @function
20#endif
/*
 * void fe51_nsquare(fe51 *r, const fe51 *x, long long n)
 *
 * ABI:  System V AMD64.
 *   rdi = r : output field element, 5 x 64-bit limbs in radix 2^51
 *   rsi = x : input field element (same layout)
 *   rdx = n : number of squarings to perform
 *
 * Computes r = x^(2^n) in GF(2^255-19) by squaring n times in place.
 * NOTE(review): the loop is do-while shaped (decrement, body, jne), so it
 * assumes n >= 1 — n == 0 would wrap the counter; callers must ensure this.
 *
 * Limb invariants visible below: limbs are masked with REDMASK51 = 2^51-1,
 * inter-limb carries are extracted with shld $13 / shr $51, and products
 * that overflow limb 4 wrap into limb 0 scaled by 19 (2^255 = 19 mod p),
 * hence the imulq $19 / $38 (= 2*19) factors.
 */
fe51_nsquare:
_fe51_nsquare:

/* Prologue: reserve an aligned scratch area of at least 64 bytes and save
 * all callee-saved registers (r12-r15, rbx, rbp).  r11 records the exact
 * rsp adjustment so the epilogue can undo it. */
mov %rsp,%r11
and $31,%r11
add $64,%r11
sub %r11,%rsp
movq %r11,0(%rsp)                 /* saved stack adjustment                */
movq %r12,8(%rsp)
movq %r13,16(%rsp)
movq %r14,24(%rsp)
movq %r15,32(%rsp)
movq %rbx,40(%rsp)
movq %rbp,48(%rsp)

/* Load x.  Across the loop the working element lives as:
 *   h0 = rcx, h1 = r8 (kept in registers),
 *   h2 = 16(%rdi), h3 = 24(%rdi), h4 = 32(%rdi) (kept in the output).
 * Limbs 2..4 are spilled to r now; limbs 0..1 are stored after the loop. */
movq   0(%rsi),%rcx
movq   8(%rsi),%r8
movq   16(%rsi),%r9
movq   24(%rsi),%rax
movq   32(%rsi),%rsi
movq   %r9,16(%rdi)
movq   %rax,24(%rdi)
movq   %rsi,32(%rdi)
mov  %rdx,%rsi                    /* rsi = remaining squaring count        */

.p2align 4
._loop:
sub  $1,%rsi
/* ---- Phase 1: schoolbook squaring into five 128-bit accumulators.
 * Result limb k = sum of h_i*h_j with i+j = k, doubled cross terms,
 * and i+j >= 5 terms folded back with a factor of 19.  Accumulators:
 *   limb0 = r9:r10   limb1 = r11:r12   limb2 = r13:r14
 *   limb3 = r15:rbx  limb4 = rcx:rbp                                    */
mov  %rcx,%rax
mul  %rcx                         /* h0*h0            -> limb0             */
add  %rcx,%rcx                    /* rcx = 2*h0 for the cross terms        */
mov  %rax,%r9
mov  %rdx,%r10
mov  %rcx,%rax
mul  %r8                          /* 2*h0*h1          -> limb1             */
mov  %rax,%r11
mov  %rdx,%r12
mov  %rcx,%rax
mulq  16(%rdi)                    /* 2*h0*h2          -> limb2             */
mov  %rax,%r13
mov  %rdx,%r14
mov  %rcx,%rax
mulq  24(%rdi)                    /* 2*h0*h3          -> limb3             */
mov  %rax,%r15
mov  %rdx,%rbx
mov  %rcx,%rax
mulq  32(%rdi)                    /* 2*h0*h4          -> limb4 (rcx reused) */
mov  %rax,%rcx
mov  %rdx,%rbp
mov  %r8,%rax
mul  %r8                          /* h1*h1            -> limb2             */
add  %r8,%r8                      /* r8 = 2*h1                             */
add  %rax,%r13
adc %rdx,%r14
mov  %r8,%rax
mulq  16(%rdi)                    /* 2*h1*h2          -> limb3             */
add  %rax,%r15
adc %rdx,%rbx
mov  %r8,%rax
imulq  $19, %r8,%r8               /* r8 = 38*h1 (rax still holds 2*h1)     */
mulq  24(%rdi)                    /* 2*h1*h3          -> limb4             */
add  %rax,%rcx
adc %rdx,%rbp
mov  %r8,%rax
mulq  32(%rdi)                    /* 38*h1*h4 (wraps) -> limb0             */
add  %rax,%r9
adc %rdx,%r10
movq   16(%rdi),%rax
mulq  16(%rdi)                    /* h2*h2            -> limb4             */
add  %rax,%rcx
adc %rdx,%rbp
shld $13,%rcx,%rbp                /* rbp = limb4 >> 51 (carry out)         */
movq   16(%rdi),%rax
imulq  $38, %rax,%rax
mulq  24(%rdi)                    /* 38*h2*h3 (wraps) -> limb0             */
add  %rax,%r9
adc %rdx,%r10
shld $13,%r9,%r10                 /* r10 = limb0 >> 51                     */
movq   16(%rdi),%rax
imulq  $38, %rax,%rax
mulq  32(%rdi)                    /* 38*h2*h4 (wraps) -> limb1             */
add  %rax,%r11
adc %rdx,%r12
movq   24(%rdi),%rax
imulq  $19, %rax,%rax
mulq  24(%rdi)                    /* 19*h3*h3 (wraps) -> limb1             */
add  %rax,%r11
adc %rdx,%r12
shld $13,%r11,%r12                /* r12 = limb1 >> 51                     */
movq   24(%rdi),%rax
imulq  $38, %rax,%rax
mulq  32(%rdi)                    /* 38*h3*h4 (wraps) -> limb2             */
add  %rax,%r13
adc %rdx,%r14
shld $13,%r13,%r14                /* r14 = limb2 >> 51                     */
movq   32(%rdi),%rax
imulq  $19, %rax,%rax
mulq  32(%rdi)                    /* 19*h4*h4 (wraps) -> limb3             */
add  %rax,%r15
adc %rdx,%rbx
shld $13,%r15,%rbx                /* rbx = limb3 >> 51                     */
/* ---- Phase 2: mask each limb to 51 bits and add the carry from the
 * limb below; the limb-4 carry (rbp) re-enters limb 0 multiplied by 19. */
movq REDMASK51(%rip),%rdx         /* rdx = 2^51 - 1 for the rest of loop   */
and  %rdx,%rcx
add  %rbx,%rcx                    /* limb4 += carry(limb3)                 */
and  %rdx,%r9
and  %rdx,%r11
add  %r10,%r11                    /* limb1 += carry(limb0)                 */
and  %rdx,%r13
add  %r12,%r13                    /* limb2 += carry(limb1)                 */
and  %rdx,%r15
add  %r14,%r15                    /* limb3 += carry(limb2)                 */
imulq  $19, %rbp,%rbp
lea  (%r9,%rbp),%r9               /* limb0 += 19*carry(limb4)              */
/* Second carry sweep: propagate the single-bit-wide carries limb to limb,
 * storing limbs 2..4 back to r and keeping limbs 0..1 in rax/r8.         */
mov  %r9,%rax
shr  $51,%r9
add  %r11,%r9
and  %rdx,%rax                    /* rax = final limb0                     */
mov  %r9,%r8
shr  $51,%r9
add  %r13,%r9
and  %rdx,%r8                     /* r8  = final limb1 (h1 next iteration) */
mov  %r9,%r10
shr  $51,%r9
add  %r15,%r9
and  %rdx,%r10
movq   %r10,16(%rdi)              /* final limb2                           */
mov  %r9,%r10
shr  $51,%r9
add  %rcx,%r9
and  %rdx,%r10
movq   %r10,24(%rdi)              /* final limb3                           */
mov  %r9,%r10
shr  $51,%r9
imulq  $19, %r9,%r9               /* wrap the very top carry: *19 -> limb0 */
lea  (%rax,%r9),%rcx              /* rcx = final limb0 (h0 next iteration) */
and  %rdx,%r10
movq   %r10,32(%rdi)              /* final limb4                           */
cmp  $0,%rsi
jne ._loop

/* Store the two register-resident limbs, restore callee-saved registers,
 * and undo the prologue's stack adjustment. */
movq   %rcx,0(%rdi)
movq   %r8,8(%rdi)
movq 0(%rsp),%r11
movq 8(%rsp),%r12
movq 16(%rsp),%r13
movq 24(%rsp),%r14
movq 32(%rsp),%r15
movq 40(%rsp),%rbx
movq 48(%rsp),%rbp
add %r11,%rsp
ret
171
172#endif
173