x86_64-mont5.pl revision 298999
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# August 2011.
11#
12# Companion to x86_64-mont.pl that optimizes cache-timing attack
13# countermeasures. The subroutines are produced by replacing bp[i]
14# references in their x86_64-mont.pl counterparts with cache-neutral
15# references to the powers table computed in BN_mod_exp_mont_consttime.
16# In addition, a subroutine that scatters elements of the powers table
17# is implemented, so that scatter/gather parameters can be tuned
18# without modifying bn_exp.c.
19
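# The gist of the countermeasure, as an illustrative Perl reference model
# (hypothetical helper, not used by the code generation in this file): rather
# than loading table entry idx directly, every entry is read and masked, so
# the memory access pattern does not depend on the secret index.  The
# assembly below does the same thing 128 bits at a time with pcmpeqd/pand/por.
sub _gather_ref_model {
	my ($tbl, $idx) = @_;		# $tbl: reference to 2**5 limb values
	my $r = 0;
	for my $k (0 .. 2**5-1) {
		my $mask = ($k == $idx) ? ~0 : 0;	# all-ones only for the wanted slot
		$r |= $tbl->[$k] & $mask;		# every slot is touched regardless of $idx
	}
	return $r;
}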
20$flavour = shift;
21$output  = shift;
22if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
23
24$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
25
26$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
27( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
28( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
29die "can't locate x86_64-xlate.pl";
30
31open OUT,"| \"$^X\" $xlate $flavour $output";
32*STDOUT=*OUT;
33
34# int bn_mul_mont_gather5(
35$rp="%rdi";	# BN_ULONG *rp,
36$ap="%rsi";	# const BN_ULONG *ap,
37$bp="%rdx";	# const BN_ULONG *bp,
38$np="%rcx";	# const BN_ULONG *np,
39$n0="%r8";	# const BN_ULONG *n0,
40$num="%r9";	# int num,
41		# int idx);	# 0 to 2^5-1, "index" in $bp holding
42				# pre-computed powers of a', interleaved
43				# in such a manner that b[0] is $bp[idx],
44				# b[1] is $bp[2^5+idx], etc.
45$lo0="%r10";
46$hi0="%r11";
47$hi1="%r13";
48$i="%r14";
49$j="%r15";
50$m0="%rbx";
51$m1="%rbp";
52
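# Where the "index" argument points: the powers table is interleaved so that
# limb j of power idx sits at $bp[2**5*j + idx] (see bn_scatter5 below).  A
# hypothetical helper expressing that address arithmetic (illustrative only,
# not used by the generator):
sub _power_limb_slot {
	my ($j, $idx) = @_;	# limb number within a power, power index 0..2**5-1
	return 2**5*$j + $idx;
}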
53$code=<<___;
54.text
55
56.globl	bn_mul_mont_gather5
57.type	bn_mul_mont_gather5,\@function,6
58.align	64
59bn_mul_mont_gather5:
60	test	\$3,${num}d
61	jnz	.Lmul_enter
62	cmp	\$8,${num}d
63	jb	.Lmul_enter
64	jmp	.Lmul4x_enter
65
66.align	16
67.Lmul_enter:
68	mov	${num}d,${num}d
69	movd	`($win64?56:8)`(%rsp),%xmm5	# load 7th argument
70	lea	.Linc(%rip),%r10
71	push	%rbx
72	push	%rbp
73	push	%r12
74	push	%r13
75	push	%r14
76	push	%r15
77
78.Lmul_alloca:
79	mov	%rsp,%rax
80	lea	2($num),%r11
81	neg	%r11
82	lea	-264(%rsp,%r11,8),%rsp	# tp=alloca(8*(num+2)+256+8)
83	and	\$-1024,%rsp		# minimize TLB usage
84
85	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
86.Lmul_body:
87	# Some OSes, *cough*-dows, insist on the stack being "wired" to
88	# physical memory in a strictly sequential manner, i.e. if a stack
89	# allocation spans two pages, then a reference to the farther one
90	# can be punished with SEGV. But page walking does good even on
91	# other OSes, because it guarantees that a villain thread hits
92	# the guard page before it can do damage to an innocent one...
93	sub	%rsp,%rax
94	and	\$-4096,%rax
95.Lmul_page_walk:
96	mov	(%rsp,%rax),%r11
97	sub	\$4096,%rax
98	.byte	0x2e			# predict non-taken
99	jnc	.Lmul_page_walk
100
101	lea	128($bp),%r12		# reassign $bp (+size optimization)
102___
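# A rough Perl model of the page walk above (hypothetical helper, not used by
# the generator): probe the freshly allocated region one 4096-byte page at a
# time, from the highest offset down to zero, so that any guard page is
# faulted in sequentially.
sub _page_walk_model {
	my ($stack, $bytes) = @_;	# $stack: array ref standing in for memory
	my $probe;
	for (my $off = $bytes & ~4095; $off >= 0; $off -= 4096) {
		$probe = $stack->[$off];	# corresponds to "mov (%rsp,%rax),%r11"
	}
	return $probe;
}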
103		$bp="%r12";
104		$STRIDE=2**5*8;		# 5 is "window size"
105		$N=$STRIDE/4;		# should match cache line size
106$code.=<<___;
107	movdqa	0(%r10),%xmm0		# 00000001000000010000000000000000
108	movdqa	16(%r10),%xmm1		# 00000002000000020000000200000002
109	lea	24-112(%rsp,$num,8),%r10# place the mask after tp[num+3] (+ICache optimization)
110	and	\$-16,%r10
111
112	pshufd	\$0,%xmm5,%xmm5		# broadcast index
113	movdqa	%xmm1,%xmm4
114	movdqa	%xmm1,%xmm2
115___
116########################################################################
117# calculate mask by comparing 0..31 to index and save result to stack
118#
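# Illustrative Perl model of the mask table that the pcmpeqd sequence below
# writes to the stack (hypothetical helper, not used by the generator): one
# 64-bit mask per table slot, all-ones only where the slot number equals the
# requested index; the masks are stored below as sixteen 16-byte words.
sub _mask_table_model {
	my ($idx) = @_;
	my @mask = map { $_ == $idx ? ~0 : 0 } 0 .. 2**5-1;
	return \@mask;
}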
119$code.=<<___;
120	paddd	%xmm0,%xmm1
121	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
122	.byte	0x67
123	movdqa	%xmm4,%xmm3
124___
125for($k=0;$k<$STRIDE/16-4;$k+=4) {
126$code.=<<___;
127	paddd	%xmm1,%xmm2
128	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
129	movdqa	%xmm0,`16*($k+0)+112`(%r10)
130	movdqa	%xmm4,%xmm0
131
132	paddd	%xmm2,%xmm3
133	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
134	movdqa	%xmm1,`16*($k+1)+112`(%r10)
135	movdqa	%xmm4,%xmm1
136
137	paddd	%xmm3,%xmm0
138	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
139	movdqa	%xmm2,`16*($k+2)+112`(%r10)
140	movdqa	%xmm4,%xmm2
141
142	paddd	%xmm0,%xmm1
143	pcmpeqd	%xmm5,%xmm0
144	movdqa	%xmm3,`16*($k+3)+112`(%r10)
145	movdqa	%xmm4,%xmm3
146___
147}
148$code.=<<___;				# last iteration can be optimized
149	paddd	%xmm1,%xmm2
150	pcmpeqd	%xmm5,%xmm1
151	movdqa	%xmm0,`16*($k+0)+112`(%r10)
152
153	paddd	%xmm2,%xmm3
154	.byte	0x67
155	pcmpeqd	%xmm5,%xmm2
156	movdqa	%xmm1,`16*($k+1)+112`(%r10)
157
158	pcmpeqd	%xmm5,%xmm3
159	movdqa	%xmm2,`16*($k+2)+112`(%r10)
160	pand	`16*($k+0)-128`($bp),%xmm0	# while it's still in register
161
162	pand	`16*($k+1)-128`($bp),%xmm1
163	pand	`16*($k+2)-128`($bp),%xmm2
164	movdqa	%xmm3,`16*($k+3)+112`(%r10)
165	pand	`16*($k+3)-128`($bp),%xmm3
166	por	%xmm2,%xmm0
167	por	%xmm3,%xmm1
168___
169for($k=0;$k<$STRIDE/16-4;$k+=4) {
170$code.=<<___;
171	movdqa	`16*($k+0)-128`($bp),%xmm4
172	movdqa	`16*($k+1)-128`($bp),%xmm5
173	movdqa	`16*($k+2)-128`($bp),%xmm2
174	pand	`16*($k+0)+112`(%r10),%xmm4
175	movdqa	`16*($k+3)-128`($bp),%xmm3
176	pand	`16*($k+1)+112`(%r10),%xmm5
177	por	%xmm4,%xmm0
178	pand	`16*($k+2)+112`(%r10),%xmm2
179	por	%xmm5,%xmm1
180	pand	`16*($k+3)+112`(%r10),%xmm3
181	por	%xmm2,%xmm0
182	por	%xmm3,%xmm1
183___
184}
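# After the pand/por accumulation above and just below, at most one 64-bit
# lane of %xmm0 can be non-zero, so OR-ing the two lanes (the pshufd 0x4e
# followed by por) recovers the selected limb.  Trivial Perl restatement
# (hypothetical, not used by the generator):
sub _fold_lanes_model {
	my ($lo, $hi) = @_;	# the two 64-bit lanes of the accumulator
	return $lo | $hi;	# equals the gathered bp[idx] limb
}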
185$code.=<<___;
186	por	%xmm1,%xmm0
187	pshufd	\$0x4e,%xmm0,%xmm1
188	por	%xmm1,%xmm0
189	lea	$STRIDE($bp),$bp
190	movq	%xmm0,$m0		# m0=bp[0]
191
192	mov	($n0),$n0		# pull n0[0] value
193	mov	($ap),%rax
194
195	xor	$i,$i			# i=0
196	xor	$j,$j			# j=0
197
198	mov	$n0,$m1
199	mulq	$m0			# ap[0]*bp[0]
200	mov	%rax,$lo0
201	mov	($np),%rax
202
203	imulq	$lo0,$m1		# "tp[0]"*n0
204	mov	%rdx,$hi0
205
206	mulq	$m1			# np[0]*m1
207	add	%rax,$lo0		# discarded
208	mov	8($ap),%rax
209	adc	\$0,%rdx
210	mov	%rdx,$hi1
211
212	lea	1($j),$j		# j++
213	jmp	.L1st_enter
214
215.align	16
216.L1st:
217	add	%rax,$hi1
218	mov	($ap,$j,8),%rax
219	adc	\$0,%rdx
220	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
221	mov	$lo0,$hi0
222	adc	\$0,%rdx
223	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
224	mov	%rdx,$hi1
225
226.L1st_enter:
227	mulq	$m0			# ap[j]*bp[0]
228	add	%rax,$hi0
229	mov	($np,$j,8),%rax
230	adc	\$0,%rdx
231	lea	1($j),$j		# j++
232	mov	%rdx,$lo0
233
234	mulq	$m1			# np[j]*m1
235	cmp	$num,$j
236	jne	.L1st
237
238	add	%rax,$hi1
239	mov	($ap),%rax		# ap[0]
240	adc	\$0,%rdx
241	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
242	adc	\$0,%rdx
243	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
244	mov	%rdx,$hi1
245	mov	$lo0,$hi0
246
247	xor	%rdx,%rdx
248	add	$hi0,$hi1
249	adc	\$0,%rdx
250	mov	$hi1,-8(%rsp,$num,8)
251	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
252
253	lea	1($i),$i		# i++
254	jmp	.Louter
255.align	16
256.Louter:
257	lea	24+128(%rsp,$num,8),%rdx	# where 256-byte mask is (+size optimization)
258	and	\$-16,%rdx
259	pxor	%xmm4,%xmm4
260	pxor	%xmm5,%xmm5
261___
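# For reference, the arithmetic implemented by the first pass above and the
# .Louter iterations that follow is ordinary Montgomery multiplication.  A
# whole-operand Perl model (hypothetical helper, not used by the generator;
# Math::BigInt is used only so the model stays exact):
sub _mont_mul_model {
	require Math::BigInt;
	my ($x, $y, $n, $nlimbs) = @_;		# Math::BigInt values, $n odd, $nlimbs 64-bit limbs
	my $R  = Math::BigInt->new(1)->blsft(64*$nlimbs);	# R = 2^(64*nlimbs)
	my $t  = $x->copy->bmul($y);
	my $ni = $n->copy->bmodinv($R)->bneg->bmod($R);		# -n^-1 mod R ($n0 is its low limb)
	my $m  = $t->copy->bmul($ni)->bmod($R);
	my $u  = $t->badd($m->bmul($n))->brsft(64*$nlimbs);	# t + m*n is divisible by R
	$u->bsub($n) if $u->bcmp($n) >= 0;			# same role as the .Lsub/.Lcopy tail
	return $u;						# x*y*R^-1 mod n
}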
262for($k=0;$k<$STRIDE/16;$k+=4) {
263$code.=<<___;
264	movdqa	`16*($k+0)-128`($bp),%xmm0
265	movdqa	`16*($k+1)-128`($bp),%xmm1
266	movdqa	`16*($k+2)-128`($bp),%xmm2
267	movdqa	`16*($k+3)-128`($bp),%xmm3
268	pand	`16*($k+0)-128`(%rdx),%xmm0
269	pand	`16*($k+1)-128`(%rdx),%xmm1
270	por	%xmm0,%xmm4
271	pand	`16*($k+2)-128`(%rdx),%xmm2
272	por	%xmm1,%xmm5
273	pand	`16*($k+3)-128`(%rdx),%xmm3
274	por	%xmm2,%xmm4
275	por	%xmm3,%xmm5
276___
277}
278$code.=<<___;
279	por	%xmm5,%xmm4
280	pshufd	\$0x4e,%xmm4,%xmm0
281	por	%xmm4,%xmm0
282	lea	$STRIDE($bp),$bp
283	movq	%xmm0,$m0		# m0=bp[i]
284
285	xor	$j,$j			# j=0
286	mov	$n0,$m1
287	mov	(%rsp),$lo0
288
289	mulq	$m0			# ap[0]*bp[i]
290	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
291	mov	($np),%rax
292	adc	\$0,%rdx
293
294	imulq	$lo0,$m1		# tp[0]*n0
295	mov	%rdx,$hi0
296
297	mulq	$m1			# np[0]*m1
298	add	%rax,$lo0		# discarded
299	mov	8($ap),%rax
300	adc	\$0,%rdx
301	mov	8(%rsp),$lo0		# tp[1]
302	mov	%rdx,$hi1
303
304	lea	1($j),$j		# j++
305	jmp	.Linner_enter
306
307.align	16
308.Linner:
309	add	%rax,$hi1
310	mov	($ap,$j,8),%rax
311	adc	\$0,%rdx
312	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
313	mov	(%rsp,$j,8),$lo0
314	adc	\$0,%rdx
315	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
316	mov	%rdx,$hi1
317
318.Linner_enter:
319	mulq	$m0			# ap[j]*bp[i]
320	add	%rax,$hi0
321	mov	($np,$j,8),%rax
322	adc	\$0,%rdx
323	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
324	mov	%rdx,$hi0
325	adc	\$0,$hi0
326	lea	1($j),$j		# j++
327
328	mulq	$m1			# np[j]*m1
329	cmp	$num,$j
330	jne	.Linner
331
332	add	%rax,$hi1
333	mov	($ap),%rax		# ap[0]
334	adc	\$0,%rdx
335	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
336	mov	(%rsp,$j,8),$lo0
337	adc	\$0,%rdx
338	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
339	mov	%rdx,$hi1
340
341	xor	%rdx,%rdx
342	add	$hi0,$hi1
343	adc	\$0,%rdx
344	add	$lo0,$hi1		# pull upmost overflow bit
345	adc	\$0,%rdx
346	mov	$hi1,-8(%rsp,$num,8)
347	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
348
349	lea	1($i),$i		# i++
350	cmp	$num,$i
351	jl	.Louter
352
353	xor	$i,$i			# i=0 and clear CF!
354	mov	(%rsp),%rax		# tp[0]
355	lea	(%rsp),$ap		# borrow ap for tp
356	mov	$num,$j			# j=num
357	jmp	.Lsub
358.align	16
359.Lsub:	sbb	($np,$i,8),%rax
360	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
361	mov	8($ap,$i,8),%rax	# tp[i+1]
362	lea	1($i),$i		# i++
363	dec	$j			# doesn't affect CF!
364	jnz	.Lsub
365
366	sbb	\$0,%rax		# handle upmost overflow bit
367	xor	$i,$i
368	and	%rax,$ap
369	not	%rax
370	mov	$rp,$np
371	and	%rax,$np
372	mov	$num,$j			# j=num
373	or	$np,$ap			# ap=borrow?tp:rp
374.align	16
375.Lcopy:					# copy or in-place refresh
376	mov	($ap,$i,8),%rax
377	mov	$i,(%rsp,$i,8)		# zap temporary vector
378	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
379	lea	1($i),$i
380	sub	\$1,$j
381	jnz	.Lcopy
382
383	mov	8(%rsp,$num,8),%rsi	# restore %rsp
384	mov	\$1,%rax
385
386	mov	(%rsi),%r15
387	mov	8(%rsi),%r14
388	mov	16(%rsi),%r13
389	mov	24(%rsi),%r12
390	mov	32(%rsi),%rbp
391	mov	40(%rsi),%rbx
392	lea	48(%rsi),%rsp
393.Lmul_epilogue:
394	ret
395.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
396___
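# The .Lsub/.Lcopy tail above reduces the result without a data-dependent
# branch: the modulus is subtracted unconditionally into rp[], and the final
# borrow then selects, via the and/not/or sequence on the two pointers,
# whether tp[] or the difference gets copied out.  Illustrative Perl model of
# that select (hypothetical, not used by the generator):
sub _borrow_select_model {
	my ($tp_lt_n, $tp, $diff) = @_;	# borrow flag plus two array refs of equal length
	my $mask = $tp_lt_n ? ~0 : 0;	# all-ones means "keep the unreduced tp[]"
	return [ map { ($tp->[$_] & $mask) | ($diff->[$_] & ~$mask) } 0 .. $#$tp ];
}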
397{{{
398my @A=("%r10","%r11");
399my @N=("%r13","%rdi");
400$code.=<<___;
401.type	bn_mul4x_mont_gather5,\@function,6
402.align	16
403bn_mul4x_mont_gather5:
404.Lmul4x_enter:
405	mov	${num}d,${num}d
406	movd	`($win64?56:8)`(%rsp),%xmm5	# load 7th argument
407	lea	.Linc(%rip),%r10
408	push	%rbx
409	push	%rbp
410	push	%r12
411	push	%r13
412	push	%r14
413	push	%r15
414
415.Lmul4x_alloca:
416	mov	%rsp,%rax
417	lea	4($num),%r11
418	neg	%r11
419	lea	-256(%rsp,%r11,8),%rsp	# tp=alloca(8*(num+4)+256)
420	and	\$-1024,%rsp		# minimize TLB usage
421
422	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
423.Lmul4x_body:
424	sub	%rsp,%rax
425	and	\$-4096,%rax
426.Lmul4x_page_walk:
427	mov	(%rsp,%rax),%r11
428	sub	\$4096,%rax
429	.byte	0x2e			# predict non-taken
430	jnc	.Lmul4x_page_walk
431
432	mov	$rp,16(%rsp,$num,8)	# tp[num+2]=$rp
433	lea	128(%rdx),%r12		# reassign $bp (+size optimization)
434___
435		$bp="%r12";
436		$STRIDE=2**5*8;		# 5 is "window size"
437		$N=$STRIDE/4;		# should match cache line size
438$code.=<<___;
439	movdqa	0(%r10),%xmm0		# 00000001000000010000000000000000
440	movdqa	16(%r10),%xmm1		# 00000002000000020000000200000002
441	lea	32-112(%rsp,$num,8),%r10# place the mask after tp[num+4] (+ICache optimization)
442
443	pshufd	\$0,%xmm5,%xmm5		# broadcast index
444	movdqa	%xmm1,%xmm4
445	.byte	0x67,0x67
446	movdqa	%xmm1,%xmm2
447___
448########################################################################
449# calculate mask by comparing 0..31 to index and save result to stack
450#
451$code.=<<___;
452	paddd	%xmm0,%xmm1
453	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
454	.byte	0x67
455	movdqa	%xmm4,%xmm3
456___
457for($k=0;$k<$STRIDE/16-4;$k+=4) {
458$code.=<<___;
459	paddd	%xmm1,%xmm2
460	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
461	movdqa	%xmm0,`16*($k+0)+112`(%r10)
462	movdqa	%xmm4,%xmm0
463
464	paddd	%xmm2,%xmm3
465	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
466	movdqa	%xmm1,`16*($k+1)+112`(%r10)
467	movdqa	%xmm4,%xmm1
468
469	paddd	%xmm3,%xmm0
470	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
471	movdqa	%xmm2,`16*($k+2)+112`(%r10)
472	movdqa	%xmm4,%xmm2
473
474	paddd	%xmm0,%xmm1
475	pcmpeqd	%xmm5,%xmm0
476	movdqa	%xmm3,`16*($k+3)+112`(%r10)
477	movdqa	%xmm4,%xmm3
478___
479}
480$code.=<<___;				# last iteration can be optimized
481	paddd	%xmm1,%xmm2
482	pcmpeqd	%xmm5,%xmm1
483	movdqa	%xmm0,`16*($k+0)+112`(%r10)
484
485	paddd	%xmm2,%xmm3
486	.byte	0x67
487	pcmpeqd	%xmm5,%xmm2
488	movdqa	%xmm1,`16*($k+1)+112`(%r10)
489
490	pcmpeqd	%xmm5,%xmm3
491	movdqa	%xmm2,`16*($k+2)+112`(%r10)
492	pand	`16*($k+0)-128`($bp),%xmm0	# while it's still in register
493
494	pand	`16*($k+1)-128`($bp),%xmm1
495	pand	`16*($k+2)-128`($bp),%xmm2
496	movdqa	%xmm3,`16*($k+3)+112`(%r10)
497	pand	`16*($k+3)-128`($bp),%xmm3
498	por	%xmm2,%xmm0
499	por	%xmm3,%xmm1
500___
501for($k=0;$k<$STRIDE/16-4;$k+=4) {
502$code.=<<___;
503	movdqa	`16*($k+0)-128`($bp),%xmm4
504	movdqa	`16*($k+1)-128`($bp),%xmm5
505	movdqa	`16*($k+2)-128`($bp),%xmm2
506	pand	`16*($k+0)+112`(%r10),%xmm4
507	movdqa	`16*($k+3)-128`($bp),%xmm3
508	pand	`16*($k+1)+112`(%r10),%xmm5
509	por	%xmm4,%xmm0
510	pand	`16*($k+2)+112`(%r10),%xmm2
511	por	%xmm5,%xmm1
512	pand	`16*($k+3)+112`(%r10),%xmm3
513	por	%xmm2,%xmm0
514	por	%xmm3,%xmm1
515___
516}
517$code.=<<___;
518	por	%xmm1,%xmm0
519	pshufd	\$0x4e,%xmm0,%xmm1
520	por	%xmm1,%xmm0
521	lea	$STRIDE($bp),$bp
522	movq	%xmm0,$m0		# m0=bp[0]
523
524	mov	($n0),$n0		# pull n0[0] value
525	mov	($ap),%rax
526
527	xor	$i,$i			# i=0
528	xor	$j,$j			# j=0
529
530	mov	$n0,$m1
531	mulq	$m0			# ap[0]*bp[0]
532	mov	%rax,$A[0]
533	mov	($np),%rax
534
535	imulq	$A[0],$m1		# "tp[0]"*n0
536	mov	%rdx,$A[1]
537
538	mulq	$m1			# np[0]*m1
539	add	%rax,$A[0]		# discarded
540	mov	8($ap),%rax
541	adc	\$0,%rdx
542	mov	%rdx,$N[1]
543
544	mulq	$m0
545	add	%rax,$A[1]
546	mov	8($np),%rax
547	adc	\$0,%rdx
548	mov	%rdx,$A[0]
549
550	mulq	$m1
551	add	%rax,$N[1]
552	mov	16($ap),%rax
553	adc	\$0,%rdx
554	add	$A[1],$N[1]
555	lea	4($j),$j		# j+=4
556	adc	\$0,%rdx
557	mov	$N[1],(%rsp)
558	mov	%rdx,$N[0]
559	jmp	.L1st4x
560.align	16
561.L1st4x:
562	mulq	$m0			# ap[j]*bp[0]
563	add	%rax,$A[0]
564	mov	-16($np,$j,8),%rax
565	adc	\$0,%rdx
566	mov	%rdx,$A[1]
567
568	mulq	$m1			# np[j]*m1
569	add	%rax,$N[0]
570	mov	-8($ap,$j,8),%rax
571	adc	\$0,%rdx
572	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
573	adc	\$0,%rdx
574	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
575	mov	%rdx,$N[1]
576
577	mulq	$m0			# ap[j]*bp[0]
578	add	%rax,$A[1]
579	mov	-8($np,$j,8),%rax
580	adc	\$0,%rdx
581	mov	%rdx,$A[0]
582
583	mulq	$m1			# np[j]*m1
584	add	%rax,$N[1]
585	mov	($ap,$j,8),%rax
586	adc	\$0,%rdx
587	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
588	adc	\$0,%rdx
589	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
590	mov	%rdx,$N[0]
591
592	mulq	$m0			# ap[j]*bp[0]
593	add	%rax,$A[0]
594	mov	($np,$j,8),%rax
595	adc	\$0,%rdx
596	mov	%rdx,$A[1]
597
598	mulq	$m1			# np[j]*m1
599	add	%rax,$N[0]
600	mov	8($ap,$j,8),%rax
601	adc	\$0,%rdx
602	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
603	adc	\$0,%rdx
604	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
605	mov	%rdx,$N[1]
606
607	mulq	$m0			# ap[j]*bp[0]
608	add	%rax,$A[1]
609	mov	8($np,$j,8),%rax
610	adc	\$0,%rdx
611	lea	4($j),$j		# j+=4
612	mov	%rdx,$A[0]
613
614	mulq	$m1			# np[j]*m1
615	add	%rax,$N[1]
616	mov	-16($ap,$j,8),%rax
617	adc	\$0,%rdx
618	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
619	adc	\$0,%rdx
620	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
621	mov	%rdx,$N[0]
622	cmp	$num,$j
623	jl	.L1st4x
624
625	mulq	$m0			# ap[j]*bp[0]
626	add	%rax,$A[0]
627	mov	-16($np,$j,8),%rax
628	adc	\$0,%rdx
629	mov	%rdx,$A[1]
630
631	mulq	$m1			# np[j]*m1
632	add	%rax,$N[0]
633	mov	-8($ap,$j,8),%rax
634	adc	\$0,%rdx
635	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
636	adc	\$0,%rdx
637	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
638	mov	%rdx,$N[1]
639
640	mulq	$m0			# ap[j]*bp[0]
641	add	%rax,$A[1]
642	mov	-8($np,$j,8),%rax
643	adc	\$0,%rdx
644	mov	%rdx,$A[0]
645
646	mulq	$m1			# np[j]*m1
647	add	%rax,$N[1]
648	mov	($ap),%rax		# ap[0]
649	adc	\$0,%rdx
650	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
651	adc	\$0,%rdx
652	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
653	mov	%rdx,$N[0]
654
655	xor	$N[1],$N[1]
656	add	$A[0],$N[0]
657	adc	\$0,$N[1]
658	mov	$N[0],-8(%rsp,$j,8)
659	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit
660
661	lea	1($i),$i		# i++
662.align	4
663.Louter4x:
664	lea	32+128(%rsp,$num,8),%rdx	# where 256-byte mask is (+size optimization)
665	pxor	%xmm4,%xmm4
666	pxor	%xmm5,%xmm5
667___
668for($k=0;$k<$STRIDE/16;$k+=4) {
669$code.=<<___;
670	movdqa	`16*($k+0)-128`($bp),%xmm0
671	movdqa	`16*($k+1)-128`($bp),%xmm1
672	movdqa	`16*($k+2)-128`($bp),%xmm2
673	movdqa	`16*($k+3)-128`($bp),%xmm3
674	pand	`16*($k+0)-128`(%rdx),%xmm0
675	pand	`16*($k+1)-128`(%rdx),%xmm1
676	por	%xmm0,%xmm4
677	pand	`16*($k+2)-128`(%rdx),%xmm2
678	por	%xmm1,%xmm5
679	pand	`16*($k+3)-128`(%rdx),%xmm3
680	por	%xmm2,%xmm4
681	por	%xmm3,%xmm5
682___
683}
684$code.=<<___;
685	por	%xmm5,%xmm4
686	pshufd	\$0x4e,%xmm4,%xmm0
687	por	%xmm4,%xmm0
688	lea	$STRIDE($bp),$bp
689	movq	%xmm0,$m0		# m0=bp[i]
690
691	xor	$j,$j			# j=0
692
693	mov	(%rsp),$A[0]
694	mov	$n0,$m1
695	mulq	$m0			# ap[0]*bp[i]
696	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
697	mov	($np),%rax
698	adc	\$0,%rdx
699
700	imulq	$A[0],$m1		# tp[0]*n0
701	mov	%rdx,$A[1]
702
703	mulq	$m1			# np[0]*m1
704	add	%rax,$A[0]		# "$N[0]", discarded
705	mov	8($ap),%rax
706	adc	\$0,%rdx
707	mov	%rdx,$N[1]
708
709	mulq	$m0			# ap[j]*bp[i]
710	add	%rax,$A[1]
711	mov	8($np),%rax
712	adc	\$0,%rdx
713	add	8(%rsp),$A[1]		# +tp[1]
714	adc	\$0,%rdx
715	mov	%rdx,$A[0]
716
717	mulq	$m1			# np[j]*m1
718	add	%rax,$N[1]
719	mov	16($ap),%rax
720	adc	\$0,%rdx
721	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
722	lea	4($j),$j		# j+=4
723	adc	\$0,%rdx
724	mov	%rdx,$N[0]
725	jmp	.Linner4x
726.align	16
727.Linner4x:
728	mulq	$m0			# ap[j]*bp[i]
729	add	%rax,$A[0]
730	mov	-16($np,$j,8),%rax
731	adc	\$0,%rdx
732	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
733	adc	\$0,%rdx
734	mov	%rdx,$A[1]
735
736	mulq	$m1			# np[j]*m1
737	add	%rax,$N[0]
738	mov	-8($ap,$j,8),%rax
739	adc	\$0,%rdx
740	add	$A[0],$N[0]
741	adc	\$0,%rdx
742	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
743	mov	%rdx,$N[1]
744
745	mulq	$m0			# ap[j]*bp[i]
746	add	%rax,$A[1]
747	mov	-8($np,$j,8),%rax
748	adc	\$0,%rdx
749	add	-8(%rsp,$j,8),$A[1]
750	adc	\$0,%rdx
751	mov	%rdx,$A[0]
752
753	mulq	$m1			# np[j]*m1
754	add	%rax,$N[1]
755	mov	($ap,$j,8),%rax
756	adc	\$0,%rdx
757	add	$A[1],$N[1]
758	adc	\$0,%rdx
759	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
760	mov	%rdx,$N[0]
761
762	mulq	$m0			# ap[j]*bp[i]
763	add	%rax,$A[0]
764	mov	($np,$j,8),%rax
765	adc	\$0,%rdx
766	add	(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
767	adc	\$0,%rdx
768	mov	%rdx,$A[1]
769
770	mulq	$m1			# np[j]*m1
771	add	%rax,$N[0]
772	mov	8($ap,$j,8),%rax
773	adc	\$0,%rdx
774	add	$A[0],$N[0]
775	adc	\$0,%rdx
776	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
777	mov	%rdx,$N[1]
778
779	mulq	$m0			# ap[j]*bp[i]
780	add	%rax,$A[1]
781	mov	8($np,$j,8),%rax
782	adc	\$0,%rdx
783	add	8(%rsp,$j,8),$A[1]
784	adc	\$0,%rdx
785	lea	4($j),$j		# j+=4
786	mov	%rdx,$A[0]
787
788	mulq	$m1			# np[j]*m1
789	add	%rax,$N[1]
790	mov	-16($ap,$j,8),%rax
791	adc	\$0,%rdx
792	add	$A[1],$N[1]
793	adc	\$0,%rdx
794	mov	$N[0],-40(%rsp,$j,8)	# tp[j-1]
795	mov	%rdx,$N[0]
796	cmp	$num,$j
797	jl	.Linner4x
798
799	mulq	$m0			# ap[j]*bp[i]
800	add	%rax,$A[0]
801	mov	-16($np,$j,8),%rax
802	adc	\$0,%rdx
803	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
804	adc	\$0,%rdx
805	mov	%rdx,$A[1]
806
807	mulq	$m1			# np[j]*m1
808	add	%rax,$N[0]
809	mov	-8($ap,$j,8),%rax
810	adc	\$0,%rdx
811	add	$A[0],$N[0]
812	adc	\$0,%rdx
813	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
814	mov	%rdx,$N[1]
815
816	mulq	$m0			# ap[j]*bp[i]
817	add	%rax,$A[1]
818	mov	-8($np,$j,8),%rax
819	adc	\$0,%rdx
820	add	-8(%rsp,$j,8),$A[1]
821	adc	\$0,%rdx
822	lea	1($i),$i		# i++
823	mov	%rdx,$A[0]
824
825	mulq	$m1			# np[j]*m1
826	add	%rax,$N[1]
827	mov	($ap),%rax		# ap[0]
828	adc	\$0,%rdx
829	add	$A[1],$N[1]
830	adc	\$0,%rdx
831	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
832	mov	%rdx,$N[0]
833
834	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
835
836	xor	$N[1],$N[1]
837	add	$A[0],$N[0]
838	adc	\$0,$N[1]
839	add	(%rsp,$num,8),$N[0]	# pull upmost overflow bit
840	adc	\$0,$N[1]
841	mov	$N[0],-8(%rsp,$j,8)
842	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit
843
844	cmp	$num,$i
845	jl	.Louter4x
846___
847{
848my @ri=("%rax","%rdx",$m0,$m1);
849$code.=<<___;
850	mov	16(%rsp,$num,8),$rp	# restore $rp
851	mov	0(%rsp),@ri[0]		# tp[0]
852	pxor	%xmm0,%xmm0
853	mov	8(%rsp),@ri[1]		# tp[1]
854	shr	\$2,$num		# num/=4
855	lea	(%rsp),$ap		# borrow ap for tp
856	xor	$i,$i			# i=0 and clear CF!
857
858	sub	0($np),@ri[0]
859	mov	16($ap),@ri[2]		# tp[2]
860	mov	24($ap),@ri[3]		# tp[3]
861	sbb	8($np),@ri[1]
862	lea	-1($num),$j		# j=num/4-1
863	jmp	.Lsub4x
864.align	16
865.Lsub4x:
866	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
867	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
868	sbb	16($np,$i,8),@ri[2]
869	mov	32($ap,$i,8),@ri[0]	# tp[i+1]
870	mov	40($ap,$i,8),@ri[1]
871	sbb	24($np,$i,8),@ri[3]
872	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
873	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
874	sbb	32($np,$i,8),@ri[0]
875	mov	48($ap,$i,8),@ri[2]
876	mov	56($ap,$i,8),@ri[3]
877	sbb	40($np,$i,8),@ri[1]
878	lea	4($i),$i		# i+=4
879	dec	$j			# doesn't affect CF!
880	jnz	.Lsub4x
881
882	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
883	mov	32($ap,$i,8),@ri[0]	# load overflow bit
884	sbb	16($np,$i,8),@ri[2]
885	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
886	sbb	24($np,$i,8),@ri[3]
887	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
888
889	sbb	\$0,@ri[0]		# handle upmost overflow bit
890	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
891	xor	$i,$i			# i=0
892	and	@ri[0],$ap
893	not	@ri[0]
894	mov	$rp,$np
895	and	@ri[0],$np
896	lea	-1($num),$j
897	or	$np,$ap			# ap=borrow?tp:rp
898
899	movdqu	($ap),%xmm1
900	movdqa	%xmm0,(%rsp)
901	movdqu	%xmm1,($rp)
902	jmp	.Lcopy4x
903.align	16
904.Lcopy4x:					# copy or in-place refresh
905	movdqu	16($ap,$i),%xmm2
906	movdqu	32($ap,$i),%xmm1
907	movdqa	%xmm0,16(%rsp,$i)
908	movdqu	%xmm2,16($rp,$i)
909	movdqa	%xmm0,32(%rsp,$i)
910	movdqu	%xmm1,32($rp,$i)
911	lea	32($i),$i
912	dec	$j
913	jnz	.Lcopy4x
914
915	shl	\$2,$num
916	movdqu	16($ap,$i),%xmm2
917	movdqa	%xmm0,16(%rsp,$i)
918	movdqu	%xmm2,16($rp,$i)
919___
920}
921$code.=<<___;
922	mov	8(%rsp,$num,8),%rsi	# restore %rsp
923	mov	\$1,%rax
924
925	mov	(%rsi),%r15
926	mov	8(%rsi),%r14
927	mov	16(%rsi),%r13
928	mov	24(%rsi),%r12
929	mov	32(%rsi),%rbp
930	mov	40(%rsi),%rbx
931	lea	48(%rsi),%rsp
932.Lmul4x_epilogue:
933	ret
934.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
935___
936}}}
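# The entry code of bn_mul_mont_gather5 dispatches to the 4x variant above
# only when the limb count is a multiple of 4 and at least 8 (the test/cmp
# pair at its top).  A trivial Perl restatement of that dispatch rule
# (hypothetical helper, not used by the generator):
sub _use_mul4x {
	my ($num) = @_;
	return ($num % 4 == 0) && ($num >= 8);
}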
937
938{
939my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9d") : # Win64 order
940				("%rdi","%rsi","%rdx","%ecx"); # Unix order
941my $out=$inp;
942my $STRIDE=2**5*8;
943my $N=$STRIDE/4;
944
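# Reference semantics of the two helpers below, as hypothetical Perl models
# (not used by the generator): bn_scatter5 stores limb j of power idx at
# slot 2**5*j + idx, and bn_gather5 reads one power back out, touching all
# 2**5 slots of every row so the access pattern is independent of idx.
sub _scatter5_model {
	my ($inp, $tbl, $idx) = @_;		# $inp: limbs of one power, $tbl: flat table
	$tbl->[2**5*$_ + $idx] = $inp->[$_] for 0 .. $#$inp;
}
sub _gather5_model {
	my ($tbl, $num, $idx) = @_;
	my @out;
	for my $j (0 .. $num-1) {
		my $r = 0;
		$r |= $tbl->[2**5*$j + $_] & (($_ == $idx) ? ~0 : 0) for 0 .. 2**5-1;
		push @out, $r;
	}
	return \@out;
}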
945$code.=<<___;
946.globl	bn_scatter5
947.type	bn_scatter5,\@abi-omnipotent
948.align	16
949bn_scatter5:
950	cmp	\$0, $num
951	jz	.Lscatter_epilogue
952	lea	($tbl,$idx,8),$tbl
953.Lscatter:
954	mov	($inp),%rax
955	lea	8($inp),$inp
956	mov	%rax,($tbl)
957	lea	32*8($tbl),$tbl
958	sub	\$1,$num
959	jnz	.Lscatter
960.Lscatter_epilogue:
961	ret
962.size	bn_scatter5,.-bn_scatter5
963
964.globl	bn_gather5
965.type	bn_gather5,\@abi-omnipotent
966.align	16
967bn_gather5:
968.LSEH_begin_bn_gather5:			# Win64 thing, but harmless in other cases
969	# I can't trust assembler to use specific encoding:-(
970	.byte	0x4c,0x8d,0x14,0x24			# lea    (%rsp),%r10
971	.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00	# sub	$0x108,%rsp
972	lea	.Linc(%rip),%rax
973	and	\$-16,%rsp		# shouldn't be formally required
974
975	movd	$idx,%xmm5
976	movdqa	0(%rax),%xmm0		# 00000001000000010000000000000000
977	movdqa	16(%rax),%xmm1		# 00000002000000020000000200000002
978	lea	128($tbl),%r11		# size optimization
979	lea	128(%rsp),%rax		# size optimization
980
981	pshufd	\$0,%xmm5,%xmm5		# broadcast $idx
982	movdqa	%xmm1,%xmm4
983	movdqa	%xmm1,%xmm2
984___
985########################################################################
986# calculate mask by comparing 0..31 to $idx and save result to stack
987#
988for($i=0;$i<$STRIDE/16;$i+=4) {
989$code.=<<___;
990	paddd	%xmm0,%xmm1
991	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
992___
993$code.=<<___	if ($i);
994	movdqa	%xmm3,`16*($i-1)-128`(%rax)
995___
996$code.=<<___;
997	movdqa	%xmm4,%xmm3
998
999	paddd	%xmm1,%xmm2
1000	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
1001	movdqa	%xmm0,`16*($i+0)-128`(%rax)
1002	movdqa	%xmm4,%xmm0
1003
1004	paddd	%xmm2,%xmm3
1005	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
1006	movdqa	%xmm1,`16*($i+1)-128`(%rax)
1007	movdqa	%xmm4,%xmm1
1008
1009	paddd	%xmm3,%xmm0
1010	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
1011	movdqa	%xmm2,`16*($i+2)-128`(%rax)
1012	movdqa	%xmm4,%xmm2
1013___
1014}
1015$code.=<<___;
1016	movdqa	%xmm3,`16*($i-1)-128`(%rax)
1017	jmp	.Lgather
1018
1019.align	32
1020.Lgather:
1021	pxor	%xmm4,%xmm4
1022	pxor	%xmm5,%xmm5
1023___
1024for($i=0;$i<$STRIDE/16;$i+=4) {
1025$code.=<<___;
1026	movdqa	`16*($i+0)-128`(%r11),%xmm0
1027	movdqa	`16*($i+1)-128`(%r11),%xmm1
1028	movdqa	`16*($i+2)-128`(%r11),%xmm2
1029	pand	`16*($i+0)-128`(%rax),%xmm0
1030	movdqa	`16*($i+3)-128`(%r11),%xmm3
1031	pand	`16*($i+1)-128`(%rax),%xmm1
1032	por	%xmm0,%xmm4
1033	pand	`16*($i+2)-128`(%rax),%xmm2
1034	por	%xmm1,%xmm5
1035	pand	`16*($i+3)-128`(%rax),%xmm3
1036	por	%xmm2,%xmm4
1037	por	%xmm3,%xmm5
1038___
1039}
1040$code.=<<___;
1041	por	%xmm5,%xmm4
1042	lea	$STRIDE(%r11),%r11
1043	pshufd	\$0x4e,%xmm4,%xmm0
1044	por	%xmm4,%xmm0
1045	movq	%xmm0,($out)		# m0=bp[0]
1046	lea	8($out),$out
1047	sub	\$1,$num
1048	jnz	.Lgather
1049
1050	lea	(%r10),%rsp
1051	ret
1052.LSEH_end_bn_gather5:
1053.size	bn_gather5,.-bn_gather5
1054___
1055}
1056$code.=<<___;
1057.align	64
1058.Linc:
1059	.long	0,0, 1,1
1060	.long	2,2, 2,2
1061.asciz	"Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1062___
1063
1064# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1065#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
1066if ($win64) {
1067$rec="%rcx";
1068$frame="%rdx";
1069$context="%r8";
1070$disp="%r9";
1071
1072$code.=<<___;
1073.extern	__imp_RtlVirtualUnwind
1074.type	mul_handler,\@abi-omnipotent
1075.align	16
1076mul_handler:
1077	push	%rsi
1078	push	%rdi
1079	push	%rbx
1080	push	%rbp
1081	push	%r12
1082	push	%r13
1083	push	%r14
1084	push	%r15
1085	pushfq
1086	sub	\$64,%rsp
1087
1088	mov	120($context),%rax	# pull context->Rax
1089	mov	248($context),%rbx	# pull context->Rip
1090
1091	mov	8($disp),%rsi		# disp->ImageBase
1092	mov	56($disp),%r11		# disp->HandlerData
1093
1094	mov	0(%r11),%r10d		# HandlerData[0]
1095	lea	(%rsi,%r10),%r10	# end of prologue label
1096	cmp	%r10,%rbx		# context->Rip<end of prologue label
1097	jb	.Lcommon_seh_tail
1098
1099	lea	48(%rax),%rax
1100
1101	mov	4(%r11),%r10d		# HandlerData[1]
1102	lea	(%rsi,%r10),%r10	# end of alloca label
1103	cmp	%r10,%rbx		# context->Rip<end of alloca label
1104	jb	.Lcommon_seh_tail
1105
1106	mov	152($context),%rax	# pull context->Rsp
1107
1108	mov	8(%r11),%r10d		# HandlerData[2]
1109	lea	(%rsi,%r10),%r10	# epilogue label
1110	cmp	%r10,%rbx		# context->Rip>=epilogue label
1111	jae	.Lcommon_seh_tail
1112
1113	mov	192($context),%r10	# pull $num
1114	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer
1115
1116	lea	48(%rax),%rax
1117
1118	mov	-8(%rax),%rbx
1119	mov	-16(%rax),%rbp
1120	mov	-24(%rax),%r12
1121	mov	-32(%rax),%r13
1122	mov	-40(%rax),%r14
1123	mov	-48(%rax),%r15
1124	mov	%rbx,144($context)	# restore context->Rbx
1125	mov	%rbp,160($context)	# restore context->Rbp
1126	mov	%r12,216($context)	# restore context->R12
1127	mov	%r13,224($context)	# restore context->R13
1128	mov	%r14,232($context)	# restore context->R14
1129	mov	%r15,240($context)	# restore context->R15
1130
1131.Lcommon_seh_tail:
1132	mov	8(%rax),%rdi
1133	mov	16(%rax),%rsi
1134	mov	%rax,152($context)	# restore context->Rsp
1135	mov	%rsi,168($context)	# restore context->Rsi
1136	mov	%rdi,176($context)	# restore context->Rdi
1137
1138	mov	40($disp),%rdi		# disp->ContextRecord
1139	mov	$context,%rsi		# context
1140	mov	\$154,%ecx		# sizeof(CONTEXT) in quadwords
1141	.long	0xa548f3fc		# cld; rep movsq
1142
1143	mov	$disp,%rsi
1144	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
1145	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
1146	mov	0(%rsi),%r8		# arg3, disp->ControlPc
1147	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
1148	mov	40(%rsi),%r10		# disp->ContextRecord
1149	lea	56(%rsi),%r11		# &disp->HandlerData
1150	lea	24(%rsi),%r12		# &disp->EstablisherFrame
1151	mov	%r10,32(%rsp)		# arg5
1152	mov	%r11,40(%rsp)		# arg6
1153	mov	%r12,48(%rsp)		# arg7
1154	mov	%rcx,56(%rsp)		# arg8, (NULL)
1155	call	*__imp_RtlVirtualUnwind(%rip)
1156
1157	mov	\$1,%eax		# ExceptionContinueSearch
1158	add	\$64,%rsp
1159	popfq
1160	pop	%r15
1161	pop	%r14
1162	pop	%r13
1163	pop	%r12
1164	pop	%rbp
1165	pop	%rbx
1166	pop	%rdi
1167	pop	%rsi
1168	ret
1169.size	mul_handler,.-mul_handler
1170
1171.section	.pdata
1172.align	4
1173	.rva	.LSEH_begin_bn_mul_mont_gather5
1174	.rva	.LSEH_end_bn_mul_mont_gather5
1175	.rva	.LSEH_info_bn_mul_mont_gather5
1176
1177	.rva	.LSEH_begin_bn_mul4x_mont_gather5
1178	.rva	.LSEH_end_bn_mul4x_mont_gather5
1179	.rva	.LSEH_info_bn_mul4x_mont_gather5
1180
1181	.rva	.LSEH_begin_bn_gather5
1182	.rva	.LSEH_end_bn_gather5
1183	.rva	.LSEH_info_bn_gather5
1184
1185.section	.xdata
1186.align	8
1187.LSEH_info_bn_mul_mont_gather5:
1188	.byte	9,0,0,0
1189	.rva	mul_handler
1190	.rva	.Lmul_alloca,.Lmul_body,.Lmul_epilogue		# HandlerData[]
1191.align	8
1192.LSEH_info_bn_mul4x_mont_gather5:
1193	.byte	9,0,0,0
1194	.rva	mul_handler
1195	.rva	.Lmul4x_alloca,.Lmul4x_body,.Lmul4x_epilogue	# HandlerData[]
1196.align	8
1197.LSEH_info_bn_gather5:
1198	.byte	0x01,0x0b,0x03,0x0a
1199	.byte	0x0b,0x01,0x21,0x00	# sub	rsp,0x108
1200	.byte	0x04,0xa3,0x00,0x00	# lea	r10,(rsp), set_frame r10
1201.align	8
1202___
1203}
1204
1205$code =~ s/\`([^\`]*)\`/eval($1)/gem;
1206
1207print $code;
1208close STDOUT;
1209