1183234Ssimon#!/usr/bin/env perl
2183234Ssimon#
3183234Ssimon# ====================================================================
4183234Ssimon# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5183234Ssimon# project. Rights for redistribution and usage in source and binary
6183234Ssimon# forms are granted according to the OpenSSL license.
7183234Ssimon# ====================================================================
8183234Ssimon#
9183234Ssimon# sha256/512_block procedure for x86_64.
10183234Ssimon#
11183234Ssimon# 40% improvement over compiler-generated code on Opteron. On EM64T
12183234Ssimon# sha256 was observed to run >80% faster and sha512 - >40%. No magical
13183234Ssimon# tricks, just straight implementation... I really wonder why gcc
14183234Ssimon# [being armed with inline assembler] fails to generate as fast code.
15183234Ssimon# The only thing which is cool about this module is that it's very
16183234Ssimon# same instruction sequence used for both SHA-256 and SHA-512. In
17183234Ssimon# former case the instructions operate on 32-bit operands, while in
18183234Ssimon# latter - on 64-bit ones. All I had to do is to get one flavor right,
19183234Ssimon# the other one passed the test right away:-)
20183234Ssimon#
21183234Ssimon# sha256_block runs in ~1005 cycles on Opteron, which gives you
22183234Ssimon# asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
23183234Ssimon# frequency in GHz. sha512_block runs in ~1275 cycles, which results
24183234Ssimon# in 128*1000/1275=100MBps per GHz. Is there room for improvement?
25183234Ssimon# Well, if you compare it to IA-64 implementation, which maintains
26183234Ssimon# X[16] in register bank[!], tends to 4 instructions per CPU clock
27183234Ssimon# cycle and runs in 1003 cycles, 1275 is very good result for 3-way
28183234Ssimon# issue Opteron pipeline and X[16] maintained in memory. So that *if*
29183234Ssimon# there is a way to improve it, *then* the only way would be to try to
30183234Ssimon# offload X[16] updates to SSE unit, but that would require "deeper"
31183234Ssimon# loop unroll, which in turn would naturally cause size blow-up, not
32183234Ssimon# to mention increased complexity! And once again, only *if* it's
33183234Ssimon# actually possible to noticeably improve overall ILP, instruction
34183234Ssimon# level parallelism, on a given CPU implementation in this case.
35183234Ssimon#
36183234Ssimon# Special note on Intel EM64T. While Opteron CPU exhibits perfect
37183234Ssimon# perfromance ratio of 1.5 between 64- and 32-bit flavors [see above],
38183234Ssimon# [currently available] EM64T CPUs apparently are far from it. On the
39183234Ssimon# contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit
40183234Ssimon# sha256_block:-( This is presumably because 64-bit shifts/rotates
41183234Ssimon# apparently are not atomic instructions, but implemented in microcode.
42183234Ssimon
43238405Sjkim$flavour = shift;
44238405Sjkim$output  = shift;
45238405Sjkimif ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
46183234Ssimon
47238405Sjkim$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
48238405Sjkim
49183234Ssimon$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
50183234Ssimon( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
51183234Ssimon( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
52183234Ssimondie "can't locate x86_64-xlate.pl";
53183234Ssimon
54246772Sjkimopen OUT,"| \"$^X\" $xlate $flavour $output";
55246772Sjkim*STDOUT=*OUT;
56183234Ssimon
57183234Ssimonif ($output =~ /512/) {
58183234Ssimon	$func="sha512_block_data_order";
59183234Ssimon	$TABLE="K512";
60183234Ssimon	$SZ=8;
61183234Ssimon	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
62183234Ssimon					"%r8", "%r9", "%r10","%r11");
63183234Ssimon	($T1,$a0,$a1,$a2)=("%r12","%r13","%r14","%r15");
64183234Ssimon	@Sigma0=(28,34,39);
65183234Ssimon	@Sigma1=(14,18,41);
66183234Ssimon	@sigma0=(1,  8, 7);
67183234Ssimon	@sigma1=(19,61, 6);
68183234Ssimon	$rounds=80;
69183234Ssimon} else {
70183234Ssimon	$func="sha256_block_data_order";
71183234Ssimon	$TABLE="K256";
72183234Ssimon	$SZ=4;
73183234Ssimon	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
74183234Ssimon					"%r8d","%r9d","%r10d","%r11d");
75183234Ssimon	($T1,$a0,$a1,$a2)=("%r12d","%r13d","%r14d","%r15d");
76183234Ssimon	@Sigma0=( 2,13,22);
77183234Ssimon	@Sigma1=( 6,11,25);
78183234Ssimon	@sigma0=( 7,18, 3);
79183234Ssimon	@sigma1=(17,19,10);
80183234Ssimon	$rounds=64;
81183234Ssimon}
82183234Ssimon
83183234Ssimon$ctx="%rdi";	# 1st arg
84183234Ssimon$round="%rdi";	# zaps $ctx
85183234Ssimon$inp="%rsi";	# 2nd arg
86183234Ssimon$Tbl="%rbp";
87183234Ssimon
88183234Ssimon$_ctx="16*$SZ+0*8(%rsp)";
89183234Ssimon$_inp="16*$SZ+1*8(%rsp)";
90183234Ssimon$_end="16*$SZ+2*8(%rsp)";
91183234Ssimon$_rsp="16*$SZ+3*8(%rsp)";
92183234Ssimon$framesz="16*$SZ+4*8";
93183234Ssimon
94183234Ssimon
95183234Ssimonsub ROUND_00_15()
96183234Ssimon{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
97183234Ssimon
98183234Ssimon$code.=<<___;
99238405Sjkim	ror	\$`$Sigma1[2]-$Sigma1[1]`,$a0
100183234Ssimon	mov	$f,$a2
101238405Sjkim	mov	$T1,`$SZ*($i&0xf)`(%rsp)
102183234Ssimon
103238405Sjkim	ror	\$`$Sigma0[2]-$Sigma0[1]`,$a1
104238405Sjkim	xor	$e,$a0
105183234Ssimon	xor	$g,$a2			# f^g
106183234Ssimon
107238405Sjkim	ror	\$`$Sigma1[1]-$Sigma1[0]`,$a0
108238405Sjkim	add	$h,$T1			# T1+=h
109238405Sjkim	xor	$a,$a1
110238405Sjkim
111238405Sjkim	add	($Tbl,$round,$SZ),$T1	# T1+=K[round]
112183234Ssimon	and	$e,$a2			# (f^g)&e
113238405Sjkim	mov	$b,$h
114183234Ssimon
115238405Sjkim	ror	\$`$Sigma0[1]-$Sigma0[0]`,$a1
116238405Sjkim	xor	$e,$a0
117183234Ssimon	xor	$g,$a2			# Ch(e,f,g)=((f^g)&e)^g
118183234Ssimon
119238405Sjkim	xor	$c,$h			# b^c
120238405Sjkim	xor	$a,$a1
121183234Ssimon	add	$a2,$T1			# T1+=Ch(e,f,g)
122238405Sjkim	mov	$b,$a2
123183234Ssimon
124238405Sjkim	ror	\$$Sigma1[0],$a0	# Sigma1(e)
125238405Sjkim	and	$a,$h			# h=(b^c)&a
126238405Sjkim	and	$c,$a2			# b&c
127183234Ssimon
128238405Sjkim	ror	\$$Sigma0[0],$a1	# Sigma0(a)
129238405Sjkim	add	$a0,$T1			# T1+=Sigma1(e)
130238405Sjkim	add	$a2,$h			# h+=b&c (completes +=Maj(a,b,c)
131183234Ssimon
132183234Ssimon	add	$T1,$d			# d+=T1
133183234Ssimon	add	$T1,$h			# h+=T1
134183234Ssimon	lea	1($round),$round	# round++
135238405Sjkim	add	$a1,$h			# h+=Sigma0(a)
136183234Ssimon
137183234Ssimon___
138183234Ssimon}
139183234Ssimon
140183234Ssimonsub ROUND_16_XX()
141183234Ssimon{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
142183234Ssimon
143183234Ssimon$code.=<<___;
144183234Ssimon	mov	`$SZ*(($i+1)&0xf)`(%rsp),$a0
145238405Sjkim	mov	`$SZ*(($i+14)&0xf)`(%rsp),$a1
146238405Sjkim	mov	$a0,$T1
147238405Sjkim	mov	$a1,$a2
148183234Ssimon
149238405Sjkim	ror	\$`$sigma0[1]-$sigma0[0]`,$T1
150238405Sjkim	xor	$a0,$T1
151183234Ssimon	shr	\$$sigma0[2],$a0
152183234Ssimon
153238405Sjkim	ror	\$$sigma0[0],$T1
154238405Sjkim	xor	$T1,$a0			# sigma0(X[(i+1)&0xf])
155238405Sjkim	mov	`$SZ*(($i+9)&0xf)`(%rsp),$T1
156183234Ssimon
157238405Sjkim	ror	\$`$sigma1[1]-$sigma1[0]`,$a2
158238405Sjkim	xor	$a1,$a2
159238405Sjkim	shr	\$$sigma1[2],$a1
160183234Ssimon
161238405Sjkim	ror	\$$sigma1[0],$a2
162183234Ssimon	add	$a0,$T1
163238405Sjkim	xor	$a2,$a1			# sigma1(X[(i+14)&0xf])
164183234Ssimon
165183234Ssimon	add	`$SZ*($i&0xf)`(%rsp),$T1
166238405Sjkim	mov	$e,$a0
167238405Sjkim	add	$a1,$T1
168238405Sjkim	mov	$a,$a1
169183234Ssimon___
170183234Ssimon	&ROUND_00_15(@_);
171183234Ssimon}
172183234Ssimon
173183234Ssimon$code=<<___;
174183234Ssimon.text
175183234Ssimon
176183234Ssimon.globl	$func
177183234Ssimon.type	$func,\@function,4
178183234Ssimon.align	16
179183234Ssimon$func:
180183234Ssimon	push	%rbx
181183234Ssimon	push	%rbp
182183234Ssimon	push	%r12
183183234Ssimon	push	%r13
184183234Ssimon	push	%r14
185183234Ssimon	push	%r15
186238405Sjkim	mov	%rsp,%r11		# copy %rsp
187183234Ssimon	shl	\$4,%rdx		# num*16
188183234Ssimon	sub	\$$framesz,%rsp
189183234Ssimon	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
190183234Ssimon	and	\$-64,%rsp		# align stack frame
191183234Ssimon	mov	$ctx,$_ctx		# save ctx, 1st arg
192183234Ssimon	mov	$inp,$_inp		# save inp, 2nd arh
193183234Ssimon	mov	%rdx,$_end		# save end pointer, "3rd" arg
194238405Sjkim	mov	%r11,$_rsp		# save copy of %rsp
195238405Sjkim.Lprologue:
196183234Ssimon
197238405Sjkim	lea	$TABLE(%rip),$Tbl
198183234Ssimon
199183234Ssimon	mov	$SZ*0($ctx),$A
200183234Ssimon	mov	$SZ*1($ctx),$B
201183234Ssimon	mov	$SZ*2($ctx),$C
202183234Ssimon	mov	$SZ*3($ctx),$D
203183234Ssimon	mov	$SZ*4($ctx),$E
204183234Ssimon	mov	$SZ*5($ctx),$F
205183234Ssimon	mov	$SZ*6($ctx),$G
206183234Ssimon	mov	$SZ*7($ctx),$H
207183234Ssimon	jmp	.Lloop
208183234Ssimon
209183234Ssimon.align	16
210183234Ssimon.Lloop:
211183234Ssimon	xor	$round,$round
212183234Ssimon___
213183234Ssimon	for($i=0;$i<16;$i++) {
214183234Ssimon		$code.="	mov	$SZ*$i($inp),$T1\n";
215238405Sjkim		$code.="	mov	@ROT[4],$a0\n";
216238405Sjkim		$code.="	mov	@ROT[0],$a1\n";
217183234Ssimon		$code.="	bswap	$T1\n";
218183234Ssimon		&ROUND_00_15($i,@ROT);
219183234Ssimon		unshift(@ROT,pop(@ROT));
220183234Ssimon	}
221183234Ssimon$code.=<<___;
222183234Ssimon	jmp	.Lrounds_16_xx
223183234Ssimon.align	16
224183234Ssimon.Lrounds_16_xx:
225183234Ssimon___
226183234Ssimon	for(;$i<32;$i++) {
227183234Ssimon		&ROUND_16_XX($i,@ROT);
228183234Ssimon		unshift(@ROT,pop(@ROT));
229183234Ssimon	}
230183234Ssimon
231183234Ssimon$code.=<<___;
232183234Ssimon	cmp	\$$rounds,$round
233183234Ssimon	jb	.Lrounds_16_xx
234183234Ssimon
235183234Ssimon	mov	$_ctx,$ctx
236183234Ssimon	lea	16*$SZ($inp),$inp
237183234Ssimon
238183234Ssimon	add	$SZ*0($ctx),$A
239183234Ssimon	add	$SZ*1($ctx),$B
240183234Ssimon	add	$SZ*2($ctx),$C
241183234Ssimon	add	$SZ*3($ctx),$D
242183234Ssimon	add	$SZ*4($ctx),$E
243183234Ssimon	add	$SZ*5($ctx),$F
244183234Ssimon	add	$SZ*6($ctx),$G
245183234Ssimon	add	$SZ*7($ctx),$H
246183234Ssimon
247183234Ssimon	cmp	$_end,$inp
248183234Ssimon
249183234Ssimon	mov	$A,$SZ*0($ctx)
250183234Ssimon	mov	$B,$SZ*1($ctx)
251183234Ssimon	mov	$C,$SZ*2($ctx)
252183234Ssimon	mov	$D,$SZ*3($ctx)
253183234Ssimon	mov	$E,$SZ*4($ctx)
254183234Ssimon	mov	$F,$SZ*5($ctx)
255183234Ssimon	mov	$G,$SZ*6($ctx)
256183234Ssimon	mov	$H,$SZ*7($ctx)
257183234Ssimon	jb	.Lloop
258183234Ssimon
259238405Sjkim	mov	$_rsp,%rsi
260238405Sjkim	mov	(%rsi),%r15
261238405Sjkim	mov	8(%rsi),%r14
262238405Sjkim	mov	16(%rsi),%r13
263238405Sjkim	mov	24(%rsi),%r12
264238405Sjkim	mov	32(%rsi),%rbp
265238405Sjkim	mov	40(%rsi),%rbx
266238405Sjkim	lea	48(%rsi),%rsp
267238405Sjkim.Lepilogue:
268183234Ssimon	ret
269183234Ssimon.size	$func,.-$func
270183234Ssimon___
271183234Ssimon
272183234Ssimonif ($SZ==4) {
273183234Ssimon$code.=<<___;
274183234Ssimon.align	64
275183234Ssimon.type	$TABLE,\@object
276183234Ssimon$TABLE:
277183234Ssimon	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
278183234Ssimon	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
279183234Ssimon	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
280183234Ssimon	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
281183234Ssimon	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
282183234Ssimon	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
283183234Ssimon	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
284183234Ssimon	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
285183234Ssimon	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
286183234Ssimon	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
287183234Ssimon	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
288183234Ssimon	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
289183234Ssimon	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
290183234Ssimon	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
291183234Ssimon	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
292183234Ssimon	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
293183234Ssimon___
294183234Ssimon} else {
295183234Ssimon$code.=<<___;
296183234Ssimon.align	64
297183234Ssimon.type	$TABLE,\@object
298183234Ssimon$TABLE:
299183234Ssimon	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
300183234Ssimon	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
301183234Ssimon	.quad	0x3956c25bf348b538,0x59f111f1b605d019
302183234Ssimon	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
303183234Ssimon	.quad	0xd807aa98a3030242,0x12835b0145706fbe
304183234Ssimon	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
305183234Ssimon	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
306183234Ssimon	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
307183234Ssimon	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
308183234Ssimon	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
309183234Ssimon	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
310183234Ssimon	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
311183234Ssimon	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
312183234Ssimon	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
313183234Ssimon	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
314183234Ssimon	.quad	0x06ca6351e003826f,0x142929670a0e6e70
315183234Ssimon	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
316183234Ssimon	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
317183234Ssimon	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
318183234Ssimon	.quad	0x81c2c92e47edaee6,0x92722c851482353b
319183234Ssimon	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
320183234Ssimon	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
321183234Ssimon	.quad	0xd192e819d6ef5218,0xd69906245565a910
322183234Ssimon	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
323183234Ssimon	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
324183234Ssimon	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
325183234Ssimon	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
326183234Ssimon	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
327183234Ssimon	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
328183234Ssimon	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
329183234Ssimon	.quad	0x90befffa23631e28,0xa4506cebde82bde9
330183234Ssimon	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
331183234Ssimon	.quad	0xca273eceea26619c,0xd186b8c721c0c207
332183234Ssimon	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
333183234Ssimon	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
334183234Ssimon	.quad	0x113f9804bef90dae,0x1b710b35131c471b
335183234Ssimon	.quad	0x28db77f523047d84,0x32caab7b40c72493
336183234Ssimon	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
337183234Ssimon	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
338183234Ssimon	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
339183234Ssimon___
340183234Ssimon}
341183234Ssimon
342238405Sjkim# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
343238405Sjkim#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
344238405Sjkimif ($win64) {
345238405Sjkim$rec="%rcx";
346238405Sjkim$frame="%rdx";
347238405Sjkim$context="%r8";
348238405Sjkim$disp="%r9";
349238405Sjkim
350238405Sjkim$code.=<<___;
351238405Sjkim.extern	__imp_RtlVirtualUnwind
352238405Sjkim.type	se_handler,\@abi-omnipotent
353238405Sjkim.align	16
354238405Sjkimse_handler:
355238405Sjkim	push	%rsi
356238405Sjkim	push	%rdi
357238405Sjkim	push	%rbx
358238405Sjkim	push	%rbp
359238405Sjkim	push	%r12
360238405Sjkim	push	%r13
361238405Sjkim	push	%r14
362238405Sjkim	push	%r15
363238405Sjkim	pushfq
364238405Sjkim	sub	\$64,%rsp
365238405Sjkim
366238405Sjkim	mov	120($context),%rax	# pull context->Rax
367238405Sjkim	mov	248($context),%rbx	# pull context->Rip
368238405Sjkim
369238405Sjkim	lea	.Lprologue(%rip),%r10
370238405Sjkim	cmp	%r10,%rbx		# context->Rip<.Lprologue
371238405Sjkim	jb	.Lin_prologue
372238405Sjkim
373238405Sjkim	mov	152($context),%rax	# pull context->Rsp
374238405Sjkim
375238405Sjkim	lea	.Lepilogue(%rip),%r10
376238405Sjkim	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
377238405Sjkim	jae	.Lin_prologue
378238405Sjkim
379238405Sjkim	mov	16*$SZ+3*8(%rax),%rax	# pull $_rsp
380238405Sjkim	lea	48(%rax),%rax
381238405Sjkim
382238405Sjkim	mov	-8(%rax),%rbx
383238405Sjkim	mov	-16(%rax),%rbp
384238405Sjkim	mov	-24(%rax),%r12
385238405Sjkim	mov	-32(%rax),%r13
386238405Sjkim	mov	-40(%rax),%r14
387238405Sjkim	mov	-48(%rax),%r15
388238405Sjkim	mov	%rbx,144($context)	# restore context->Rbx
389238405Sjkim	mov	%rbp,160($context)	# restore context->Rbp
390238405Sjkim	mov	%r12,216($context)	# restore context->R12
391238405Sjkim	mov	%r13,224($context)	# restore context->R13
392238405Sjkim	mov	%r14,232($context)	# restore context->R14
393238405Sjkim	mov	%r15,240($context)	# restore context->R15
394238405Sjkim
395238405Sjkim.Lin_prologue:
396238405Sjkim	mov	8(%rax),%rdi
397238405Sjkim	mov	16(%rax),%rsi
398238405Sjkim	mov	%rax,152($context)	# restore context->Rsp
399238405Sjkim	mov	%rsi,168($context)	# restore context->Rsi
400238405Sjkim	mov	%rdi,176($context)	# restore context->Rdi
401238405Sjkim
402238405Sjkim	mov	40($disp),%rdi		# disp->ContextRecord
403238405Sjkim	mov	$context,%rsi		# context
404238405Sjkim	mov	\$154,%ecx		# sizeof(CONTEXT)
405238405Sjkim	.long	0xa548f3fc		# cld; rep movsq
406238405Sjkim
407238405Sjkim	mov	$disp,%rsi
408238405Sjkim	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
409238405Sjkim	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
410238405Sjkim	mov	0(%rsi),%r8		# arg3, disp->ControlPc
411238405Sjkim	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
412238405Sjkim	mov	40(%rsi),%r10		# disp->ContextRecord
413238405Sjkim	lea	56(%rsi),%r11		# &disp->HandlerData
414238405Sjkim	lea	24(%rsi),%r12		# &disp->EstablisherFrame
415238405Sjkim	mov	%r10,32(%rsp)		# arg5
416238405Sjkim	mov	%r11,40(%rsp)		# arg6
417238405Sjkim	mov	%r12,48(%rsp)		# arg7
418238405Sjkim	mov	%rcx,56(%rsp)		# arg8, (NULL)
419238405Sjkim	call	*__imp_RtlVirtualUnwind(%rip)
420238405Sjkim
421238405Sjkim	mov	\$1,%eax		# ExceptionContinueSearch
422238405Sjkim	add	\$64,%rsp
423238405Sjkim	popfq
424238405Sjkim	pop	%r15
425238405Sjkim	pop	%r14
426238405Sjkim	pop	%r13
427238405Sjkim	pop	%r12
428238405Sjkim	pop	%rbp
429238405Sjkim	pop	%rbx
430238405Sjkim	pop	%rdi
431238405Sjkim	pop	%rsi
432238405Sjkim	ret
433238405Sjkim.size	se_handler,.-se_handler
434238405Sjkim
435238405Sjkim.section	.pdata
436238405Sjkim.align	4
437238405Sjkim	.rva	.LSEH_begin_$func
438238405Sjkim	.rva	.LSEH_end_$func
439238405Sjkim	.rva	.LSEH_info_$func
440238405Sjkim
441238405Sjkim.section	.xdata
442238405Sjkim.align	8
443238405Sjkim.LSEH_info_$func:
444238405Sjkim	.byte	9,0,0,0
445238405Sjkim	.rva	se_handler
446238405Sjkim___
447238405Sjkim}
448238405Sjkim
449183234Ssimon$code =~ s/\`([^\`]*)\`/eval $1/gem;
450183234Ssimonprint $code;
451183234Ssimonclose STDOUT;
452