#!/usr/bin/env perl
# x86cpuid.pl revision 279264
#
# Perlasm generator for the 32-bit x86 CPU identification and utility
# routines (OPENSSL_ia32_cpuid, OPENSSL_rdtsc, OPENSSL_cleanse, ...).
# Instructions are written as calls such as &mov(...); the leading '&'
# is required because several mnemonics (xor, and, or, cmp, sub, push,
# pop) are also Perl operators or built-ins.

# Make the perlasm helpers loadable both from "<script dir>/perlasm" and
# from "perlasm" relative to the current directory. When $0 carries no
# directory component, $dir is empty and both entries name "perlasm".
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
push(@INC, "${dir}perlasm", "perlasm");
require "x86asm.pl";

# The first command-line argument is handed to asm_init unchanged
# (presumably the output flavour -- see x86asm.pl).
&asm_init($ARGV[0],"x86cpuid");

# Enable the SSE2 register wiping in OPENSSL_wipe_cpu when requested.
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

# OPENSSL_ia32_cpuid
#
# Probes the processor with CPUID and returns a capability vector:
#   eax - CPUID.1:EDX feature word, adjusted below: reserved bits #20 and
#         #30 are reused as OpenSSL flags, the hyper-threading bit #28 is
#         re-evaluated, and FXSR is cleared when XMM state is not enabled;
#   edx - CPUID.1:ECX feature word with bit #11 replaced by the AMD XOP
#         flag, and with AVX/FMA/XOP (and possibly AESNI/PCLMULQDQ)
#         cleared depending on what XGETBV reports.
# Both registers are zero when the CPUID instruction is not available.
#
# Register roles inside the routine:
#   edi - max standard CPUID level, later "cores-1 per L1D" (or -1);
#   ebp - 0 on GenuineIntel, non-zero otherwise; later the ECX word;
#   esi - scratch for the AMD checks, later the EDX word.
# function_begin/function_end presumably save and restore these
# registers -- see x86asm.pl.
&function_begin("OPENSSL_ia32_cpuid");
	# CPUID is available iff bit #21 of EFLAGS can be toggled.
	&xor	("edx","edx");
	&pushf	();
	&pop	("eax");
	&mov	("ecx","eax");
	&xor	("eax",1<<21);
	&push	("eax");
	&popf	();
	&pushf	();
	&pop	("eax");
	&xor	("ecx","eax");		# bits that actually changed
	&xor	("eax","eax");		# leaf 0; also the "no cpuid" result
	&bt	("ecx",21);
	&jnc	(&label("nocpuid"));
	&cpuid	();
	&mov	("edi","eax");		# max value for standard query level

	# Compare the vendor string with "GenuineIntel"; ebp collects the
	# mismatches.
	&xor	("eax","eax");
	&cmp	("ebx",0x756e6547);	# "Genu"
	&setne	(&LB("eax"));
	&mov	("ebp","eax");
	&cmp	("edx",0x49656e69);	# "ineI"
	&setne	(&LB("eax"));
	&or	("ebp","eax");
	&cmp	("ecx",0x6c65746e);	# "ntel"
	&setne	(&LB("eax"));
	&or	("ebp","eax");		# 0 indicates Intel CPU
	&jz	(&label("intel"));

	# Compare the vendor string with "AuthenticAMD"; anything else
	# takes the common path at "intel" with ebp left non-zero.
	&cmp	("ebx",0x68747541);	# "Auth"
	&setne	(&LB("eax"));
	&mov	("esi","eax");
	&cmp	("edx",0x69746E65);	# "enti"
	&setne	(&LB("eax"));
	&or	("esi","eax");
	&cmp	("ecx",0x444D4163);	# "cAMD"
	&setne	(&LB("eax"));
	&or	("esi","eax");		# 0 indicates AMD CPU
	&jnz	(&label("intel"));

	# AMD specific
	&mov	("eax",0x80000000);
	&cpuid	();
	&cmp	("eax",0x80000001);
	&jb	(&label("intel"));
	&mov	("esi","eax");		# max extended query level
	&mov	("eax",0x80000001);
	&cpuid	();
	&or	("ebp","ecx");
	&and	("ebp",1<<11|1);	# keep XOP bit; bit #0 stays set so
					# ebp remains non-zero (non-Intel)
	&cmp	("esi",0x80000008);
	&jb	(&label("intel"));

	&mov	("eax",0x80000008);
	&cpuid	();
	&movz	("esi",&LB("ecx"));	# number of cores - 1
	&inc	("esi");		# number of cores

	&mov	("eax",1);
	&xor	("ecx","ecx");
	&cpuid	();
	&bt	("edx",28);		# hyper-threading bit reported?
	&jnc	(&label("generic"));
	&shr	("ebx",16);
	&and	("ebx",0xff);		# bits 16..23 of ebx from leaf 1
	&cmp	("ebx","esi");
	&ja	(&label("generic"));	# exceeds the core count: keep bit
	&and	("edx",0xefffffff);	# clear hyper-threading bit
	&jmp	(&label("generic"));

&set_label("intel");
	&cmp	("edi",4);
	&mov	("edi",-1);		# mov leaves the flags of cmp intact
	&jb	(&label("nocacheinfo"));

	&mov	("eax",4);
	&mov	("ecx",0);		# query L1D
	&cpuid	();
	&mov	("edi","eax");
	&shr	("edi",14);
	&and	("edi",0xfff);		# number of cores -1 per L1D

&set_label("nocacheinfo");
	&mov	("eax",1);
	&xor	("ecx","ecx");
	&cpuid	();
	&and	("edx",0xbfefffff);	# force reserved bits #20, #30 to 0
	&cmp	("ebp",0);
	&jne	(&label("notintel"));
	&or	("edx",1<<30);		# set reserved bit#30 on Intel CPUs
	&and	(&HB("eax"),15);	# family ID
	&cmp	(&HB("eax"),15);	# P4?
	&jne	(&label("notintel"));
	&or	("edx",1<<20);		# set reserved bit#20 to engage RC4_CHAR
&set_label("notintel");
	&bt	("edx",28);		# test hyper-threading bit
	&jnc	(&label("generic"));
	&and	("edx",0xefffffff);	# tentatively clear it
	&cmp	("edi",0);
	&je	(&label("generic"));	# edi==0: leave the bit cleared

	&or	("edx",0x10000000);	# set it again ...
	&shr	("ebx",16);
	&cmp	(&LB("ebx"),1);		# ... if bits 16..23 of ebx exceed 1
	&ja	(&label("generic"));
	&and	("edx",0xefffffff);	# clear hyper-threading bit if not

&set_label("generic");
	&and	("ebp",1<<11);		# isolate AMD XOP flag
	&and	("ecx",0xfffff7ff);	# force 11th bit to 0
	&mov	("esi","edx");		# esi = adjusted EDX feature word
	&or	("ebp","ecx");		# merge AMD XOP flag

	&bt	("ecx",27);		# check OSXSAVE bit
	&jnc	(&label("clear_avx"));
	&xor	("ecx","ecx");
	&data_byte(0x0f,0x01,0xd0);	# xgetbv
	&and	("eax",6);		# keep bits #1 and #2 of the result
	&cmp	("eax",6);
	&je	(&label("done"));	# both set: leave everything as is
	&cmp	("eax",2);
	&je	(&label("clear_avx"));	# only bit #1 set: drop AVX family
&set_label("clear_xmm");
	&and	("ebp",0xfdfffffd);	# clear AESNI and PCLMULQDQ bits
	&and	("esi",0xfeffffff);	# clear FXSR
&set_label("clear_avx");
	&and	("ebp",0xefffe7ff);	# clear AVX, FMA and AMD XOP bits
&set_label("done");
	&mov	("eax","esi");
	&mov	("edx","ebp");
&set_label("nocpuid");
&function_end("OPENSSL_ia32_cpuid");

# The routines below read the capability vector OPENSSL_ia32cap_P, which
# is not defined by this module, so declare the symbol as external.
&external_label("OPENSSL_ia32cap_P");

# OPENSSL_rdtsc
#
# Returns the time-stamp counter in edx:eax, or zero when bit #4 of the
# first dword of OPENSSL_ia32cap_P is clear (no TSC, cf. the "no TSC"
# remark on the same test in OPENSSL_instrument_halt).
# The second function_begin_B argument looks like an assembler EXTRN
# declaration for the capability vector -- see x86asm.pl for how it is
# used.
&function_begin_B("OPENSSL_rdtsc","EXTRN\t_OPENSSL_ia32cap_P:DWORD");
	&xor	("eax","eax");		# default result: 0
	&xor	("edx","edx");
	&picmeup("ecx","OPENSSL_ia32cap_P");
	&bt	(&DWP(0,"ecx"),4);
	&jnc	(&label("notsc"));
	&rdtsc	();
&set_label("notsc");
	&ret	();
&function_end_B("OPENSSL_rdtsc");

# OPENSSL_instrument_halt
#
# This works in Ring 0 only [read DJGPP+MS-DOS+privileged DPMI host],
# but it's safe to call it on any [supported] 32-bit platform...
# Just check for [non-]zero return value...
#
# Returns in edx:eax the time-stamp counter difference measured across
# a single halt instruction. The result is zero when there is no TSC,
# when the code does not run with privilege level 0, or when interrupts
# are disabled.
&function_begin_B("OPENSSL_instrument_halt","EXTRN\t_OPENSSL_ia32cap_P:DWORD");
	&picmeup("ecx","OPENSSL_ia32cap_P");
	&bt	(&DWP(0,"ecx"),4);
	&jnc	(&label("nohalt"));	# no TSC

	&data_word(0x9058900e);		# push %cs; pop %eax
	&and	("eax",3);		# low two bits of the %cs selector
	&jnz	(&label("nohalt"));	# not enough privileges

	&pushf	();
	&pop	("eax");
	&bt	("eax",9);		# bit #9 of EFLAGS
	&jnc	(&label("nohalt"));	# interrupts are disabled

	&rdtsc	();
	&push	("edx");		# save the starting counter value
	&push	("eax");
	&halt	();
	&rdtsc	();

	&sub	("eax",&DWP(0,"esp"));	# 64-bit subtract: end - start
	&sbb	("edx",&DWP(4,"esp"));
	&add	("esp",8);		# drop the saved counter value
	&ret	();

&set_label("nohalt");
	&xor	("eax","eax");
	&xor	("edx","edx");
	&ret	();
&function_end_B("OPENSSL_instrument_halt");

# OPENSSL_far_spin
#
# Essentially there is only one use for this function. Under DJGPP:
#
#	#include <go32.h>
#	...
#	i=OPENSSL_far_spin(_dos_ds,0x46c);
#	...
# to obtain the number of spins till closest timer interrupt.
#
# The first argument is loaded into %ds and the second is used as the
# offset of a dword within that segment. The routine counts in eax how
# many times it re-reads that dword before its value changes. It returns
# zero without spinning when interrupts are disabled.

&function_begin_B("OPENSSL_far_spin");
	&pushf	();
	&pop	("eax");
	&bt	("eax",9);		# bit #9 of EFLAGS
	&jnc	(&label("nospin"));	# interrupts are disabled

	&mov	("eax",&DWP(4,"esp"));	# 1st arg: selector
	&mov	("ecx",&DWP(8,"esp"));	# 2nd arg: offset
	&data_word (0x90d88e1e);	# push %ds, mov %eax,%ds
	&xor	("eax","eax");		# spin counter
	&mov	("edx",&DWP(0,"ecx"));	# value to wait on
	&jmp	(&label("spin"));

	&align	(16);
&set_label("spin");
	&inc	("eax");
	&cmp	("edx",&DWP(0,"ecx"));
	&je	(&label("spin"));	# unchanged: keep spinning

	&data_word (0x1f909090);	# pop	%ds
	&ret	();

&set_label("nospin");
	&xor	("eax","eax");
	&xor	("edx","edx");
	&ret	();
&function_end_B("OPENSSL_far_spin");

# OPENSSL_wipe_cpu
#
# Zeroes eax and edx, and, depending on the first capability word, the
# x87/MMX register bank and (when built with -DOPENSSL_IA32_SSE2)
# xmm0..xmm7. Returns in eax the address esp+4, i.e. the address just
# above this routine's return address.
&function_begin_B("OPENSSL_wipe_cpu","EXTRN\t_OPENSSL_ia32cap_P:DWORD");
	&xor	("eax","eax");
	&xor	("edx","edx");
	&picmeup("ecx","OPENSSL_ia32cap_P");
	&mov	("ecx",&DWP(0,"ecx"));	# ecx = capability word (a value,
					# no longer a pointer)
	# Test the bit in the register: using ecx as a memory operand here
	# would dereference the capability word as if it were an address.
	# NOTE(review): bit #1 is what this code has always named; in the
	# CPUID.1:EDX layout the x87 FPU flag is bit #0 -- confirm which
	# bit is intended.
	&bt	("ecx",1);
	&jnc	(&label("no_x87"));
	if ($sse2) {
		&and	("ecx",1<<26|1<<24);	# check SSE2 and FXSR bits
		&cmp	("ecx",1<<26|1<<24);
		&jne	(&label("no_sse2"));
		for my $n (0..7) {
			&pxor	("xmm$n","xmm$n");
		}
	&set_label("no_sse2");
	}
	# just a bunch of fldz to zap the fp/mm bank followed by finit...
	&data_word(0xeed9eed9,0xeed9eed9,0xeed9eed9,0xeed9eed9,0x90e3db9b);
&set_label("no_x87");
	&lea	("eax",&DWP(4,"esp"));
	&ret	();
&function_end_B("OPENSSL_wipe_cpu");

# OPENSSL_atomic_add
#
# Atomically adds the second argument to the dword addressed by the
# first argument and returns the new value in eax. It is implemented as
# a lock cmpxchg retry loop.
&function_begin_B("OPENSSL_atomic_add");
	&mov	("edx",&DWP(4,"esp"));	# fetch the pointer, 1st arg
	&mov	("ecx",&DWP(8,"esp"));	# fetch the increment, 2nd arg
	&push	("ebx");
	&nop	();
	&mov	("eax",&DWP(0,"edx"));	# current value = expected value
&set_label("spin");
	&lea	("ebx",&DWP(0,"eax","ecx"));	# candidate = expected + increment
	&nop	();
	&data_word(0x1ab10ff0);	# lock;	cmpxchg	%ebx,(%edx)	# %eax is involved and is always reloaded
	&jne	(&label("spin"));	# the value changed meanwhile: retry
	&mov	("eax","ebx");	# OpenSSL expects the new value
	&pop	("ebx");
	&ret	();
&function_end_B("OPENSSL_atomic_add");

# OPENSSL_indirect_call
#
# This function can become handy under Win32 in situations when
# we don't know which calling convention, __stdcall or __cdecl(*),
# indirect callee is using. In C it can be deployed as
#
#ifdef OPENSSL_CPUID_OBJ
#	type OPENSSL_indirect_call(void *f,...);
#	...
#	OPENSSL_indirect_call(func,[up to $max arguments]);
#endif
#
# (*)	it's designed to work even for __fastcall if number of
#	arguments is 1 or 2!
#
# The routine copies $max argument slots to a fresh stack area, with the
# first two also left in ecx and edx. It calls the function pointer
# passed as first argument and then restores esp from ebp, so it does
# not matter whether the callee popped its arguments.
&function_begin_B("OPENSSL_indirect_call");
	{
	my $max=7;		# $max has to be chosen as 4*n-1
				# in order to preserve eventual
				# stack alignment
	&push	("ebp");
	&mov	("ebp","esp");
	&sub	("esp",$max*4);
	&mov	("ecx",&DWP(12,"ebp"));	# 1st forwarded argument
	&mov	(&DWP(0,"esp"),"ecx");
	&mov	("edx",&DWP(16,"ebp"));	# 2nd forwarded argument
	&mov	(&DWP(4,"esp"),"edx");
	for my $i (2..$max-1)
		{
		# Some copies will be redundant/bogus...
		&mov	("eax",&DWP(12+$i*4,"ebp"));
		&mov	(&DWP(0+$i*4,"esp"),"eax");
		}
	&call_ptr	(&DWP(8,"ebp"));# make the call...
	&mov	("esp","ebp");	# ... and just restore the stack pointer
				# without paying attention to what we called,
				# (__cdecl *func) or (__stdcall *one).
	&pop	("ebp");
	&ret	();
	}
&function_end_B("OPENSSL_indirect_call");

# OPENSSL_cleanse
#
# Overwrites a buffer with zero bytes. The first parameter is the buffer
# address and the second is the byte count. Fewer than 7 bytes are
# cleared one byte at a time. Longer buffers are first advanced to a
# 4-byte boundary, then cleared a dword at a time, and any remainder is
# finished by the byte loop.
&function_begin_B("OPENSSL_cleanse");
	&mov	("edx",&wparam(0));
	&mov	("ecx",&wparam(1));
	&xor	("eax","eax");		# the fill value
	&cmp	("ecx",7);
	&jae	(&label("lot"));
	&cmp	("ecx",0);
	&je	(&label("ret"));	# nothing to do
&set_label("little");
	&mov	(&BP(0,"edx"),"al");
	&sub	("ecx",1);
	&lea	("edx",&DWP(1,"edx"));	# lea keeps the flags set by sub
	&jnz	(&label("little"));
&set_label("ret");
	&ret	();

&set_label("lot",16);
	&test	("edx",3);
	&jz	(&label("aligned"));
	&mov	(&BP(0,"edx"),"al");	# byte stores up to the boundary
	&lea	("ecx",&DWP(-1,"ecx"));
	&lea	("edx",&DWP(1,"edx"));
	&jmp	(&label("lot"));
&set_label("aligned");
	&mov	(&DWP(0,"edx"),"eax");
	&lea	("ecx",&DWP(-4,"ecx"));
	&test	("ecx",-4);		# any bit above the low two still set?
	&lea	("edx",&DWP(4,"edx"));	# lea keeps the flags set by test
	&jnz	(&label("aligned"));
	&cmp	("ecx",0);
	&jne	(&label("little"));	# 1..3 trailing bytes
	&ret	();
&function_end_B("OPENSSL_cleanse");

# OPENSSL_ia32_rdrand
#
# Executes rdrand up to 8 times, stopping as soon as the carry flag is
# set, and returns the value in eax. When eax ends up 0 it is replaced
# by ecx, the remaining retry count. That count is itself 0 once all
# attempts are used up, so a zero return presumably signals failure --
# confirm against callers.
&function_begin_B("OPENSSL_ia32_rdrand");
	&mov	("ecx",8);		# retry budget
&set_label("loop");
	&rdrand	("eax");
	&jc	(&label("break"));
	&loop	(&label("loop"));	# decrements ecx, repeats while non-zero
&set_label("break");
	&cmp	("eax",0);
	&cmove	("eax","ecx");
	&ret	();
&function_end_B("OPENSSL_ia32_rdrand");

# Register OPENSSL_cpuid_setup through x86asm.pl's initseg helper
# (presumably as an initialization-segment entry so it runs at load
# time -- see x86asm.pl).
&initseg("OPENSSL_cpuid_setup");

# Finish and emit the generated assembly.
&asm_finish();