/* $Id: U3copy_from_user.S,v 1.1.1.1 2008/10/15 03:26:19 james26_jang Exp $
 * U3copy_from_user.S: UltraSparc-III optimized copy from userspace.
 *
 * Copyright (C) 1999, 2000 David S. Miller (davem@redhat.com)
 */

#ifdef __KERNEL__
#include <asm/visasm.h>
#include <asm/asi.h>
#include <asm/dcu.h>
#include <asm/spitfire.h>
#undef SMALL_COPY_USES_FPU
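
/* Each EX* macro below wraps one user load with an __ex_table entry.
 * On a fault the kernel branches to the .fixup stub, which leaves in
 * %o1 a byte count for U3cfu_fixup to zero out and return.  Note that
 * EXNV ("no VIS") is only used before VISEntryHalf has run, so its
 * fixup must not execute VISExitHalf; the other macros fault with the
 * FPU active and restore %fprs first.
 */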
#define EXNV(x,y,a,b)			\
98:	x,y;				\
	.section .fixup;		\
	.align 4;			\
99:	ba U3cfu_fixup;			\
	 a, b, %o1;			\
	.section __ex_table;		\
	.align 4;			\
	.word 98b, 99b;			\
	.text;				\
	.align 4;
#define EX(x,y,a,b)			\
98:	x,y;				\
	.section .fixup;		\
	.align 4;			\
99:	VISExitHalf;			\
	ba U3cfu_fixup;			\
	 a, b, %o1;			\
	.section __ex_table;		\
	.align 4;			\
	.word 98b, 99b;			\
	.text;				\
	.align 4;
#define EX2(x,y)			\
98:	x,y;				\
	.section .fixup;		\
	.align 4;			\
99:	VISExitHalf;			\
	and %o2, (0x40 - 1), %o1;	\
	add %o1, %o4, %o1;		\
	ba U3cfu_fixup;			\
	 add %o1, 0x1c0, %o1;		\
	.section __ex_table;		\
	.align 4;			\
	.word 98b, 99b;			\
	.text;				\
	.align 4;
#define EX3(x,y)			\
98:	x,y;				\
	.section .fixup;		\
	.align 4;			\
99:	VISExitHalf;			\
	and %o2, (0x40 - 1), %o1;	\
	sll %g3, 6, %g3;		\
	add %o1, 0x80, %o1;		\
	ba U3cfu_fixup;			\
	 add %o1, %g3, %o1;		\
	.section __ex_table;		\
	.align 4;			\
	.word 98b, 99b;			\
	.text;				\
	.align 4;
#define EX4(x,y)			\
98:	x,y;				\
	.section .fixup;		\
	.align 4;			\
99:	VISExitHalf;			\
	and %o2, (0x40 - 1), %o1;	\
	add %o1, 0x40, %o1;		\
	ba U3cfu_fixup;			\
	 add %o1, %g3, %o1;		\
	.section __ex_table;		\
	.align 4;			\
	.word 98b, 99b;			\
	.text;				\
	.align 4;
#else
#define ASI_BLK_P 0xf0
#define FPRS_FEF  0x04
#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#define SMALL_COPY_USES_FPU
#define EXNV(x,y,a,b)	x,y;
#define EX(x,y,a,b)	x,y;
#define EX2(x,y)	x,y;
#define EX3(x,y)	x,y;
#define EX4(x,y)	x,y;
#endif

	/* Special/non-trivial issues of this code:
	 *
	 * 1) %o5 is preserved from VISEntryHalf to VISExitHalf
	 * 2) Only low 32 FPU registers are used so that only the
	 *    lower half of the FPU register set is dirtied by this
	 *    code.  This is especially important in the kernel.
	 * 3) This code never prefetches cachelines past the end
	 *    of the source buffer.
	 */
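
	/* As a rough C-level sketch (illustrative only), the kernel
	 * entry point below behaves like:
	 *
	 *	unsigned long U3copy_from_user(void *dst,
	 *				       const void *src,
	 *				       unsigned long len)
	 *	{
	 *		... copy len bytes from userspace src to dst ...
	 *		return 0;		// full copy succeeded
	 *		// on a fault, U3cfu_fixup zeroes the uncopied
	 *		// tail of dst and returns the byte count left
	 *	}
	 */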

	.text
	.align	32

	/* The cheetah's flexible spine, oversized liver, enlarged heart,
	 * slender muscular body, and claws make it the swiftest hunter
	 * in Africa and the fastest animal on land.  Can reach speeds
	 * of up to 2.4GB per second.
	 */

	.globl		U3copy_from_user
U3copy_from_user: /* %o0=dst, %o1=src, %o2=len */
#ifndef __KERNEL__
	/* Save away original 'dst' for memcpy return value. */
	mov		%o0, %g3			! A0	Group
#endif
	/* Anything to copy at all? */
	cmp		%o2, 0				! A1
	ble,pn		%icc, U3copy_from_user_short_ret! BR

	/* Extremely small copy? */
	 cmp		%o2, 31				! A0	Group
	ble,pn		%icc, U3copy_from_user_short	! BR

	/* Large enough to use unrolled prefetch loops? */
	 cmp		%o2, 0x100			! A1
	bge,a,pt	%icc, U3copy_from_user_enter	! BR	Group
	 andcc		%o0, 0x3f, %g2			! A0

	ba,pt		%xcc, U3copy_from_user_toosmall	! BR	Group
	 andcc		%o0, 0x7, %g2			! A0

	.align		32
U3copy_from_user_short:
	/* Copy %o2 bytes from src to dst, one byte at a time. */
	EXNV(lduba [%o1 + 0x00] %asi, %o3, add %o2, %g0)! MS	Group
	add		%o1, 0x1, %o1			! A0
	add		%o0, 0x1, %o0			! A1
	subcc		%o2, 1, %o2			! A0	Group

	bg,pt		%icc, U3copy_from_user_short	! BR
	 stb		%o3, [%o0 + -1]			! MS	Group (1-cycle stall)

U3copy_from_user_short_ret:
#ifdef __KERNEL__
	retl						! BR	Group (0-4 cycle stall)
	 clr		%o0				! A0
#else
	retl						! BR	Group (0-4 cycle stall)
	 mov		%g3, %o0			! A0
#endif

	/* Here len >= 0x100 (at least four 64-byte blocks) and the
	 * condition codes reflect execution of "andcc %o0, 0x3f, %g2",
	 * done by the caller.
	 */
	.align		64
U3copy_from_user_enter:
	/* Is 'dst' already aligned on a 64-byte boundary? */
	be,pt		%xcc, 2f			! BR

	/* Compute abs((dst & 0x3f) - 0x40) into %g2.  This is the number
	 * of bytes to copy to make 'dst' 64-byte aligned.  We pre-
	 * subtract this from 'len'.
	 */
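	/* Worked example (illustration only): if dst & 0x3f == 0x07,
	 * then %g2 = 0x07 - 0x40 = -0x39, negated to 0x39, so 0x39
	 * bytes are copied below to reach the next 64-byte boundary.
	 */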
	 sub		%g2, 0x40, %g2			! A0	Group
	sub		%g0, %g2, %g2			! A0	Group
	sub		%o2, %g2, %o2			! A0	Group

	/* Copy %g2 bytes from src to dst, one byte at a time. */
1:	EXNV(lduba [%o1 + 0x00] %asi, %o3, add %o2, %g2)! MS	(Group)
	add		%o1, 0x1, %o1			! A1
	add		%o0, 0x1, %o0			! A0	Group
	subcc		%g2, 0x1, %g2			! A1

	bg,pt		%icc, 1b			! BR	Group
	 stb		%o3, [%o0 + -1]			! MS	Group

2:	VISEntryHalf					! MS+MS
	and		%o1, 0x7, %g1			! A1
	ba,pt		%xcc, U3copy_from_user_begin	! BR
	 alignaddr	%o1, %g0, %o1			! MS	      (Break-after)

	.align		64
U3copy_from_user_begin:
#ifdef __KERNEL__
	.globl		U3copy_from_user_nop_1_6
U3copy_from_user_nop_1_6:
	ldxa		[%g0] ASI_DCU_CONTROL_REG, %g3
	sethi		%uhi(DCU_PE), %o3
	sllx		%o3, 32, %o3
	or		%g3, %o3, %o3
	stxa		%o3, [%g0] ASI_DCU_CONTROL_REG	! Enable P-cache
	membar		#Sync
#endif
	prefetcha	[%o1 + 0x000] %asi, #one_read	! MS	Group1
	prefetcha	[%o1 + 0x040] %asi, #one_read	! MS	Group2
	andn		%o2, (0x40 - 1), %o4		! A0
	prefetcha	[%o1 + 0x080] %asi, #one_read	! MS	Group3
	cmp		%o4, 0x140			! A0
	prefetcha	[%o1 + 0x0c0] %asi, #one_read	! MS	Group4
	EX(ldda [%o1 + 0x000] %asi, %f0, add %o2, %g0)	! MS	Group5 (%f0 results at G8)
	bge,a,pt	%icc, 1f			! BR
	 prefetcha	[%o1 + 0x100] %asi, #one_read	! MS	Group6
1:	EX(ldda [%o1 + 0x008] %asi, %f2, add %o2, %g0)	! AX	       (%f2 results at G9)
	cmp		%o4, 0x180			! A1
	bge,a,pt	%icc, 1f			! BR
	 prefetcha	[%o1 + 0x140] %asi, #one_read	! MS	Group7
1:	EX(ldda [%o1 + 0x010] %asi, %f4, add %o2, %g0)	! AX	       (%f4 results at G10)
	cmp		%o4, 0x1c0			! A1
	bge,a,pt	%icc, 1f			! BR
	 prefetcha	[%o1 + 0x180] %asi, #one_read	! MS	Group8
1:	faligndata	%f0, %f2, %f16			! FGA	Group9 (%f16 at G12)
	EX(ldda [%o1 + 0x018] %asi, %f6, add %o2, %g0)	! AX	       (%f6 results at G12)
	faligndata	%f2, %f4, %f18			! FGA	Group10 (%f18 results at G13)
	EX(ldda [%o1 + 0x020] %asi, %f8, add %o2, %g0)	! MS	        (%f8 results at G13)
	faligndata	%f4, %f6, %f20			! FGA	Group12	(1-cycle stall,%f20 at G15)
	EX(ldda [%o1 + 0x028] %asi, %f10, add %o2, %g0)	! MS		(%f10 results at G15)
	faligndata	%f6, %f8, %f22			! FGA	Group13 (%f22 results at G16)

	EX(ldda [%o1 + 0x030] %asi, %f12, add %o2, %g0)	! MS		(%f12 results at G16)
	faligndata	%f8, %f10, %f24			! FGA	Group15 (1-cycle stall,%f24 at G18)
	EX(ldda [%o1 + 0x038] %asi, %f14, add %o2, %g0)	! MS		(%f14 results at G18)
	faligndata	%f10, %f12, %f26		! FGA	Group16 (%f26 results at G19)
	EX(ldda [%o1 + 0x040] %asi, %f0, add %o2, %g0)	! MS		(%f0 results at G19)

	/* We only use the first loop if len > (7 * 64). */
	subcc		%o4, 0x1c0, %o4			! A0	Group17
	bg,pt		%icc, U3copy_from_user_loop1	! BR
	 add		%o1, 0x40, %o1			! A1

	add		%o4, 0x140, %o4			! A0	Group18
	ba,pt		%xcc, U3copy_from_user_loop2	! BR
	 srl		%o4, 6, %o3			! A0	Group19
	nop
	nop
	nop
	nop
	nop

	nop
	nop

	/* This loop performs the copy and queues new prefetches.
	 * We drop into the second loop when len <= (5 * 64).  Note
	 * that this (5 * 64) factor has been subtracted from len
	 * already.
	 */
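	/* Worked example (illustration only): for len = 0x240, nine
	 * 64-byte blocks, %o4 arrives here as 0x240 - 0x1c0 = 0x80.
	 * This loop then stores two blocks, the second loop stores
	 * five more, and U3copy_from_user_loopfini stores the final
	 * two, covering all nine.
	 */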
U3copy_from_user_loop1:
	EX2(ldda [%o1 + 0x008] %asi, %f2)		! MS	Group2	(%f2 results at G5)
	faligndata	%f12, %f14, %f28		! FGA		(%f28 results at G5)
	EX2(ldda [%o1 + 0x010] %asi, %f4)		! MS	Group3	(%f4 results at G6)
	faligndata	%f14, %f0, %f30			! FGA	Group4	(1-cycle stall, %f30 at G7)
	stda		%f16, [%o0] ASI_BLK_P		! MS
	EX2(ldda [%o1 + 0x018] %asi, %f6)		! AX		(%f6 results at G7)

	faligndata	%f0, %f2, %f16			! FGA	Group12 (7-cycle stall)
	EX2(ldda [%o1 + 0x020] %asi, %f8)		! MS	        (%f8 results at G15)
	faligndata	%f2, %f4, %f18			! FGA	Group13	(%f18 results at G16)
	EX2(ldda [%o1 + 0x028] %asi, %f10)		! MS		(%f10 results at G16)
	faligndata	%f4, %f6, %f20			! FGA	Group14	(%f20 results at G17)
	EX2(ldda [%o1 + 0x030] %asi, %f12)		! MS		(%f12 results at G17)
	faligndata	%f6, %f8, %f22			! FGA	Group15	(%f22 results at G18)
	EX2(ldda [%o1 + 0x038] %asi, %f14)		! MS		(%f14 results at G18)

	faligndata	%f8, %f10, %f24			! FGA	Group16	(%f24 results at G19)
	EX2(ldda [%o1 + 0x040] %asi, %f0)		! AX		(%f0 results at G19)
	prefetcha	[%o1 + 0x180] %asi, #one_read	! MS
	faligndata	%f10, %f12, %f26		! FGA	Group17	(%f26 results at G20)
	subcc		%o4, 0x40, %o4			! A0
	add		%o1, 0x40, %o1			! A1
	bg,pt		%xcc, U3copy_from_user_loop1	! BR
	 add		%o0, 0x40, %o0			! A0	Group18

U3copy_from_user_loop2_enter:
	mov		5, %o3				! A1

	/* This loop performs the copy only; no new prefetches are
	 * queued.  We do things this way so that we do not perform
	 * any spurious prefetches past the end of the src buffer.
	 */
U3copy_from_user_loop2:
	EX3(ldda [%o1 + 0x008] %asi, %f2)		! MS
	faligndata	%f12, %f14, %f28		! FGA	Group2
	EX3(ldda [%o1 + 0x010] %asi, %f4)		! MS
	faligndata	%f14, %f0, %f30			! FGA	Group4	(1-cycle stall)
	stda		%f16, [%o0] ASI_BLK_P		! MS
	EX3(ldda [%o1 + 0x018] %asi, %f6)		! AX
	faligndata	%f0, %f2, %f16			! FGA	Group12 (7-cycle stall)

	EX3(ldda [%o1 + 0x020] %asi, %f8)		! MS
	faligndata	%f2, %f4, %f18			! FGA	Group13
	EX3(ldda [%o1 + 0x028] %asi, %f10)		! MS
	faligndata	%f4, %f6, %f20			! FGA	Group14
	EX3(ldda [%o1 + 0x030] %asi, %f12)		! MS
	faligndata	%f6, %f8, %f22			! FGA	Group15
	EX3(ldda [%o1 + 0x038] %asi, %f14)		! MS
	faligndata	%f8, %f10, %f24			! FGA	Group16

	EX3(ldda [%o1 + 0x040] %asi, %f0)		! AX
	faligndata	%f10, %f12, %f26		! FGA	Group17
	subcc		%o3, 0x01, %o3			! A0
	add		%o1, 0x40, %o1			! A1
	bg,pt		%xcc, U3copy_from_user_loop2	! BR
	 add		%o0, 0x40, %o0			! A0	Group18

	/* Finally we copy the last full 64-byte block. */
U3copy_from_user_loopfini:
	EX3(ldda [%o1 + 0x008] %asi, %f2)		! MS
	faligndata	%f12, %f14, %f28		! FGA
	EX3(ldda [%o1 + 0x010] %asi, %f4)		! MS	Group19
	faligndata	%f14, %f0, %f30			! FGA
	stda		%f16, [%o0] ASI_BLK_P		! MS	Group20
	EX3(ldda [%o1 + 0x018] %asi, %f6)		! AX
	faligndata	%f0, %f2, %f16			! FGA	Group11 (7-cycle stall)
	EX3(ldda [%o1 + 0x020] %asi, %f8)		! MS
	faligndata	%f2, %f4, %f18			! FGA	Group12
	EX3(ldda [%o1 + 0x028] %asi, %f10)		! MS
	faligndata	%f4, %f6, %f20			! FGA	Group13
	EX3(ldda [%o1 + 0x030] %asi, %f12)		! MS
	faligndata	%f6, %f8, %f22			! FGA	Group14
	EX3(ldda [%o1 + 0x038] %asi, %f14)		! MS
	faligndata	%f8, %f10, %f24			! FGA	Group15
	cmp		%g1, 0				! A0
	be,pt		%icc, 1f			! BR
	 add		%o0, 0x40, %o0			! A1
	EX4(ldda [%o1 + 0x040] %asi, %f0)		! MS
1:	faligndata	%f10, %f12, %f26		! FGA	Group16
	faligndata	%f12, %f14, %f28		! FGA	Group17
	faligndata	%f14, %f0, %f30			! FGA	Group18
	stda		%f16, [%o0] ASI_BLK_P		! MS
	add		%o0, 0x40, %o0			! A0
	add		%o1, 0x40, %o1			! A1
#ifdef __KERNEL__
	.globl		U3copy_from_user_nop_2_3
U3copy_from_user_nop_2_3:
	mov		PRIMARY_CONTEXT, %o3
	stxa		%g0, [%o3] ASI_DMMU		! Flush P-cache
	stxa		%g3, [%g0] ASI_DCU_CONTROL_REG	! Disable P-cache
#endif
	membar		#Sync				! MS	Group26 (7-cycle stall)

	/* Now we copy the (len modulo 64) bytes at the end.
	 * Note how we borrow the %f0 loaded above.
	 *
	 * Also notice how this code is careful not to perform a
	 * load past the end of the src buffer just like similar
	 * code found in U3copy_from_user_toosmall processing.
	 */
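	/* Worked example (illustration only): if len % 64 = 0x1d,
	 * %g2 starts as 0x18 and is pre-decremented to 0x10 below,
	 * so two 8-byte chunks move through the FPU and the last
	 * 0x0d bytes fall to the byte-at-a-time cruft loop.
	 */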
U3copy_from_user_loopend:
	and		%o2, 0x3f, %o2			! A0	Group
	andcc		%o2, 0x38, %g2			! A0	Group
	be,pn		%icc, U3copy_from_user_endcruft	! BR
	 subcc		%g2, 0x8, %g2			! A1
	be,pn		%icc, U3copy_from_user_endcruft	! BR	Group
	 cmp		%g1, 0				! A0

	be,a,pt		%icc, 1f			! BR	Group
	 EX(ldda [%o1 + 0x00] %asi, %f0, add %o2, %g0)	! MS

1:	EX(ldda [%o1 + 0x08] %asi, %f2, add %o2, %g0)	! MS	Group
	add		%o1, 0x8, %o1			! A0
	sub		%o2, 0x8, %o2			! A1
	subcc		%g2, 0x8, %g2			! A0	Group
	faligndata	%f0, %f2, %f8			! FGA	Group
	std		%f8, [%o0 + 0x00]		! MS	(XXX does it stall here? XXX)
	be,pn		%icc, U3copy_from_user_endcruft	! BR
	 add		%o0, 0x8, %o0			! A0
	EX(ldda [%o1 + 0x08] %asi, %f0, add %o2, %g0)	! MS	Group
	add		%o1, 0x8, %o1			! A0
	sub		%o2, 0x8, %o2			! A1
	subcc		%g2, 0x8, %g2			! A0	Group
	faligndata	%f2, %f0, %f8			! FGA
	std		%f8, [%o0 + 0x00]		! MS	(XXX does it stall here? XXX)
	bne,pn		%icc, 1b			! BR
	 add		%o0, 0x8, %o0			! A0	Group

	/* If anything is left, we copy it one byte at a time.
	 * Note that %g1 is (src & 0x7) saved above before the
	 * alignaddr was performed.
	 */
U3copy_from_user_endcruft:
	cmp		%o2, 0
	add		%o1, %g1, %o1
	VISExitHalf
	be,pn		%icc, U3copy_from_user_short_ret
	 nop
	ba,a,pt		%xcc, U3copy_from_user_short

	/* If we get here, then 32 <= len < 0x100 (i.e. less than (4 * 64)) */
U3copy_from_user_toosmall:

#ifdef SMALL_COPY_USES_FPU

	/* Is 'dst' already aligned on an 8-byte boundary? */
	be,pt		%xcc, 2f			! BR	Group

	/* Compute abs((dst & 7) - 8) into %g2.  This is the number
	 * of bytes to copy to make 'dst' 8-byte aligned.  We pre-
	 * subtract this from 'len'.
	 */
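	/* Worked example (illustration only): if dst & 0x7 == 0x3,
	 * then %g2 = 0x3 - 0x8 = -0x5, negated to 0x5, so five bytes
	 * are copied below to reach the next 8-byte boundary.
	 */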
	 sub		%g2, 0x8, %g2			! A0
	sub		%g0, %g2, %g2			! A0	Group (reg-dep)
	sub		%o2, %g2, %o2			! A0	Group (reg-dep)

	/* Copy %g2 bytes from src to dst, one byte at a time. */
1:	EXNV(lduba [%o1 + 0x00] %asi, %o3, add %o2, %g2)! MS	(Group) (%o3 in 3 cycles)
	add		%o1, 0x1, %o1			! A1
	add		%o0, 0x1, %o0			! A0	Group
	subcc		%g2, 0x1, %g2			! A1

	bg,pt		%icc, 1b			! BR	Group
	 stb		%o3, [%o0 + -1]			! MS	Group

2:	VISEntryHalf					! MS+MS

	/* Compute (len - (len % 8)) into %g2.  This is guaranteed
	 * to be nonzero.
	 */
	andn		%o2, 0x7, %g2			! A0	Group

	/* You may read this and believe that it allows reading
	 * one 8-byte longword past the end of src.  It actually
	 * does not, as %g2 is subtracted as loads are done from
	 * src, so we always stop before running off the end.
	 * Also, we are guaranteed to have at least 0x10 bytes
	 * to move here.
	 */
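	/* Worked example (illustration only): with %o2 = 0x1e left,
	 * %g2 = 0x18 - 0x8 = 0x10, so the loop below issues loads at
	 * aligned offsets 0x00, 0x08, and 0x10 only; the last read
	 * ends at (src & ~7) + 0x18, which is <= src + len.
	 */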
	sub		%g2, 0x8, %g2			! A0	Group (reg-dep)
	alignaddr	%o1, %g0, %g1			! MS	      (Break-after)
	EX(ldda [%g1 + 0x00] %asi, %f0, add %o2, %g0)	! MS	Group (1-cycle stall)
	add		%g1, 0x8, %g1			! A0

1:	EX(ldda [%g1 + 0x00] %asi, %f2, add %o2, %g0)	! MS	Group
	add		%g1, 0x8, %g1			! A0
	sub		%o2, 0x8, %o2			! A1
	subcc		%g2, 0x8, %g2			! A0	Group

	faligndata	%f0, %f2, %f8			! FGA	Group (1-cycle stall)
	std		%f8, [%o0 + 0x00]		! MS	Group (2-cycle stall)
	add		%o1, 0x8, %o1			! A0
	be,pn		%icc, 2f			! BR

	 add		%o0, 0x8, %o0			! A1
	EX(ldda [%g1 + 0x00] %asi, %f0, add %o2, %g0)	! MS	Group
	add		%g1, 0x8, %g1			! A0
	sub		%o2, 0x8, %o2			! A1

	subcc		%g2, 0x8, %g2			! A0	Group
	faligndata	%f2, %f0, %f8			! FGA	Group (1-cycle stall)
	std		%f8, [%o0 + 0x00]		! MS	Group (2-cycle stall)
	add		%o1, 0x8, %o1			! A0

	bne,pn		%icc, 1b			! BR
	 add		%o0, 0x8, %o0			! A1

	/* Nothing left to copy? */
2:	cmp		%o2, 0				! A0	Group
	VISExitHalf					! A0+MS
	be,pn		%icc, U3copy_from_user_short_ret! BR	Group
	 nop						! A0
	ba,a,pt		%xcc, U3copy_from_user_short	! BR	Group

#else /* !(SMALL_COPY_USES_FPU) */

	xor		%o1, %o0, %g2
	andcc		%g2, 0x7, %g0
	bne,pn		%icc, U3copy_from_user_short
	 andcc		%o1, 0x7, %g2

	be,pt		%xcc, 2f
	 sub		%g2, 0x8, %g2
	sub		%g0, %g2, %g2
	sub		%o2, %g2, %o2

1:	EXNV(lduba [%o1 + 0x00] %asi, %o3, add %o2, %g2)
	add		%o1, 0x1, %o1
	add		%o0, 0x1, %o0
	subcc		%g2, 0x1, %g2
	bg,pt		%icc, 1b
	 stb		%o3, [%o0 + -1]

2:	andn		%o2, 0x7, %g2
	sub		%o2, %g2, %o2

3:	EXNV(ldxa [%o1 + 0x00] %asi, %o3, add %o2, %g2)
	add		%o1, 0x8, %o1
	add		%o0, 0x8, %o0
	subcc		%g2, 0x8, %g2
	bg,pt		%icc, 3b
	 stx		%o3, [%o0 + -8]

	cmp		%o2, 0
	bne,pn		%icc, U3copy_from_user_short
	 nop
	ba,a,pt		%xcc, U3copy_from_user_short_ret

#endif /* !(SMALL_COPY_USES_FPU) */

#ifdef __KERNEL__
	.globl		U3cfu_fixup
U3cfu_fixup:
	/* Since this is copy_from_user(), zero out the rest of the
	 * kernel buffer.
	 */
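	/* Roughly, in illustrative C (%o1 = bytes left, %o0 = next
	 * unwritten dst byte):
	 *
	 *	if (remaining > 0)
	 *		memset(dst, 0, remaining);
	 *	return remaining;
	 */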
	cmp		%o1, 0
	ble,pn		%icc, 2f
	 mov		%o1, %g2

1:	subcc		%g2, 1, %g2
	stb		%g0, [%o0]
	bne,pt		%icc, 1b
	 add		%o0, 1, %o0

2:	retl
	 mov		%o1, %o0
#endif
