/*
 * arch/alpha/lib/ev6-strncpy_from_user.S
 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
 *
 * Just like strncpy except in the return value:
 *
 *	-EFAULT       if an exception occurs before the terminator is copied.
 *	N             if the buffer filled.
 *
 * Otherwise the length of the string is returned.
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * A bunch of instructions got moved and temp registers were changed
 * to aid in scheduling.  Control flow was also re-arranged to eliminate
 * branches, and to provide longer code sequences to enable better scheduling.
 * A total rewrite (using byte load/stores for start & tail sequences)
 * is desirable, but very difficult to do without a from-scratch rewrite.
 * Save that for the future.
 */
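
/*
 * In C terms the entry point behaves roughly like this (a sketch; the
 * authoritative declaration lives in asm/uaccess.h):
 *
 *	long __strncpy_from_user(char *to, const char *from, long count);
 *
 *	len = __strncpy_from_user(kbuf, ubuf, sizeof(kbuf));
 *	if (len < 0)			// faulted before the NUL arrived
 *		return len;		// -EFAULT
 *	else if (len == sizeof(kbuf))	// buffer filled; may be unterminated
 *		...
 */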


#include <asm/errno.h>
#include <asm/regdef.h>


/* Allow an exception for an insn; exit if we get one.  */
#define EX(x,y...)			\
	99: x,##y;			\
	.section __ex_table,"a";	\
	.long 99b - .;			\
	lda $31, $exception-99b($0);	\
	.previous

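/*
 * Each EX() use emits one __ex_table entry: the faulting insn as a
 * self-relative offset, plus a fixup encoded as an lda whose 16-bit
 * displacement is the distance from the faulting insn to $exception
 * and whose register fields say where the error value goes.  A sketch
 * of the layout the fault handler decodes (see asm/uaccess.h for the
 * real definition):
 *
 *	struct exception_table_entry {
 *		signed int insn;		// 99b - .
 *		union exception_fixup {
 *			unsigned unit;		// the lda, as bit fields:
 *			struct {
 *				signed   int nextinsn : 16; // $exception - 99b
 *				unsigned int errreg   : 5;  // $0 gets -EFAULT
 *				unsigned int valreg   : 5;  // $31: nothing zeroed
 *			} bits;
 *		} fixup;
 *	};
 */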

	.set noat
	.set noreorder
	.text

	.globl __strncpy_from_user
	.ent __strncpy_from_user
	.frame $30, 0, $26
	.prologue 0

	.align 4
__strncpy_from_user:
	and	a0, 7, t3	# E : find dest misalignment
	beq	a2, $zerolength	# U :

	/* Are source and destination co-aligned?  */
	mov	a0, v0		# E : save the string start
	xor	a0, a1, t4	# E :
	EX( ldq_u t1, 0(a1) )	# L : Latency=3 load first quadword
	ldq_u	t0, 0(a0)	# L : load first (partial) aligned dest quadword

	addq	a2, t3, a2	# E : bias count by dest misalignment
	subq	a2, 1, a3	# E :
	addq	zero, 1, t10	# E :
	and	t4, 7, t4	# E : misalignment between the two

	and	a3, 7, t6	# E : number of tail bytes
	sll	t10, t6, t10	# U : t10 = bitmask of last count byte
	bne	t4, $unaligned	# U :
	lda	t2, -1		# E : build a mask against false zero

	/*
	 * We are co-aligned; take care of a partial first word.
	 * On entry to this basic block:
	 * t0 == the first destination word for masking back in
	 * t1 == the first source word.
	 */

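	/*
	 * As a C sketch, with ofs = (long)a0 & 7 (== (long)a1 & 7 here):
	 * the first output quad keeps the low ofs bytes of the original
	 * dest and takes the rest from the source:
	 *
	 *	unsigned long keep = (1UL << 8*ofs) - 1;    // low dest bytes
	 *	out = (dst_quad & keep) | (src_quad & ~keep);
	 *
	 * mskql/mskqh below derive the two halves straight from the
	 * address, and ornot forces the masked-off source bytes non-zero
	 * so cmpbge cannot see a false terminator in them.
	 */
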
	srl	a3, 3, a2	# U : a2 = loop counter = (count - 1)/8
	addq	a1, 8, a1	# E :
	mskqh	t2, a1, t2	# U :   detection in the src word
	nop

	/* Create the 1st output word and detect 0's in the 1st input word.  */
	mskqh	t1, a1, t3	# U :
	mskql	t0, a1, t0	# U : assemble the first output word
	ornot	t1, t2, t2	# E :
	nop

	cmpbge	zero, t2, t8	# E : bits set iff null found
	or	t0, t3, t0	# E :
	beq	a2, $a_eoc	# U :
	bne	t8, $a_eos	# U : 2nd branch in a quad.  Bad.

	/* On entry to this basic block:
	 * t0 == a source quad not containing a null.
	 * a0 - current aligned destination address
	 * a1 - current aligned source address
	 * a2 - count of quadwords to move.
	 * NOTE: Loop improvement - unrolling this is going to be
	 *	a huge win, since we're going to stall otherwise.
	 *	Fix this later.  For _really_ large copies, look
	 *	at using wh64 on a look-ahead basis.  See the code
	 *	in clear_user.S and copy_user.S.
	 *	Presumably, since (a0) and (a1) do not overlap (by C
	 *	definition), such aggressive reordering would be safe.
	 * Lots of nops here:
	 *	- Separate loads from stores
	 *	- Keep it to 1 branch/quadpack so the branch predictor
	 *	  can train.
	 */
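	/*
	 * The loop below, as a C sketch (has_zero_byte() stands in for
	 * the cmpbge test):
	 *
	 *	for (;;) {
	 *		*dst++ = quad;			// stq_u
	 *		quad = *src++;			// EX( ldq_u )
	 *		if (--count == 0)		// beq a2, $a_eoc
	 *			goto end_of_count;
	 *		if (has_zero_byte(quad))	// fall into $a_eos
	 *			break;
	 *	}
	 */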
$a_loop:
	stq_u	t0, 0(a0)	# L :
	addq	a0, 8, a0	# E :
	nop
	subq	a2, 1, a2	# E :

	EX( ldq_u t0, 0(a1) )	# L :
	addq	a1, 8, a1	# E :
	cmpbge	zero, t0, t8	# E : Stall 2 cycles on t0
	beq	a2, $a_eoc	# U :

	beq	t8, $a_loop	# U :
	nop
	nop
	nop

	/* Take care of the final (partial) word store.  At this point
	 * the end-of-count bit is set in t8 iff it applies.
	 *
	 * On entry to this basic block we have:
	 * t0 == the source word containing the null
	 * t8 == the cmpbge mask that found it.
	 */
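	/*
	 * The negq/and pair is the usual isolate-lowest-set-bit trick;
	 * in C terms (a sketch):
	 *
	 *	low  = t8 & -t8;	// bit of the first null byte only
	 *	keep = low | (low - 1);	// bytes up to and including it
	 *
	 * zapnot/zap then widen each bit of "keep" to a full byte, so the
	 * source supplies everything through the null and the original
	 * dest supplies everything after it.
	 */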
$a_eos:
	negq	t8, t12		# E : find low bit set
	and	t8, t12, t12	# E :

	/* We're doing a partial word store and so need to combine
	   our source and original destination words.  */
	ldq_u	t1, 0(a0)	# L :
	subq	t12, 1, t6	# E :

	or	t12, t6, t8	# E :
	zapnot	t0, t8, t0	# U : clear src bytes > null
	zap	t1, t8, t1	# U : clear dst bytes <= null
	or	t0, t1, t0	# E :

	stq_u	t0, 0(a0)	# L :
	br	$finish_up	# L0 :
	nop
	nop

	/* Add the end-of-count bit to the eos detection bitmask.  */
	.align 4
$a_eoc:
	or	t10, t8, t8
	br	$a_eos
	nop
	nop


/* The source and destination are not co-aligned.  Align the destination
   and cope.  We have to be very careful about not reading too much and
   causing a SEGV.  */

	.align 4
$u_head:
	/* We know just enough now to be able to assemble the first
	   full source word.  We can still find a zero at the end of it
	   that prevents us from outputting the whole thing.

	   On entry to this basic block:
	   t0 == the first dest word, unmasked
	   t1 == the shifted low bits of the first source word
	   t6 == bytemask that is -1 in dest word bytes */

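	/*
	 * Unaligned fetch, as a C sketch, with sh = 8 * ((long)a1 & 7)
	 * (nonzero on this path):
	 *
	 *	val = (lo_quad >> sh) | (hi_quad << (64 - sh));
	 *
	 * extql already supplied the first term (t1); the extqh below
	 * supplies the second from the next source quad.
	 */
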
	EX( ldq_u t2, 8(a1) )	# L : load second src word
	addq	a1, 8, a1	# E :
	mskql	t0, a0, t0	# U : mask trailing garbage in dst
	extqh	t2, a1, t4	# U :

	or	t1, t4, t1	# E : first aligned src word complete
	mskqh	t1, a0, t1	# U : mask leading garbage in src
	or	t0, t1, t0	# E : first output word complete
	or	t0, t6, t6	# E : mask original data for zero test

	cmpbge	zero, t6, t8	# E :
	beq	a2, $u_eocfin	# U :
	bne	t8, $u_final	# U : bad news - 2nd branch in a quad
	lda	t6, -1		# E : mask out the bits we have

	mskql	t6, a1, t6	# U :   already seen
	stq_u	t0, 0(a0)	# L : store first output word
	or	t6, t2, t2	# E :
	cmpbge	zero, t2, t8	# E : find nulls in second partial

	addq	a0, 8, a0		# E :
	subq	a2, 1, a2		# E :
	bne	t8, $u_late_head_exit	# U :
	nop

	/* Finally, we've got all the stupid leading edge cases taken care
	   of and we can set up to enter the main loop.  */

	extql	t2, a1, t1	# U : position hi-bits of lo word
	EX( ldq_u t2, 8(a1) )	# L : read next high-order source word
	addq	a1, 8, a1	# E :
	cmpbge	zero, t2, t8	# E :

	beq	a2, $u_eoc	# U :
	bne	t8, $u_eos	# U :
	nop
	nop

	/* Unaligned copy main loop.  In order to avoid reading too much,
	   the loop is structured to detect zeros in aligned source words.
	   This has, unfortunately, effectively pulled half of a loop
	   iteration out into the head and half into the tail, but it does
	   prevent nastiness from accumulating in the very thing we want
	   to run as fast as possible.

	   On entry to this basic block:
	   t1 == the shifted high-order bits from the previous source word
	   t2 == the unshifted current source word

	   We further know that t2 does not contain a null terminator.  */

	/*
	 * Extra nops here:
	 *	separate load quads from store quads
	 *	only one branch/quad to permit predictor training
	 */

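	/*
	 * One iteration, as a C sketch (hi_part/lo_part stand in for
	 * extqh/extql against the source alignment):
	 *
	 *	for (;;) {
	 *		out   = carry | hi_part(cur);	// extqh + or
	 *		carry = lo_part(cur);		// extql, for next pass
	 *		cur   = *++src;			// EX( ldq_u )
	 *		*dst++ = out;			// stq_u
	 *		if (--count == 0)		// beq a2, $u_eoc
	 *			goto end_of_count;
	 *		if (has_zero_byte(cur))		// fall into $u_eos
	 *			break;
	 *	}
	 */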
	.align 4
$u_loop:
	extqh	t2, a1, t0	# U : extract high bits for current word
	addq	a1, 8, a1	# E :
	extql	t2, a1, t3	# U : extract low bits for next time
	addq	a0, 8, a0	# E :

	or	t0, t1, t0	# E : current dst word now complete
	EX( ldq_u t2, 0(a1) )	# L : load high word for next time
	subq	a2, 1, a2	# E :
	nop

	stq_u	t0, -8(a0)	# L : save the current word
	mov	t3, t1		# E :
	cmpbge	zero, t2, t8	# E : test new word for eos
	beq	a2, $u_eoc	# U :

	beq	t8, $u_loop	# U :
	nop
	nop
	nop

	/* We've found a zero somewhere in the source word we just read.
	   If it resides in the lower half, we have one (probably partial)
	   word to write out, and if it resides in the upper half, we
	   have one full and one partial word left to write out.

	   On entry to this basic block:
	   t1 == the shifted high-order bits from the previous source word
	   t2 == the unshifted current source word.  */
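	/*
	 * As a sketch of the two cases:
	 *
	 *	out = carry | hi_part(cur);	// may already hold the null
	 *	if (!has_zero_byte(out)) {	// null is in cur's other half
	 *		*dst++ = out; count--;
	 *		out = lo_part(cur);	// $u_late_head_exit
	 *	}
	 *	// fall into $u_final with the null (or end-of-count) in out
	 */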
	.align 4
$u_eos:
	extqh	t2, a1, t0	# U :
	or	t0, t1, t0	# E : first (partial) source word complete
	cmpbge	zero, t0, t8	# E : is the null in this first word?
	nop

	bne	t8, $u_final	# U :
	stq_u	t0, 0(a0)	# L : the null was in the high-order bits
	addq	a0, 8, a0	# E :
	subq	a2, 1, a2	# E :

	.align 4
$u_late_head_exit:
	extql	t2, a1, t0	# U :
	cmpbge	zero, t0, t8	# E :
	or	t8, t10, t6	# E :
	cmoveq	a2, t6, t8	# E :

	/* Take care of a final (probably partial) result word.
	   On entry to this basic block:
	   t0 == assembled source word
	   t8 == cmpbge mask that found the null.  */
	.align 4
$u_final:
	negq	t8, t6		# E : isolate low bit set
	and	t6, t8, t12	# E :
	ldq_u	t1, 0(a0)	# L :
	subq	t12, 1, t6	# E :

	or	t6, t12, t8	# E :
	zapnot	t0, t8, t0	# U : kill source bytes > null
	zap	t1, t8, t1	# U : kill dest bytes <= null
	or	t0, t1, t0	# E :

	stq_u	t0, 0(a0)	# L :
	br	$finish_up	# L0 :
	nop
	nop

	.align 4
$u_eoc:				# end-of-count
	extqh	t2, a1, t0	# U :
	or	t0, t1, t0	# E :
	cmpbge	zero, t0, t8	# E :
	nop

	.align 4
$u_eocfin:			# end-of-count, final word
	or	t10, t8, t8	# E :
	br	$u_final	# L0 :
	nop
	nop

	/* Unaligned copy entry point.  */
	.align 4
$unaligned:

	srl	a3, 3, a2	# U : a2 = loop counter = (count - 1)/8
	and	a0, 7, t4	# E : find dest misalignment
	and	a1, 7, t5	# E : find src misalignment
	mov	zero, t0	# E :

	/* Conditionally load the first destination word and a bytemask
	   with 0xff indicating that the destination byte is sacrosanct.  */

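	/*
	 * In C terms (a sketch): nothing is loaded when the dest is
	 * already aligned, so no byte of it is preserved:
	 *
	 *	t0 = dst_ofs ? *aligned_dst : 0;
	 *	t6 = dst_ofs ? (1UL << 8*dst_ofs) - 1 : 0;  // sacrosanct bytes
	 */
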
	mov	zero, t6	# E :
	beq	t4, 1f		# U :
	ldq_u	t0, 0(a0)	# L :
	lda	t6, -1		# E :

	mskql	t6, a0, t6	# U :
	nop
	nop
	nop

	.align 4
1:
	subq	a1, t4, a1	# E : sub dest misalignment from src addr
	/* If source misalignment is larger than dest misalignment, we need
	   extra startup checks to avoid SEGV.  */
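	/* E.g. dst&7 == 1, src&7 == 6: the already-loaded quad at 0(a1)
	   holds only two bytes of the string, so a short string could end
	   within it and $u_head's load at 8(a1) might touch an unmapped
	   page.  */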
	cmplt	t4, t5, t12	# E :
	extql	t1, a1, t1	# U : shift src into place
	lda	t2, -1		# E : for creating masks later

	beq	t12, $u_head	# U :
	mskqh	t2, t5, t2	# U : begin src byte validity mask
	cmpbge	zero, t1, t8	# E : is there a zero?
	nop

	extql	t2, a1, t2	# U :
	or	t8, t10, t5	# E : test for end-of-count too
	cmpbge	zero, t2, t3	# E :
	cmoveq	a2, t5, t8	# E : Latency=2, extra map slot

	nop			# E : goes with cmov
	andnot	t8, t3, t8	# E :
	beq	t8, $u_head	# U :
	nop

	/* At this point we've found a zero in the first partial word of
	   the source.  We need to isolate the valid source data and mask
	   it into the original destination data.  (Incidentally, we know
	   that we'll need at least one byte of that original dest word.) */

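	/*
	 * Same trick as $a_eos, except the byte mask is applied to both
	 * the source word and its validity mask, so only bytes through
	 * the null displace the original dest.  As a C sketch (zapnot
	 * widens each mask bit to a full byte in hardware):
	 *
	 *	keep  = (t8 & -t8) | ((t8 & -t8) - 1);	// through the null
	 *	src  &= keep;  valid &= keep;		// the two zapnots
	 *	dst   = (dst & ~valid) | src;		// andnot + or
	 */
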
	ldq_u	t0, 0(a0)	# L :
	negq	t8, t6		# E : build bitmask of bytes <= zero
	mskqh	t1, t4, t1	# U :
	and	t6, t8, t12	# E :

	subq	t12, 1, t6	# E :
	or	t6, t12, t8	# E :
	zapnot	t2, t8, t2	# U : trim validity mask past the null
	zapnot	t1, t8, t1	# U : trim source word past the null

	andnot	t0, t2, t0	# E : zero place for source to reside
	or	t0, t1, t0	# E : and put it there
	stq_u	t0, 0(a0)	# L :
	nop

	.align 4
$finish_up:
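	/*
	 * t12 has exactly one bit set, marking the last byte written.
	 * The cmovs below binary-search for its byte offset without
	 * branching; in C terms (a sketch):
	 *
	 *	off  = (t12 & 0xf0) ? 4 : 0;
	 *	off += (t12 & 0xcc) ? 2 : 0;
	 *	off += (t12 & 0xaa) ? 1 : 0;
	 *	len  = (aligned_dst + off + last_byte_nonnull) - start;
	 */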
	zapnot	t0, t12, t4	# U : was last byte written null?
	and	t12, 0xf0, t3	# E : binary search for the address of the
	cmovne	t4, 1, t4	# E : Latency=2, extra map slot
	nop			# E : with cmovne

	and	t12, 0xcc, t2	# E : last byte written
	and	t12, 0xaa, t1	# E :
	cmovne	t3, 4, t3	# E : Latency=2, extra map slot
	nop			# E : with cmovne

	bic	a0, 7, t0	# E : aligned addr of final dest quad
	cmovne	t2, 2, t2	# E : Latency=2, extra map slot
	nop			# E : with cmovne
	nop

	cmovne	t1, 1, t1	# E : Latency=2, extra map slot
	nop			# E : with cmovne
	addq	t0, t3, t0	# E :
	addq	t1, t2, t1	# E :

	addq	t0, t1, t0	# E :
	addq	t0, t4, t0	# E : add one if we filled the buffer
	subq	t0, v0, v0	# E : find string length
	ret			# L0 :

	.align 4
$zerolength:
	nop
	nop
	nop
	clr	v0

$exception:
	nop
	nop
	nop
	ret

	.end __strncpy_from_user