1205194Sdelphij/*
2205194Sdelphij;uInt longest_match_x64(
3205194Sdelphij;    deflate_state *s,
4205194Sdelphij;    IPos cur_match);                             // current match
5205194Sdelphij
6205194Sdelphij; gvmat64.S -- Asm portion of the optimized longest_match for x86_64 (64-bit)
7205194Sdelphij;  (AMD64 on Athlon 64, Opteron, Phenom
8205194Sdelphij;     and Intel EM64T on Pentium 4 with EM64T, Pentium D, Core 2 Duo, Core I5/I7)
9205194Sdelphij; this file is a translation of gvmat64.asm for GCC 4.x (Linux, Mac XCode)
10205194Sdelphij; Copyright (C) 1995-2010 Jean-loup Gailly, Brian Raiter and Gilles Vollant.
11205194Sdelphij;
12205194Sdelphij; File written by Gilles Vollant, by converting to assembly the longest_match
13205194Sdelphij;  from Jean-loup Gailly in deflate.c of zLib and infoZip zip.
14205194Sdelphij;  and by taking inspiration on asm686 with masm, optimised assembly code
15205194Sdelphij;        from Brian Raiter, written 1998
16205194Sdelphij;
17205194Sdelphij;  This software is provided 'as-is', without any express or implied
18205194Sdelphij;  warranty.  In no event will the authors be held liable for any damages
19205194Sdelphij;  arising from the use of this software.
20205194Sdelphij;
21205194Sdelphij;  Permission is granted to anyone to use this software for any purpose,
22205194Sdelphij;  including commercial applications, and to alter it and redistribute it
23205194Sdelphij;  freely, subject to the following restrictions:
24205194Sdelphij;
25205194Sdelphij;  1. The origin of this software must not be misrepresented; you must not
26205194Sdelphij;     claim that you wrote the original software. If you use this software
27205194Sdelphij;     in a product, an acknowledgment in the product documentation would be
28205194Sdelphij;     appreciated but is not required.
29205194Sdelphij;  2. Altered source versions must be plainly marked as such, and must not be
30205194Sdelphij;     misrepresented as being the original software
31205194Sdelphij;  3. This notice may not be removed or altered from any source distribution.
32205194Sdelphij;
33205194Sdelphij;         http://www.zlib.net
34205194Sdelphij;         http://www.winimage.com/zLibDll
35205194Sdelphij;         http://www.muppetlabs.com/~breadbox/software/assembly.html
36205194Sdelphij;
37205194Sdelphij; to compile this file for zLib, I use option:
38205194Sdelphij;   gcc -c -arch x86_64 gvmat64.S
39205194Sdelphij
40205194Sdelphij
41205194Sdelphij;uInt longest_match(s, cur_match)
42205194Sdelphij;    deflate_state *s;
43205194Sdelphij;    IPos cur_match;                             // current match /
44205194Sdelphij;
45205194Sdelphij; with XCode for Mac, I got strange errors with some jumps in Intel syntax;
46205194Sdelphij; this is why BEFORE_JMP and AFTER_JMP are used around them
47205194Sdelphij */
48205194Sdelphij
49205194Sdelphij
/*
; BEFORE_JMP / AFTER_JMP are placed around jump instructions further down:
; BEFORE_JMP switches the assembler to AT&T syntax for the jump, AFTER_JMP
; switches it back to Intel syntax without register prefixes.
*/
50205194Sdelphij#define BEFORE_JMP .att_syntax
51205194Sdelphij#define AFTER_JMP .intel_syntax noprefix
52205194Sdelphij
/*
; Unless NO_UNDERLINE is defined, both exported names are rewritten with a
; leading underscore.
; NOTE(review): presumably needed on targets whose C symbols carry a leading
; underscore (e.g. Mach-O); define NO_UNDERLINE for the others -- confirm.
*/
53205194Sdelphij#ifndef NO_UNDERLINE
54205194Sdelphij#	define	match_init	_match_init
55205194Sdelphij#	define	longest_match	_longest_match
56205194Sdelphij#endif
57205194Sdelphij
/* ; Everything below is written in Intel syntax, registers without prefix. */
58205194Sdelphij.intel_syntax noprefix
59205194Sdelphij
/* ; Export both entry points and emit the code into the text section. */
60205194Sdelphij.globl	match_init, longest_match
61205194Sdelphij.text
62205194Sdelphijlongest_match:
/*
; longest_match -- x86-64 assembly version of the routine whose C prototype
; is quoted in the file header:
;     uInt longest_match(deflate_state *s, IPos cur_match)
;
; In  (gcc convention, as consumed by the two argument moves below):
;     rdi = s          -> copied to rcx
;     esi = cur_match  -> copied to r8d
; Out:
;     eax = best_len, or s->lookahead when best_len is greater (signed compare)
; Side effect:
;     s->match_start is written every time a longer match is accepted.
;
; Stack: rsp is never modified. The two locals and the saved registers are
; stored at rsp-88 .. rsp-16 (see the slot macros below), and the routine
; contains no call instruction.
; NOTE(review): storing below rsp relies on a red zone being available
; (128 bytes on SysV AMD64) -- confirm before reusing this code elsewhere.
*/
63205194Sdelphij
64205194Sdelphij
65205194Sdelphij
66205194Sdelphij#define LocalVarsSize 96
67205194Sdelphij/*
68205194Sdelphij; register used : rax,rbx,rcx,rdx,rsi,rdi,rbp,r8,r9,r10,r11,r12,r13
69205194Sdelphij; free register :  r14,r15 (saved and restored below, but not otherwise referenced)
70205194Sdelphij; register can be saved : rsp
71205194Sdelphij*/
72205194Sdelphij
/*
; Local slots. With LocalVarsSize = 96 they resolve to rsp-88 (chainlenwmask)
; up to rsp-16 (save_r15). save_rdi and save_rsi are only referenced by the
; commented-out restores near the end of the function.
*/
73205194Sdelphij#define chainlenwmask     (rsp + 8 - LocalVarsSize)
74205194Sdelphij#define nicematch         (rsp + 16 - LocalVarsSize)
75205194Sdelphij
76205194Sdelphij#define save_rdi        (rsp + 24 - LocalVarsSize)
77205194Sdelphij#define save_rsi        (rsp + 32 - LocalVarsSize)
78205194Sdelphij#define save_rbx        (rsp + 40 - LocalVarsSize)
79205194Sdelphij#define save_rbp        (rsp + 48 - LocalVarsSize)
80205194Sdelphij#define save_r12        (rsp + 56 - LocalVarsSize)
81205194Sdelphij#define save_r13        (rsp + 64 - LocalVarsSize)
82205194Sdelphij#define save_r14        (rsp + 72 - LocalVarsSize)
83205194Sdelphij#define save_r15        (rsp + 80 - LocalVarsSize)
84205194Sdelphij
85205194Sdelphij
86205194Sdelphij/*
87205194Sdelphij;  all the +4 offsets are due to the addition of pending_buf_size in the
88205194Sdelphij;  zlib deflate_state structure since the asm code was first written
89205194Sdelphij;  (if you compile with zlib 1.0.4 or older, remove the +4).
90205194Sdelphij;  Note : these value are good with a 8 bytes boundary pack structure
;  NOTE(review): no "+4" term appears in the offsets defined below; this
;  remark looks inherited from an older version of the file -- confirm.
91205194Sdelphij*/
92205194Sdelphij
/* ; Match length limits; MIN_LOOKAHEAD evaluates to 258 + 3 + 1 = 262. */
93205194Sdelphij#define    MAX_MATCH              258
94205194Sdelphij#define    MIN_MATCH              3
95205194Sdelphij#define    MIN_LOOKAHEAD          (MAX_MATCH+MIN_MATCH+1)
96205194Sdelphij
97205194Sdelphij/*
98205194Sdelphij;;; Offsets for fields in the deflate_state structure. These numbers
99205194Sdelphij;;; are calculated from the definition of deflate_state, with the
100205194Sdelphij;;; assumption that the compiler will dword-align the fields. (Thus,
101205194Sdelphij;;; changing the definition of deflate_state could easily cause this
102205194Sdelphij;;; program to crash horribly, without so much as a warning at
103205194Sdelphij;;; compile time. Sigh.)
104205194Sdelphij
105205194Sdelphij;  all the +zlib1222add offsets are due to the addition of fields
106205194Sdelphij;  in zlib in the deflate_state structure since the asm code was first written
107205194Sdelphij;  (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)").
108205194Sdelphij;  (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0").
109205194Sdelphij;  if you compile with zlib 1.2.2.2 or later , use "zlib1222add equ 8").
;  NOTE(review): zlib1222add is not referenced by any definition below; the
;  offsets are plain numbers -- confirm this remark still applies.
110205194Sdelphij*/
111205194Sdelphij
112205194Sdelphij
113205194Sdelphij
114205194Sdelphij/* you can check the structure offset by running
115205194Sdelphij
116205194Sdelphij#include <stdlib.h>
117205194Sdelphij#include <stdio.h>
118205194Sdelphij#include "deflate.h"
119205194Sdelphij
120205194Sdelphijvoid print_depl()
121205194Sdelphij{
122205194Sdelphijdeflate_state ds;
123205194Sdelphijdeflate_state *s=&ds;
124205194Sdelphijprintf("size pointer=%u\n",(int)sizeof(void*));
125205194Sdelphij
126205194Sdelphijprintf("#define dsWSize         %u\n",(int)(((char*)&(s->w_size))-((char*)s)));
127205194Sdelphijprintf("#define dsWMask         %u\n",(int)(((char*)&(s->w_mask))-((char*)s)));
128205194Sdelphijprintf("#define dsWindow        %u\n",(int)(((char*)&(s->window))-((char*)s)));
129205194Sdelphijprintf("#define dsPrev          %u\n",(int)(((char*)&(s->prev))-((char*)s)));
130205194Sdelphijprintf("#define dsMatchLen      %u\n",(int)(((char*)&(s->match_length))-((char*)s)));
131205194Sdelphijprintf("#define dsPrevMatch     %u\n",(int)(((char*)&(s->prev_match))-((char*)s)));
132205194Sdelphijprintf("#define dsStrStart      %u\n",(int)(((char*)&(s->strstart))-((char*)s)));
133205194Sdelphijprintf("#define dsMatchStart    %u\n",(int)(((char*)&(s->match_start))-((char*)s)));
134205194Sdelphijprintf("#define dsLookahead     %u\n",(int)(((char*)&(s->lookahead))-((char*)s)));
135205194Sdelphijprintf("#define dsPrevLen       %u\n",(int)(((char*)&(s->prev_length))-((char*)s)));
136205194Sdelphijprintf("#define dsMaxChainLen   %u\n",(int)(((char*)&(s->max_chain_length))-((char*)s)));
137205194Sdelphijprintf("#define dsGoodMatch     %u\n",(int)(((char*)&(s->good_match))-((char*)s)));
138205194Sdelphijprintf("#define dsNiceMatch     %u\n",(int)(((char*)&(s->nice_match))-((char*)s)));
139205194Sdelphij}
140205194Sdelphij*/
141205194Sdelphij
/*
; Hard-coded byte offsets of the deflate_state fields. Nothing checks them
; against deflate.h at build time; the print_depl() helper quoted above
; prints the values in exactly this #define form for comparison.
*/
142205194Sdelphij#define dsWSize          68
143205194Sdelphij#define dsWMask          76
144205194Sdelphij#define dsWindow         80
145205194Sdelphij#define dsPrev           96
146205194Sdelphij#define dsMatchLen       144
147205194Sdelphij#define dsPrevMatch      148
148205194Sdelphij#define dsStrStart       156
149205194Sdelphij#define dsMatchStart     160
150205194Sdelphij#define dsLookahead      164
151205194Sdelphij#define dsPrevLen        168
152205194Sdelphij#define dsMaxChainLen    172
153205194Sdelphij#define dsGoodMatch      188
154205194Sdelphij#define dsNiceMatch      192
155205194Sdelphij
/*
; Memory operands for the fields above. All of them are relative to rcx,
; which therefore has to keep pointing at the deflate_state for as long as
; any of these macros is used.
*/
156205194Sdelphij#define window_size      [ rcx + dsWSize]
157205194Sdelphij#define WMask            [ rcx + dsWMask]
158205194Sdelphij#define window_ad        [ rcx + dsWindow]
159205194Sdelphij#define prev_ad          [ rcx + dsPrev]
160205194Sdelphij#define strstart         [ rcx + dsStrStart]
161205194Sdelphij#define match_start      [ rcx + dsMatchStart]
162205194Sdelphij#define Lookahead        [ rcx + dsLookahead] //; 0ffffffffh on infozip
163205194Sdelphij#define prev_length      [ rcx + dsPrevLen]
164205194Sdelphij#define max_chain_length [ rcx + dsMaxChainLen]
165205194Sdelphij#define good_match       [ rcx + dsGoodMatch]
166205194Sdelphij#define nice_match       [ rcx + dsNiceMatch]
167205194Sdelphij
168205194Sdelphij/*
169205194Sdelphij; windows:
170205194Sdelphij; parameter 1 in rcx(deflate state s), param 2 in rdx (cur match)
171205194Sdelphij
172205194Sdelphij; see http://weblogs.asp.net/oldnewthing/archive/2004/01/14/58579.aspx and
173205194Sdelphij; http://msdn.microsoft.com/library/en-us/kmarch/hh/kmarch/64bitAMD_8e951dd2-ee77-4728-8702-55ce4b5dd24a.xml.asp
174205194Sdelphij;
175205194Sdelphij; All registers must be preserved across the call, except for
176205194Sdelphij;   rax, rcx, rdx, r8, r9, r10, and r11, which are scratch.
177205194Sdelphij
178205194Sdelphij;
179205194Sdelphij; gcc on macosx-linux:
180205194Sdelphij; see http://www.x86-64.org/documentation/abi-0.99.pdf
181205194Sdelphij; param 1 in rdi, param 2 in rsi
182205194Sdelphij; rbx, rsp, rbp, r12 to r15 must be preserved
183205194Sdelphij
184205194Sdelphij;;; Save registers that the compiler may be using. In this version rsp is
185205194Sdelphij;;; not adjusted; the registers are stored in the slots below rsp.
186205194Sdelphij
187205194Sdelphij
188205194Sdelphij;;; Retrieve the function arguments. r8d will hold cur_match
189205194Sdelphij;;; throughout the entire function. rcx will hold the pointer to the
190205194Sdelphij;;; deflate_state structure (every field macro above is rcx-relative);
191205194Sdelphij;;; edx is loaded with chainlenwmask before entering the main loop.
192205194Sdelphij
193205194Sdelphij; ms: parameter 1 in rcx (deflate_state* s), param 2 in edx -> r8 (cur match)
194205194Sdelphij; mac: param 1 in rdi, param 2 rsi
195205194Sdelphij; the 32-bit move into r8d clears the high 32 bits of r8, which can be garbage in the incoming register
196205194Sdelphij*/
//;;; Save rbx and rbp in their slots below rsp.
197205194Sdelphij        mov [save_rbx],rbx
198205194Sdelphij        mov [save_rbp],rbp
199205194Sdelphij
200205194Sdelphij
//;;; rcx = s (first argument arrives in rdi here).
201205194Sdelphij        mov rcx,rdi
202205194Sdelphij
//;;; r8d = cur_match (second argument arrives in esi here).
203205194Sdelphij        mov r8d,esi
204205194Sdelphij
205205194Sdelphij
//;;; Save r12-r15. r14 and r15 are not modified anywhere in this function.
206205194Sdelphij        mov [save_r12],r12
207205194Sdelphij        mov [save_r13],r13
208205194Sdelphij        mov [save_r14],r14
209205194Sdelphij        mov [save_r15],r15
210205194Sdelphij
211205194Sdelphij
212205194Sdelphij//;;; uInt wmask = s->w_mask;
213205194Sdelphij//;;; unsigned chain_length = s->max_chain_length;
214205194Sdelphij//;;; if (s->prev_length >= s->good_match) {
215205194Sdelphij//;;;     chain_length >>= 2;
216205194Sdelphij//;;; }
217205194Sdelphij
218205194Sdelphij
//;;; edi = prev_length, esi = good_match, eax = wmask, ebx = chain_length.
//;;; The shift is skipped when prev_length < good_match.
//;;; NOTE(review): jl is a signed comparison while the C above compares the
//;;; fields directly; equivalent as long as both stay below 2^31 -- confirm.
219205194Sdelphij        mov edi, prev_length
220205194Sdelphij        mov esi, good_match
221205194Sdelphij        mov eax, WMask
222205194Sdelphij        mov ebx, max_chain_length
223205194Sdelphij        cmp edi, esi
224205194Sdelphij        jl  LastMatchGood
225205194Sdelphij        shr ebx, 2
226205194SdelphijLastMatchGood:
227205194Sdelphij
228205194Sdelphij//;;; chainlen is decremented once beforehand so that the function can
229205194Sdelphij//;;; use the sign flag instead of the zero flag for the exit test.
230205194Sdelphij//;;; It is then shifted into the high word, to make room for the wmask
231205194Sdelphij//;;; value, which it will always accompany.
232205194Sdelphij
//;;; ebx = ((chain_length - 1) << 16) | wmask
233205194Sdelphij        dec ebx
234205194Sdelphij        shl ebx, 16
235205194Sdelphij        or  ebx, eax
236205194Sdelphij
237205194Sdelphij//;;; on zlib only
238205194Sdelphij//;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
239205194Sdelphij
240205194Sdelphij
241205194Sdelphij
//;;; Spill chainlenwmask, then store the smaller of lookahead and nice_match
//;;; in the nicematch slot: r10d starts as lookahead and is replaced by
//;;; nice_match when lookahead >= nice_match (cmovnl, a signed condition).
242205194Sdelphij        mov eax, nice_match
243205194Sdelphij        mov [chainlenwmask], ebx
244205194Sdelphij        mov r10d, Lookahead
245205194Sdelphij        cmp r10d, eax
246205194Sdelphij        cmovnl r10d, eax
247205194Sdelphij        mov [nicematch],r10d
248205194Sdelphij
249205194Sdelphij
250205194Sdelphij
251205194Sdelphij//;;; register Bytef *scan = s->window + s->strstart;
//;;; r10 = s->window, ebp = s->strstart, r13 = scan
252205194Sdelphij        mov r10, window_ad
253205194Sdelphij        mov ebp, strstart
254205194Sdelphij        lea r13, [r10 + rbp]
255205194Sdelphij
256205194Sdelphij//;;; Determine how many bytes the scan ptr is off from being
257205194Sdelphij//;;; dword-aligned.
258205194Sdelphij
//;;; r9 keeps scan; r13 becomes (-scan) & 3, the distance (0-3 bytes) from
//;;; scan up to the next 4-byte boundary.
259205194Sdelphij         mov r9,r13
260205194Sdelphij         neg r13
261205194Sdelphij         and r13,3
262205194Sdelphij
263205194Sdelphij//;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
264205194Sdelphij//;;;     s->strstart - (IPos)MAX_DIST(s) : NIL;
265205194Sdelphij
266205194Sdelphij
//;;; eax = w_size - MIN_LOOKAHEAD, used here as the MAX_DIST(s) value.
267205194Sdelphij        mov eax, window_size
268205194Sdelphij        sub eax, MIN_LOOKAHEAD
269205194Sdelphij
270205194Sdelphij
//;;; ebp = strstart - eax. The flags of this sub are consumed by the cmovng
//;;; below; the mov in between does not alter flags. The result is replaced
//;;; by 0 (edi) when it is zero or negative.
271205194Sdelphij        xor edi,edi
272205194Sdelphij        sub ebp, eax
273205194Sdelphij
//;;; r11d = best_len, initialised from s->prev_length.
274205194Sdelphij        mov r11d, prev_length
275205194Sdelphij
276205194Sdelphij        cmovng ebp,edi
277205194Sdelphij
278205194Sdelphij//;;; int best_len = s->prev_length;
279205194Sdelphij
280205194Sdelphij
281205194Sdelphij//;;; Store the sum of s->window + best_len in rsi.
282205194Sdelphij
283205194Sdelphij       lea  rsi,[r10+r11]
284205194Sdelphij
285205194Sdelphij//;;; register ush scan_start = *(ushf*)scan;
286205194Sdelphij//;;; register ush scan_end   = *(ushf*)(scan+best_len-1);
287205194Sdelphij//;;; Posf *prev = s->prev;
288205194Sdelphij
//;;; r12d = scan_start, ebx = scan_end, rdi = prev
289205194Sdelphij        movzx r12d,word ptr [r9]
290205194Sdelphij        movzx ebx, word ptr [r9 + r11 - 1]
291205194Sdelphij
292205194Sdelphij        mov rdi, prev_ad
293205194Sdelphij
294205194Sdelphij//;;; Jump into the main loop.
295205194Sdelphij
//;;; edx = chainlenwmask. The first candidate is tested right away: compare
//;;; scan_end with the word at window + best_len + cur_match - 1.
296205194Sdelphij        mov edx, [chainlenwmask]
297205194Sdelphij
298205194Sdelphij        cmp bx,word ptr [rsi + r8 - 1]
299205194Sdelphij        jz  LookupLoopIsZero
300205194Sdelphij
301205194Sdelphij
302205194Sdelphij
//;;; LookupLoop1, LookupLoop2 and LookupLoop4 are three back-to-back copies of
//;;; the same chain step (LookupLoop further down is one more, 16-byte aligned
//;;; copy). Each step:
//;;;   r8d &= edx                 cur_match & wmask
//;;;   r8d  = prev[r8]            next candidate (16-bit entry, zero-extended)
//;;;   leave if r8d <= limit      (unsigned compare against ebp)
//;;;   edx -= 0x10000             one less chain step; leave when negative
//;;;   compare scan_end with the word at rsi + r8 - 1
//;;; NOTE(review): the chain counter occupies the bits of edx above bit 15, so
//;;; the and only acts as "& wmask" while r8d < 0x10000. That holds for values
//;;; loaded from prev[]; confirm it for the caller-supplied first cur_match.
303205194SdelphijLookupLoop1:
304205194Sdelphij        and r8d, edx
305205194Sdelphij
306205194Sdelphij        movzx   r8d, word ptr [rdi + r8*2]
307205194Sdelphij        cmp r8d, ebp
308205194Sdelphij        jbe LeaveNow
309205194Sdelphij
310205194Sdelphij
311205194Sdelphij
312205194Sdelphij        sub edx, 0x00010000
313205194Sdelphij		BEFORE_JMP
314205194Sdelphij        js  LeaveNow
315205194Sdelphij		AFTER_JMP
316205194Sdelphij
317205194SdelphijLoopEntry1:
318205194Sdelphij        cmp bx,word ptr [rsi + r8 - 1]
319205194Sdelphij		BEFORE_JMP
320205194Sdelphij        jz  LookupLoopIsZero
321205194Sdelphij		AFTER_JMP
322205194Sdelphij
323205194SdelphijLookupLoop2:
324205194Sdelphij        and r8d, edx
325205194Sdelphij
326205194Sdelphij        movzx   r8d, word ptr [rdi + r8*2]
327205194Sdelphij        cmp r8d, ebp
328205194Sdelphij		BEFORE_JMP
329205194Sdelphij        jbe LeaveNow
330205194Sdelphij		AFTER_JMP
331205194Sdelphij        sub edx, 0x00010000
332205194Sdelphij		BEFORE_JMP
333205194Sdelphij        js  LeaveNow
334205194Sdelphij		AFTER_JMP
335205194Sdelphij
336205194SdelphijLoopEntry2:
337205194Sdelphij        cmp bx,word ptr [rsi + r8 - 1]
338205194Sdelphij		BEFORE_JMP
339205194Sdelphij        jz  LookupLoopIsZero
340205194Sdelphij		AFTER_JMP
341205194Sdelphij
342205194SdelphijLookupLoop4:
343205194Sdelphij        and r8d, edx
344205194Sdelphij
345205194Sdelphij        movzx   r8d, word ptr [rdi + r8*2]
346205194Sdelphij        cmp r8d, ebp
347205194Sdelphij		BEFORE_JMP
348205194Sdelphij        jbe LeaveNow
349205194Sdelphij		AFTER_JMP
350205194Sdelphij        sub edx, 0x00010000
351205194Sdelphij		BEFORE_JMP
352205194Sdelphij        js  LeaveNow
353205194Sdelphij		AFTER_JMP
354205194Sdelphij
//;;; Last unrolled copy: a mismatch goes back to LookupLoop1, a match of
//;;; scan_end continues with the scan_start test at LookupLoopIsZero.
355205194SdelphijLoopEntry4:
356205194Sdelphij
357205194Sdelphij        cmp bx,word ptr [rsi + r8 - 1]
358205194Sdelphij		BEFORE_JMP
359205194Sdelphij        jnz LookupLoop1
360205194Sdelphij        jmp LookupLoopIsZero
361205194Sdelphij		AFTER_JMP
362205194Sdelphij/*
363205194Sdelphij;;; do {
364205194Sdelphij;;;     match = s->window + cur_match;
365205194Sdelphij;;;     if (*(ushf*)(match+best_len-1) != scan_end ||
366205194Sdelphij;;;         *(ushf*)match != scan_start) continue;
367205194Sdelphij;;;     [...]
368205194Sdelphij;;; } while ((cur_match = prev[cur_match & wmask]) > limit
369205194Sdelphij;;;          && --chain_length != 0);
370205194Sdelphij;;;
371205194Sdelphij;;; Here is the inner loop of the function. The function will spend the
372205194Sdelphij;;; majority of its time in this loop, and majority of that time will
373205194Sdelphij;;; be spent in the first ten instructions.
374205194Sdelphij;;;
375205194Sdelphij;;; Within this loop:
376205194Sdelphij;;; ebx = scanend
377205194Sdelphij;;; r8d = curmatch
378205194Sdelphij;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
379205194Sdelphij;;; rsi = windowbestlen - i.e., (window + bestlen)
380205194Sdelphij;;; rdi = prev
381205194Sdelphij;;; ebp = limit
;;; and, set up before the loop and left untouched by it:
;;; r9 = scan, r10 = window, r11d = bestlen, r12w = scanstart,
;;; r13 = scanalign (0-3)
;;;
;;; LookupLoop is the re-entry point used after a candidate has been fully
;;; compared (both "jmp LookupLoop" sites below).
382205194Sdelphij*/
383205194Sdelphij.balign 16
384205194SdelphijLookupLoop:
385205194Sdelphij        and r8d, edx
386205194Sdelphij
387205194Sdelphij        movzx   r8d, word ptr [rdi + r8*2]
388205194Sdelphij        cmp r8d, ebp
389205194Sdelphij		BEFORE_JMP
390205194Sdelphij        jbe LeaveNow
391205194Sdelphij		AFTER_JMP
392205194Sdelphij        sub edx, 0x00010000
393205194Sdelphij		BEFORE_JMP
394205194Sdelphij        js  LeaveNow
395205194Sdelphij		AFTER_JMP
396205194Sdelphij
397205194SdelphijLoopEntry:
398205194Sdelphij
399205194Sdelphij        cmp bx,word ptr [rsi + r8 - 1]
400205194Sdelphij		BEFORE_JMP
401205194Sdelphij        jnz LookupLoop1
402205194Sdelphij		AFTER_JMP
//;;; scan_end matched. Now require the first two bytes of the candidate
//;;; (window + cur_match) to equal scan_start; otherwise keep walking.
403205194SdelphijLookupLoopIsZero:
404205194Sdelphij        cmp     r12w, word ptr [r10 + r8]
405205194Sdelphij		BEFORE_JMP
406205194Sdelphij        jnz LookupLoop1
407205194Sdelphij		AFTER_JMP
408205194Sdelphij
409205194Sdelphij
410205194Sdelphij//;;; Store the current value of chainlen.
//;;; (edx, rsi and rdi are reused by the compare loop and reloaded afterwards.)
411205194Sdelphij        mov [chainlenwmask], edx
412205194Sdelphij/*
413205194Sdelphij;;; Point rdi to the string under scrutiny (scan), and rsi to the string
414205194Sdelphij;;; we are hoping to match it up with (window + cur_match). In actuality,
415205194Sdelphij;;; rsi and rdi are both pointed (MAX_MATCH_8 + scanalign) bytes ahead,
416205194Sdelphij;;; and rdx is initialized to -MAX_MATCH_8 (MAX_MATCH_8 = 0x108 = 264).
417205194Sdelphij*/
418205194Sdelphij        lea rsi,[r8+r10]
419205194Sdelphij        mov rdx, 0xfffffffffffffef8 //; -(MAX_MATCH_8)
420205194Sdelphij        lea rsi, [rsi + r13 + 0x0108] //;MAX_MATCH_8]
421205194Sdelphij        lea rdi, [r9 + r13 + 0x0108] //;MAX_MATCH_8]
422205194Sdelphij
//;;; Prefetch hints for the first bytes that the compare loop will read.
423205194Sdelphij        prefetcht1 [rsi+rdx]
424205194Sdelphij        prefetcht1 [rdi+rdx]
425205194Sdelphij
426205194Sdelphij/*
427205194Sdelphij;;; Test the strings for equality, 8 bytes at a time. At the end,
428205194Sdelphij;;; adjust rdx so that it is offset to the exact byte that mismatched.
429205194Sdelphij;;;
430205194Sdelphij;;; We already know at this point that the first three bytes of the
431205194Sdelphij;;; strings match each other, and they can be safely passed over before
432205194Sdelphij;;; starting the compare loop. So what this code does is skip over 0-3
433205194Sdelphij;;; bytes, as much as necessary in order to dword-align the edi
434205194Sdelphij;;; pointer. (rsi will still be misaligned three times out of four.)
435205194Sdelphij;;;
436205194Sdelphij;;; It should be confessed that this loop usually does not represent
437205194Sdelphij;;; much of the total running time. Replacing it with a more
438205194Sdelphij;;; straightforward "rep cmpsb" would not drastically degrade
439205194Sdelphij;;; performance.
440205194Sdelphij*/
441205194Sdelphij
//;;; Each pass XORs three consecutive 8-byte groups of the two strings and
//;;; leaves at the first non-zero result (rax then holds the differing bits).
//;;; rdx advances by 24 per pass; starting from -264 it reaches 0 after 11
//;;; passes without a difference, and the match is taken as maximal.
442205194SdelphijLoopCmps:
443205194Sdelphij        mov rax, [rsi + rdx]
444205194Sdelphij        xor rax, [rdi + rdx]
445205194Sdelphij        jnz LeaveLoopCmps
446205194Sdelphij
447205194Sdelphij        mov rax, [rsi + rdx + 8]
448205194Sdelphij        xor rax, [rdi + rdx + 8]
449205194Sdelphij        jnz LeaveLoopCmps8
450205194Sdelphij
451205194Sdelphij
452205194Sdelphij        mov rax, [rsi + rdx + 8+8]
453205194Sdelphij        xor rax, [rdi + rdx + 8+8]
454205194Sdelphij        jnz LeaveLoopCmps16
455205194Sdelphij
456205194Sdelphij        add rdx,8+8+8
457205194Sdelphij
458205194Sdelphij		BEFORE_JMP
459205194Sdelphij        jnz  LoopCmps
460205194Sdelphij        jmp  LenMaximum
461205194Sdelphij		AFTER_JMP
462205194Sdelphij
//;;; The three exits fall through one another so that rdx ends up as the
//;;; offset of the 8-byte group that differed (+16, +8 or +0).
463205194SdelphijLeaveLoopCmps16: add rdx,8
464205194SdelphijLeaveLoopCmps8: add rdx,8
465205194SdelphijLeaveLoopCmps:
466205194Sdelphij
//;;; Narrow the difference in rax down to a 16-bit pair, lowest address
//;;; first: bytes 0-1 -> LenLower as is; bytes 2-3 -> LenLower32; otherwise
//;;; step 4 bytes on, bring the upper dword down and repeat the test
//;;; (bytes 4-5 -> LenLower, bytes 6-7 -> fall into LenLower32).
467205194Sdelphij        test    eax, 0x0000FFFF
468205194Sdelphij        jnz LenLower
469205194Sdelphij
470205194Sdelphij        test eax,0xffffffff
471205194Sdelphij
472205194Sdelphij        jnz LenLower32
473205194Sdelphij
474205194Sdelphij        add rdx,4
475205194Sdelphij        shr rax,32
476205194Sdelphij        or ax,ax
477205194Sdelphij		BEFORE_JMP
478205194Sdelphij        jnz LenLower
479205194Sdelphij		AFTER_JMP
480205194Sdelphij
//;;; The difference is in the upper 16 bits of eax: move it down, skip 2 bytes.
481205194SdelphijLenLower32:
482205194Sdelphij        shr eax,16
483205194Sdelphij        add rdx,2
484205194Sdelphij
//;;; ax holds the differing pair. "sub al,1" produces a carry only when al
//;;; is 0, i.e. the first byte of the pair was equal; adc then adds that
//;;; carry so rdx lands on the exact mismatching byte.
485205194SdelphijLenLower:
486205194Sdelphij        sub al, 1
487205194Sdelphij        adc rdx, 0
488205194Sdelphij//;;; Calculate the length of the match. If it is MAX_MATCH or longer,
489205194Sdelphij//;;; then automatically accept it as the best possible match and leave.
490205194Sdelphij
//;;; eax = (rdi + rdx) - scan = number of leading bytes that matched.
491205194Sdelphij        lea rax, [rdi + rdx]
492205194Sdelphij        sub rax, r9
493205194Sdelphij        cmp eax, MAX_MATCH
494205194Sdelphij		BEFORE_JMP
495205194Sdelphij        jge LenMaximum
496205194Sdelphij		AFTER_JMP
497205194Sdelphij/*
498205194Sdelphij;;; If the length of the match is not longer than the best match we
499205194Sdelphij;;; have so far, then forget it and return to the lookup loop.
500205194Sdelphij;///////////////////////////////////
501205194Sdelphij*/
502205194Sdelphij        cmp eax, r11d
503205194Sdelphij        jg  LongerMatch
504205194Sdelphij
//;;; Not longer: rebuild the loop registers (rsi = window + best_len,
//;;; rdi = prev, edx = saved chainlenwmask) and resume the chain walk.
505205194Sdelphij        lea rsi,[r10+r11]
506205194Sdelphij
507205194Sdelphij        mov rdi, prev_ad
508205194Sdelphij        mov edx, [chainlenwmask]
509205194Sdelphij		BEFORE_JMP
510205194Sdelphij        jmp LookupLoop
511205194Sdelphij		AFTER_JMP
512205194Sdelphij/*
513205194Sdelphij;;;         s->match_start = cur_match;
514205194Sdelphij;;;         best_len = len;
515205194Sdelphij;;;         if (len >= nice_match) break;
516205194Sdelphij;;;         scan_end = *(ushf*)(scan+best_len-1);
517205194Sdelphij*/
//;;; New best: record the length in r11d and cur_match in s->match_start,
//;;; and stop searching once the length reaches the stored nicematch value.
518205194SdelphijLongerMatch:
519205194Sdelphij        mov r11d, eax
520205194Sdelphij        mov match_start, r8d
521205194Sdelphij        cmp eax, [nicematch]
522205194Sdelphij		BEFORE_JMP
523205194Sdelphij        jge LeaveNow
524205194Sdelphij		AFTER_JMP
525205194Sdelphij
//;;; Otherwise refresh rsi and scan_end (ebx) for the new best_len, restore
//;;; rdi and edx, and resume the chain walk.
526205194Sdelphij        lea rsi,[r10+rax]
527205194Sdelphij
528205194Sdelphij        movzx   ebx, word ptr [r9 + rax - 1]
529205194Sdelphij        mov rdi, prev_ad
530205194Sdelphij        mov edx, [chainlenwmask]
531205194Sdelphij		BEFORE_JMP
532205194Sdelphij        jmp LookupLoop
533205194Sdelphij		AFTER_JMP
534205194Sdelphij
535205194Sdelphij//;;; Accept the current string, with the maximum possible length.
536205194Sdelphij
537205194SdelphijLenMaximum:
538205194Sdelphij        mov r11d,MAX_MATCH
539205194Sdelphij        mov match_start, r8d
540205194Sdelphij
541205194Sdelphij//;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
542205194Sdelphij//;;; return s->lookahead;
543205194Sdelphij
//;;; eax = lookahead, replaced by best_len when best_len <= lookahead
//;;; (cmovng, a signed condition).
544205194SdelphijLeaveNow:
545205194Sdelphij        mov eax, Lookahead
546205194Sdelphij        cmp r11d, eax
547205194Sdelphij        cmovng eax, r11d
548205194Sdelphij
549205194Sdelphij
550205194Sdelphij
551205194Sdelphij//;;; Restore the saved registers and return from whence we came.
552205194Sdelphij
553205194Sdelphij
//;;; rsi and rdi are deliberately not restored (their slots were never
//;;; written); the gcc convention quoted above does not list them among the
//;;; registers that must be preserved.
554205194Sdelphij//        mov rsi,[save_rsi]
555205194Sdelphij//        mov rdi,[save_rdi]
556205194Sdelphij        mov rbx,[save_rbx]
557205194Sdelphij        mov rbp,[save_rbp]
558205194Sdelphij        mov r12,[save_r12]
559205194Sdelphij        mov r13,[save_r13]
560205194Sdelphij        mov r14,[save_r14]
561205194Sdelphij        mov r15,[save_r15]
562205194Sdelphij
563205194Sdelphij
564205194Sdelphij        ret 0
565205194Sdelphij//; please don't remove this string !
566205194Sdelphij//; You can freely use gvmat64 in any free or commercial app
567205194Sdelphij//; but it is far better not to remove the string from the binary!
568205194Sdelphij //   db     0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998, converted to amd 64 by Gilles Vollant 2005",0dh,0ah,0
569205194Sdelphij
570205194Sdelphij
//;;; match_init: second exported entry point of this file. It performs no
//;;; work and returns to the caller immediately.
571205194Sdelphijmatch_init:
572205194Sdelphij  ret 0
573205194Sdelphij
574205194Sdelphij
575