/*
;uInt longest_match_x64(
;    deflate_state *s,
;    IPos cur_match); // current match

; gvmat64.S -- Asm portion of the optimized longest_match for 32 bits x86_64
;  (AMD64 on Athlon 64, Opteron, Phenom
;     and Intel EM64T on Pentium 4 with EM64T, Pentium D, Core 2 Duo, Core I5/I7)
; this file is translation from gvmat64.asm to GCC 4.x (for Linux, Mac XCode)
; Copyright (C) 1995-2010 Jean-loup Gailly, Brian Raiter and Gilles Vollant.
;
; File written by Gilles Vollant, by converting to assembly the longest_match
;  from Jean-loup Gailly in deflate.c of zLib and infoZip zip.
;  and by taking inspiration on asm686 with masm, optimised assembly code
;        from Brian Raiter, written 1998
;
;  This software is provided 'as-is', without any express or implied
;  warranty.  In no event will the authors be held liable for any damages
;  arising from the use of this software.
;
;  Permission is granted to anyone to use this software for any purpose,
;  including commercial applications, and to alter it and redistribute it
;  freely, subject to the following restrictions:
;
;  1. The origin of this software must not be misrepresented; you must not
;     claim that you wrote the original software. If you use this software
;     in a product, an acknowledgment in the product documentation would be
;     appreciated but is not required.
;  2. Altered source versions must be plainly marked as such, and must not be
;     misrepresented as being the original software
;  3. This notice may not be removed or altered from any source distribution.
;
;         http://www.zlib.net
;         http://www.winimage.com/zLibDll
;         http://www.muppetlabs.com/~breadbox/software/assembly.html
;
; to compile this file for zLib, I use option:
;   gcc -c -arch x86_64 gvmat64.S


;uInt longest_match(s, cur_match)
;    deflate_state *s;
;    IPos cur_match;                             // current match /
;
; with XCode for Mac, I had strange error with some jump on intel syntax
; this is why BEFORE_JMP and AFTER_JMP are used
 */

/*
 * ALTERED SOURCE VERSION (see clause 2 of the notice above).
 * Changes relative to the original gvmat64.S:
 *   - source-control annotation residue removed; one statement per line
 *   - comparisons of unsigned quantities use unsigned condition codes
 *     (jb / cmovae / cmovbe), matching the C reference quoted below
 *   - dead saves/restores of r14 and r15 removed (never written here)
 *   - "ret 0" replaced by the equivalent plain "ret"
 *   - magic numbers named (MAX_MATCH_8, CHAINLEN_ONE)
 *   - non-executable-stack note emitted for Linux/ELF
 */

/*
 * Syntax : GNU as, Intel operand order (.intel_syntax noprefix), run
 *          through the C preprocessor (.S).
 * ABI    : System V AMD64 only (arguments are read from rdi / esi).
 * In     : rdi = deflate_state *s
 *          esi = IPos cur_match (upper half of rsi is ignored)
 * Out    : eax = min(best_len, s->lookahead) (unsigned)
 *          s->match_start is written whenever a longer match is found.
 * Stack  : no rsp adjustment; scratch slots live below rsp, i.e. in the
 *          System V red zone.  This is only valid because the routine is
 *          a leaf.  NOTE(review): not usable where the red zone is
 *          disabled (e.g. kernel builds) -- confirm before reuse.
 */

/* Xcode's assembler mis-handled some jumps in Intel syntax, so selected
   jumps are bracketed by a temporary switch to AT&T syntax. */
#define BEFORE_JMP      .att_syntax
#define AFTER_JMP       .intel_syntax noprefix

/* Mach-O style leading underscore unless the build opts out. */
#ifndef NO_UNDERLINE
#       define  match_init      _match_init
#       define  longest_match   _longest_match
#endif

/*
 * Scratch frame.  Slot N lives at rsp + 8*N - LocalVarsSize, so every slot
 * is below rsp (lowest at rsp-88, highest used one ends at rsp-24), well
 * inside the 128-byte red zone.  Slot numbers are unchanged from the
 * original layout; slots 3, 4, 9 and 10 are simply unused.
 */
#define LocalVarsSize   96

#define chainlenwmask   (rsp + 8 - LocalVarsSize)
#define nicematch       (rsp + 16 - LocalVarsSize)

#define save_rbx        (rsp + 40 - LocalVarsSize)
#define save_rbp        (rsp + 48 - LocalVarsSize)
#define save_r12        (rsp + 56 - LocalVarsSize)
#define save_r13        (rsp + 64 - LocalVarsSize)

#define MAX_MATCH       258
#define MIN_MATCH       3
#define MIN_LOOKAHEAD   (MAX_MATCH+MIN_MATCH+1)

/* Span walked by the 8-byte compare loop: 264 = 11 * 24, so the loop
   counter reaches exactly zero after eleven 24-byte rounds. */
#define MAX_MATCH_8     264

/* One unit of chain length once it has been shifted into bits 16..31. */
#define CHAINLEN_ONE    0x00010000

/*
;;; Offsets for fields in the deflate_state structure. These numbers
;;; are calculated from the definition of deflate_state, with the
;;; assumption that the compiler will dword-align the fields. (Thus,
;;; changing the definition of deflate_state could easily cause this
;;; program to crash horribly, without so much as a warning at
;;; compile time. Sigh.)
;;;
;;; Note: these values are valid for a structure packed on an 8-byte
;;; boundary.  Historical note from the original source: the offsets
;;; moved when fields were added to deflate_state (zlib 1.0.5 and again
;;; zlib 1.2.2.2); the numbers below must match the deflate.h actually
;;; being compiled.
*/

/* you can check the structure offset by running

#include <stdlib.h>
#include <stdio.h>
#include "deflate.h"

void print_depl()
{
deflate_state ds;
deflate_state *s=&ds;
printf("size pointer=%u\n",(int)sizeof(void*));

printf("#define dsWSize         %u\n",(int)(((char*)&(s->w_size))-((char*)s)));
printf("#define dsWMask         %u\n",(int)(((char*)&(s->w_mask))-((char*)s)));
printf("#define dsWindow        %u\n",(int)(((char*)&(s->window))-((char*)s)));
printf("#define dsPrev          %u\n",(int)(((char*)&(s->prev))-((char*)s)));
printf("#define dsMatchLen      %u\n",(int)(((char*)&(s->match_length))-((char*)s)));
printf("#define dsPrevMatch     %u\n",(int)(((char*)&(s->prev_match))-((char*)s)));
printf("#define dsStrStart      %u\n",(int)(((char*)&(s->strstart))-((char*)s)));
printf("#define dsMatchStart    %u\n",(int)(((char*)&(s->match_start))-((char*)s)));
printf("#define dsLookahead     %u\n",(int)(((char*)&(s->lookahead))-((char*)s)));
printf("#define dsPrevLen       %u\n",(int)(((char*)&(s->prev_length))-((char*)s)));
printf("#define dsMaxChainLen   %u\n",(int)(((char*)&(s->max_chain_length))-((char*)s)));
printf("#define dsGoodMatch     %u\n",(int)(((char*)&(s->good_match))-((char*)s)));
printf("#define dsNiceMatch     %u\n",(int)(((char*)&(s->nice_match))-((char*)s)));
}
*/

#define dsWSize          68
#define dsWMask          76
#define dsWindow         80
#define dsPrev           96
#define dsMatchLen       144
#define dsPrevMatch      148
#define dsStrStart       156
#define dsMatchStart     160
#define dsLookahead      164
#define dsPrevLen        168
#define dsMaxChainLen    172
#define dsGoodMatch      188
#define dsNiceMatch      192

/* Field accessors; all of them assume rcx = s. */
#define window_size      [ rcx + dsWSize]
#define WMask            [ rcx + dsWMask]
#define window_ad        [ rcx + dsWindow]
#define prev_ad          [ rcx + dsPrev]
#define strstart         [ rcx + dsStrStart]
#define match_start      [ rcx + dsMatchStart]
#define Lookahead        [ rcx + dsLookahead]
#define prev_length      [ rcx + dsPrevLen]
#define max_chain_length [ rcx + dsMaxChainLen]
#define good_match       [ rcx + dsGoodMatch]
#define nice_match       [ rcx + dsNiceMatch]

/*
; Calling conventions, for reference:
;
; windows (NOT implemented by this file):
;   parameter 1 in rcx (deflate_state *s), param 2 in rdx (cur match);
;   rbx, rbp, rdi, rsi, r12 to r15 must be preserved.
;   see http://weblogs.asp.net/oldnewthing/archive/2004/01/14/58579.aspx
;
; gcc on macosx-linux (implemented here):
;   see http://www.x86-64.org/documentation/abi-0.99.pdf
;   param 1 in rdi, param 2 in rsi
;   rbx, rsp, rbp, r12 to r15 must be preserved
;
; Register roles inside longest_match:
;   rcx  = s                       (whole function)
;   r8d  = cur_match               (whole function)
;   r9   = scan   = window + strstart
;   r10  = window
;   r11d = best_len
;   r12w = scan_start (first two bytes of scan)
;   r13  = scanalign  = (-scan) & 3
;   bx   = scan_end   (two bytes at scan + best_len - 1)
;   ebp  = limit
;   rdx, rsi, rdi, rax = scratch, roles differ between the lookup loop
;                        and the compare loop (documented at each loop)
;   r14, r15 are never touched.
*/

.intel_syntax noprefix

.globl  match_init, longest_match
.text
longest_match:

/* Save the callee-saved registers this routine overwrites. */
        mov     [save_rbx], rbx
        mov     [save_rbp], rbp

/* Move the arguments into the registers the body was written around.
   The 32-bit move clears the (possibly garbage) upper half of r8. */
        mov     rcx, rdi
        mov     r8d, esi

        mov     [save_r12], r12
        mov     [save_r13], r13

//;;; uInt wmask = s->w_mask;
//;;; unsigned chain_length = s->max_chain_length;
//;;; if (s->prev_length >= s->good_match) {
//;;;     chain_length >>= 2;
//;;; }

        mov     edi, prev_length
        mov     esi, good_match
        mov     eax, WMask
        mov     ebx, max_chain_length
        cmp     edi, esi
        jb      LastMatchGood           /* unsigned: prev_length < good_match */
        shr     ebx, 2
LastMatchGood:

//;;; chainlen is decremented once beforehand so that the function can
//;;; use the sign flag instead of the zero flag for the exit test.
//;;; It is then shifted into the high word, to make room for the wmask
//;;; value, which it will always accompany.
//;;; NOTE(review): this packing presumes wmask fits in 16 bits and
//;;; chain_length - 1 fits in 15 bits -- confirm against deflate.h limits.

        dec     ebx
        shl     ebx, 16
        or      ebx, eax

//;;; on zlib only
//;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;

        mov     eax, nice_match
        mov     [chainlenwmask], ebx
        mov     r10d, Lookahead
        cmp     r10d, eax
        cmovae  r10d, eax               /* r10d = min(lookahead, nice_match), unsigned */
        mov     [nicematch], r10d

//;;; register Bytef *scan = s->window + s->strstart;

        mov     r10, window_ad
        mov     ebp, strstart
        lea     r13, [r10 + rbp]

//;;; Determine how many bytes the scan ptr is off from being
//;;; dword-aligned.

        mov     r9, r13                 /* r9  = scan */
        neg     r13
        and     r13, 3                  /* r13 = scanalign */

//;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
//;;;     s->strstart - (IPos)MAX_DIST(s) : NIL;

        mov     eax, window_size
        sub     eax, MIN_LOOKAHEAD      /* eax = MAX_DIST(s) */

        xor     edi, edi                /* NIL */
        sub     ebp, eax                /* sets CF/ZF when strstart <= MAX_DIST */

        mov     r11d, prev_length       /* mov leaves the flags untouched */

        cmovbe  ebp, edi                /* unsigned: limit = NIL */

//;;; int best_len = s->prev_length;          (r11d, loaded above)

//;;; Keep window + best_len in rsi for the lookup loop.

        lea     rsi, [r10 + r11]

//;;; register ush scan_start = *(ushf*)scan;
//;;; register ush scan_end   = *(ushf*)(scan+best_len-1);
//;;; Posf *prev = s->prev;

        movzx   r12d, word ptr [r9]
        movzx   ebx, word ptr [r9 + r11 - 1]

        mov     rdi, prev_ad

//;;; Jump into the main loop.

        mov     edx, [chainlenwmask]

        cmp     bx, word ptr [rsi + r8 - 1]
        jz      LookupLoopIsZero

/*
;;; do {
;;;     match = s->window + cur_match;
;;;     if (*(ushf*)(match+best_len-1) != scan_end ||
;;;         *(ushf*)match != scan_start) continue;
;;;     [...]
;;; } while ((cur_match = prev[cur_match & wmask]) > limit
;;;          && --chain_length != 0);
;;;
;;; Here is the inner loop of the function. The function will spend the
;;; majority of its time in this loop, and majority of that time will
;;; be spent in the first ten instructions.
;;;
;;; Within this loop:
;;; ebx = scanend
;;; r8d = curmatch
;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
;;; rsi = windowbestlen - i.e., (window + bestlen)
;;; rdi = prev
;;; ebp = limit
;;;
;;; The step is unrolled: LookupLoop1/2/4 are copies of LookupLoop.
*/

LookupLoop1:
        and     r8d, edx

        movzx   r8d, word ptr [rdi + r8*2]
        cmp     r8d, ebp
        jbe     LeaveNow

        sub     edx, CHAINLEN_ONE
        BEFORE_JMP
        js      LeaveNow
        AFTER_JMP

LoopEntry1:
        cmp     bx, word ptr [rsi + r8 - 1]
        BEFORE_JMP
        jz      LookupLoopIsZero
        AFTER_JMP

LookupLoop2:
        and     r8d, edx

        movzx   r8d, word ptr [rdi + r8*2]
        cmp     r8d, ebp
        BEFORE_JMP
        jbe     LeaveNow
        AFTER_JMP
        sub     edx, CHAINLEN_ONE
        BEFORE_JMP
        js      LeaveNow
        AFTER_JMP

LoopEntry2:
        cmp     bx, word ptr [rsi + r8 - 1]
        BEFORE_JMP
        jz      LookupLoopIsZero
        AFTER_JMP

LookupLoop4:
        and     r8d, edx

        movzx   r8d, word ptr [rdi + r8*2]
        cmp     r8d, ebp
        BEFORE_JMP
        jbe     LeaveNow
        AFTER_JMP
        sub     edx, CHAINLEN_ONE
        BEFORE_JMP
        js      LeaveNow
        AFTER_JMP

LoopEntry4:

        cmp     bx, word ptr [rsi + r8 - 1]
        BEFORE_JMP
        jnz     LookupLoop1
        jmp     LookupLoopIsZero
        AFTER_JMP

.balign 16
LookupLoop:
        and     r8d, edx

        movzx   r8d, word ptr [rdi + r8*2]
        cmp     r8d, ebp
        BEFORE_JMP
        jbe     LeaveNow
        AFTER_JMP
        sub     edx, CHAINLEN_ONE
        BEFORE_JMP
        js      LeaveNow
        AFTER_JMP

LoopEntry:

        cmp     bx, word ptr [rsi + r8 - 1]
        BEFORE_JMP
        jnz     LookupLoop1
        AFTER_JMP
LookupLoopIsZero:
        cmp     r12w, word ptr [r10 + r8]
        BEFORE_JMP
        jnz     LookupLoop1
        AFTER_JMP

//;;; Store the current value of chainlen.
        mov     [chainlenwmask], edx

/*
;;; Point rdi to the string under scrutiny, and rsi to the string we
;;; are hoping to match it up with. In actuality, rsi and rdi are
;;; both pointed (MAX_MATCH_8 + scanalign) bytes ahead, and rdx is
;;; initialized to -(MAX_MATCH_8), so [reg + rdx] starts scanalign
;;; bytes into each string.
*/
        lea     rsi, [r8 + r10]
        mov     rdx, -MAX_MATCH_8
        lea     rsi, [rsi + r13 + MAX_MATCH_8]
        lea     rdi, [r9 + r13 + MAX_MATCH_8]

        prefetcht1 [rsi + rdx]
        prefetcht1 [rdi + rdx]

/*
;;; Test the strings for equality, 8 bytes at a time. At the end,
;;; adjust rdx so that it is offset to the exact byte that mismatched.
;;;
;;; We already know at this point that the first three bytes of the
;;; strings match each other, and they can be safely passed over before
;;; starting the compare loop. So what this code does is skip over 0-3
;;; bytes, as much as necessary in order to dword-align the rdi
;;; pointer. (rsi will still be misaligned three times out of four.)
;;;
;;; It should be confessed that this loop usually does not represent
;;; much of the total running time. Replacing it with a more
;;; straightforward "rep cmpsb" would not drastically degrade
;;; performance.
*/

LoopCmps:
        mov     rax, [rsi + rdx]
        xor     rax, [rdi + rdx]
        jnz     LeaveLoopCmps

        mov     rax, [rsi + rdx + 8]
        xor     rax, [rdi + rdx + 8]
        jnz     LeaveLoopCmps8

        mov     rax, [rsi + rdx + 8+8]
        xor     rax, [rdi + rdx + 8+8]
        jnz     LeaveLoopCmps16

        add     rdx, 8+8+8

        BEFORE_JMP
        jnz     LoopCmps
        jmp     LenMaximum
        AFTER_JMP

LeaveLoopCmps16:
        add     rdx, 8
LeaveLoopCmps8:
        add     rdx, 8
LeaveLoopCmps:

/* rax holds the non-zero XOR of the two qwords that differ.  Narrow it
   down to the first differing byte: 16-bit word first, then the byte. */
        test    eax, 0x0000FFFF
        jnz     LenLower                /* difference in bytes 0-1 */

        test    eax, 0xffffffff
        jnz     LenLower32              /* difference in bytes 2-3 */

        add     rdx, 4
        shr     rax, 32
        or      ax, ax
        BEFORE_JMP
        jnz     LenLower                /* difference in bytes 4-5 */
        AFTER_JMP
                                        /* otherwise bytes 6-7: fall through */
LenLower32:
        shr     eax, 16
        add     rdx, 2

LenLower:
        sub     al, 1                   /* CF set iff the low byte matched ... */
        adc     rdx, 0                  /* ... in which case skip one more byte */

//;;; Calculate the length of the match. If it is longer than MAX_MATCH,
//;;; then automatically accept it as the best possible match and leave.

        lea     rax, [rdi + rdx]
        sub     rax, r9
        cmp     eax, MAX_MATCH
        BEFORE_JMP
        jge     LenMaximum
        AFTER_JMP

/*
;;; If the length of the match is not longer than the best match we
;;; have so far, then forget it and return to the lookup loop.
*/
        cmp     eax, r11d
        jg      LongerMatch

        lea     rsi, [r10 + r11]        /* rsi = window + best_len again */

        mov     rdi, prev_ad
        mov     edx, [chainlenwmask]
        BEFORE_JMP
        jmp     LookupLoop
        AFTER_JMP

/*
;;;         s->match_start = cur_match;
;;;         best_len = len;
;;;         if (len >= nice_match) break;
;;;         scan_end = *(ushf*)(scan+best_len-1);
*/
LongerMatch:
        mov     r11d, eax
        mov     match_start, r8d
        cmp     eax, [nicematch]
        BEFORE_JMP
        jge     LeaveNow
        AFTER_JMP

        lea     rsi, [r10 + rax]

        movzx   ebx, word ptr [r9 + rax - 1]
        mov     rdi, prev_ad
        mov     edx, [chainlenwmask]
        BEFORE_JMP
        jmp     LookupLoop
        AFTER_JMP

//;;; Accept the current string, with the maximum possible length.

LenMaximum:
        mov     r11d, MAX_MATCH
        mov     match_start, r8d

//;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
//;;; return s->lookahead;

LeaveNow:
        mov     eax, Lookahead
        cmp     r11d, eax
        cmovbe  eax, r11d               /* unsigned: eax = min(best_len, lookahead) */

//;;; Restore the callee-saved registers and return from whence we came.
//;;; (rsi and rdi are scratch under the System V ABI, so they are not
//;;; restored; a Windows x64 port would have to preserve them.)

        mov     rbx, [save_rbx]
        mov     rbp, [save_rbp]
        mov     r12, [save_r12]
        mov     r13, [save_r13]

        ret

//; please don't remove this string !
//; Your can freely use gvmat64 in any free or commercial app
//; but it is far better don't remove the string in the binary!
 //   db     0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998, converted to amd 64 by Gilles Vollant 2005",0dh,0ah,0


/* match_init: exported alongside longest_match; this implementation
   needs no set-up, so it returns immediately. */
match_init:
        ret

/* Tell GNU ld this object does not need an executable stack. */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",@progbits
#endif