1184610Salfred// SPDX-License-Identifier: 0BSD 2184610Salfred 3184610Salfred/////////////////////////////////////////////////////////////////////////////// 4184610Salfred// 5184610Salfred/// \file fastpos.h 6184610Salfred/// \brief Kind of two-bit version of bit scan reverse 7184610Salfred/// 8184610Salfred// Authors: Igor Pavlov 9184610Salfred// Lasse Collin 10184610Salfred// 11184610Salfred/////////////////////////////////////////////////////////////////////////////// 12184610Salfred 13184610Salfred#ifndef LZMA_FASTPOS_H 14184610Salfred#define LZMA_FASTPOS_H 15184610Salfred 16184610Salfred// LZMA encodes match distances by storing the highest two bits using 17184610Salfred// a six-bit value [0, 63], and then the missing lower bits. 18184610Salfred// Dictionary size is also stored using this encoding in the .xz 19184610Salfred// file format header. 20184610Salfred// 21184610Salfred// fastpos.h provides a way to quickly find out the correct six-bit 22184610Salfred// values. The following table gives some examples of this encoding: 23184610Salfred// 24184610Salfred// dist return 25184610Salfred// 0 0 26184610Salfred// 1 1 27246122Shselasky// 2 2 28246122Shselasky// 3 3 29246122Shselasky// 4 4 30194677Sthompsa// 5 4 31194677Sthompsa// 6 5 32194677Sthompsa// 7 5 33194677Sthompsa// 8 6 34194677Sthompsa// 11 6 35194677Sthompsa// 12 7 36194677Sthompsa// ... ... 37194677Sthompsa// 15 7 38194677Sthompsa// 16 8 39194677Sthompsa// 17 8 40194677Sthompsa// ... ... 41194677Sthompsa// 23 8 42194677Sthompsa// 24 9 43194677Sthompsa// 25 9 44194677Sthompsa// ... ... 45194677Sthompsa// 46194677Sthompsa// 47194677Sthompsa// Provided functions or macros 48194677Sthompsa// ---------------------------- 49194677Sthompsa// 50194677Sthompsa// get_dist_slot(dist) is the basic version. get_dist_slot_2(dist) 51194677Sthompsa// assumes that dist >= FULL_DISTANCES, thus the result is at least 52188942Sthompsa// FULL_DISTANCES_BITS * 2. Using get_dist_slot(dist) instead of 53246360Shselasky// get_dist_slot_2(dist) would give the same result, but get_dist_slot_2(dist) 54246360Shselasky// should be tiny bit faster due to the assumption being made. 55188942Sthompsa// 56188942Sthompsa// 57184610Salfred// Size vs. speed 58184610Salfred// -------------- 59184610Salfred// 60184610Salfred// With some CPUs that have fast BSR (bit scan reverse) instruction, the 61246122Shselasky// size optimized version is slightly faster than the bigger table based 62184610Salfred// approach. Such CPUs include Intel Pentium Pro, Pentium II, Pentium III 63184610Salfred// and Core 2 (possibly others). AMD K7 seems to have slower BSR, but that 64184610Salfred// would still have speed roughly comparable to the table version. Older 65184610Salfred// x86 CPUs like the original Pentium have very slow BSR; on those systems 66184610Salfred// the table version is a lot faster. 67184610Salfred// 68184610Salfred// On some CPUs, the table version is a lot faster when using position 69196274Sthompsa// dependent code, but with position independent code the size optimized 70196547Sthompsa// version is slightly faster. This occurs at least on 32-bit SPARC (no 71184610Salfred// ASM optimizations). 72196274Sthompsa// 73196274Sthompsa// I'm making the table version the default, because that has good speed 74227286Shselasky// on all systems I have tried. The size optimized version is sometimes 75227075Shselasky// slightly faster, but sometimes it is a lot slower. 76227286Shselasky 77227286Shselasky#ifdef HAVE_SMALL 78227286Shselasky# define get_dist_slot(dist) \ 79196274Sthompsa ((dist) <= 4 ? (dist) : get_dist_slot_2(dist)) 80196274Sthompsa 81184610Salfredstatic inline uint32_t 82184610Salfredget_dist_slot_2(uint32_t dist) 83184610Salfred{ 84227286Shselasky const uint32_t i = bsr32(dist); 85184610Salfred return (i + i) + ((dist >> (i - 1)) & 1); 86184610Salfred} 87184610Salfred 88184610Salfred 89194677Sthompsa#else 90194228Sthompsa 91184610Salfred#define FASTPOS_BITS 13 92227309Sed 93267992Shselaskylzma_attr_visibility_hidden 94184610Salfredextern const uint8_t lzma_fastpos[1 << FASTPOS_BITS]; 95184610Salfred 96184610Salfred 97184610Salfred#define fastpos_shift(extra, n) \ 98192984Sthompsa ((extra) + (n) * (FASTPOS_BITS - 1)) 99184610Salfred 100184610Salfred#define fastpos_limit(extra, n) \ 101184610Salfred (UINT32_C(1) << (FASTPOS_BITS + fastpos_shift(extra, n))) 102184610Salfred 103192984Sthompsa#define fastpos_result(dist, extra, n) \ 104184610Salfred (uint32_t)(lzma_fastpos[(dist) >> fastpos_shift(extra, n)]) \ 105192984Sthompsa + 2 * fastpos_shift(extra, n) 106192984Sthompsa 107184610Salfred 108184610Salfredstatic inline uint32_t 109227075Shselaskyget_dist_slot(uint32_t dist) 110227075Shselasky{ 111227075Shselasky // If it is small enough, we can pick the result directly from 112184610Salfred // the precalculated table. 113184610Salfred if (dist < fastpos_limit(0, 0)) 114184610Salfred return lzma_fastpos[dist]; 115184610Salfred 116184610Salfred if (dist < fastpos_limit(0, 1)) 117184610Salfred return fastpos_result(dist, 0, 1); 118184610Salfred 119184610Salfred return fastpos_result(dist, 0, 2); 120184610Salfred} 121184610Salfred 122184610Salfred 123184610Salfred#ifdef FULL_DISTANCES_BITS 124188600Sthompsastatic inline uint32_t 125184610Salfredget_dist_slot_2(uint32_t dist) 126188600Sthompsa{ 127184610Salfred assert(dist >= FULL_DISTANCES); 128184610Salfred 129184610Salfred if (dist < fastpos_limit(FULL_DISTANCES_BITS - 1, 0)) 130184610Salfred return fastpos_result(dist, FULL_DISTANCES_BITS - 1, 0); 131184610Salfred 132184610Salfred if (dist < fastpos_limit(FULL_DISTANCES_BITS - 1, 1)) 133184610Salfred return fastpos_result(dist, FULL_DISTANCES_BITS - 1, 1); 134184610Salfred 135184610Salfred return fastpos_result(dist, FULL_DISTANCES_BITS - 1, 2); 136184610Salfred} 137184610Salfred#endif 138184610Salfred 139184610Salfred#endif 140184610Salfred 141184610Salfred#endif 142184610Salfred