1207753Smm///////////////////////////////////////////////////////////////////////////////
2207753Smm//
3207753Smm/// \file       fastpos.h
4207753Smm/// \brief      Kind of two-bit version of bit scan reverse
5207753Smm///
6207753Smm//  Authors:    Igor Pavlov
7207753Smm//              Lasse Collin
8207753Smm//
9207753Smm//  This file has been put into the public domain.
10207753Smm//  You can do whatever you want with this file.
11207753Smm//
12207753Smm///////////////////////////////////////////////////////////////////////////////
13207753Smm
14207753Smm#ifndef LZMA_FASTPOS_H
15207753Smm#define LZMA_FASTPOS_H
16207753Smm
17292588Sdelphij// LZMA encodes match distances by storing the highest two bits using
18292588Sdelphij// a six-bit value [0, 63], and then the missing lower bits.
19292588Sdelphij// Dictionary size is also stored using this encoding in the .xz
20207753Smm// file format header.
21207753Smm//
22207753Smm// fastpos.h provides a way to quickly find out the correct six-bit
23207753Smm// values. The following table gives some examples of this encoding:
24207753Smm//
25292588Sdelphij//     dist   return
26207753Smm//       0       0
27207753Smm//       1       1
28207753Smm//       2       2
29207753Smm//       3       3
30207753Smm//       4       4
31207753Smm//       5       4
32207753Smm//       6       5
33207753Smm//       7       5
34207753Smm//       8       6
35207753Smm//      11       6
36207753Smm//      12       7
37207753Smm//     ...      ...
38207753Smm//      15       7
39207753Smm//      16       8
40207753Smm//      17       8
41207753Smm//     ...      ...
42207753Smm//      23       8
43207753Smm//      24       9
44207753Smm//      25       9
45207753Smm//     ...      ...
46207753Smm//
47207753Smm//
48207753Smm// Provided functions or macros
49207753Smm// ----------------------------
50207753Smm//
51292588Sdelphij// get_dist_slot(dist) is the basic version. get_dist_slot_2(dist)
52292588Sdelphij// assumes that dist >= FULL_DISTANCES, thus the result is at least
53292588Sdelphij// FULL_DISTANCES_BITS * 2. Using get_dist_slot(dist) instead of
54292588Sdelphij// get_dist_slot_2(dist) would give the same result, but get_dist_slot_2(dist)
55207753Smm// should be tiny bit faster due to the assumption being made.
56207753Smm//
57207753Smm//
58207753Smm// Size vs. speed
59207753Smm// --------------
60207753Smm//
61207753Smm// With some CPUs that have fast BSR (bit scan reverse) instruction, the
62207753Smm// size optimized version is slightly faster than the bigger table based
63207753Smm// approach. Such CPUs include Intel Pentium Pro, Pentium II, Pentium III
64207753Smm// and Core 2 (possibly others). AMD K7 seems to have slower BSR, but that
65207753Smm// would still have speed roughly comparable to the table version. Older
66207753Smm// x86 CPUs like the original Pentium have very slow BSR; on those systems
67207753Smm// the table version is a lot faster.
68207753Smm//
69207753Smm// On some CPUs, the table version is a lot faster when using position
70207753Smm// dependent code, but with position independent code the size optimized
71207753Smm// version is slightly faster. This occurs at least on 32-bit SPARC (no
72207753Smm// ASM optimizations).
73207753Smm//
74207753Smm// I'm making the table version the default, because that has good speed
75207753Smm// on all systems I have tried. The size optimized version is sometimes
76207753Smm// slightly faster, but sometimes it is a lot slower.
77207753Smm
78207753Smm#ifdef HAVE_SMALL
79292588Sdelphij#	define get_dist_slot(dist) \
80292588Sdelphij		((dist) <= 4 ? (dist) : get_dist_slot_2(dist))
81207753Smm
82207753Smmstatic inline uint32_t
83292588Sdelphijget_dist_slot_2(uint32_t dist)
84207753Smm{
85292588Sdelphij	const uint32_t i = bsr32(dist);
86292588Sdelphij	return (i + i) + ((dist >> (i - 1)) & 1);
87207753Smm}
88207753Smm
89207753Smm
90207753Smm#else
91207753Smm
92207753Smm#define FASTPOS_BITS 13
93207753Smm
94207753Smmextern const uint8_t lzma_fastpos[1 << FASTPOS_BITS];
95207753Smm
96207753Smm
97207753Smm#define fastpos_shift(extra, n) \
98207753Smm	((extra) + (n) * (FASTPOS_BITS - 1))
99207753Smm
100207753Smm#define fastpos_limit(extra, n) \
101207753Smm	(UINT32_C(1) << (FASTPOS_BITS + fastpos_shift(extra, n)))
102207753Smm
103292588Sdelphij#define fastpos_result(dist, extra, n) \
104292588Sdelphij	lzma_fastpos[(dist) >> fastpos_shift(extra, n)] \
105207753Smm			+ 2 * fastpos_shift(extra, n)
106207753Smm
107207753Smm
108207753Smmstatic inline uint32_t
109292588Sdelphijget_dist_slot(uint32_t dist)
110207753Smm{
111207753Smm	// If it is small enough, we can pick the result directly from
112207753Smm	// the precalculated table.
113292588Sdelphij	if (dist < fastpos_limit(0, 0))
114292588Sdelphij		return lzma_fastpos[dist];
115207753Smm
116292588Sdelphij	if (dist < fastpos_limit(0, 1))
117292588Sdelphij		return fastpos_result(dist, 0, 1);
118207753Smm
119292588Sdelphij	return fastpos_result(dist, 0, 2);
120207753Smm}
121207753Smm
122207753Smm
123207753Smm#ifdef FULL_DISTANCES_BITS
124207753Smmstatic inline uint32_t
125292588Sdelphijget_dist_slot_2(uint32_t dist)
126207753Smm{
127292588Sdelphij	assert(dist >= FULL_DISTANCES);
128207753Smm
129292588Sdelphij	if (dist < fastpos_limit(FULL_DISTANCES_BITS - 1, 0))
130292588Sdelphij		return fastpos_result(dist, FULL_DISTANCES_BITS - 1, 0);
131207753Smm
132292588Sdelphij	if (dist < fastpos_limit(FULL_DISTANCES_BITS - 1, 1))
133292588Sdelphij		return fastpos_result(dist, FULL_DISTANCES_BITS - 1, 1);
134207753Smm
135292588Sdelphij	return fastpos_result(dist, FULL_DISTANCES_BITS - 1, 2);
136207753Smm}
137207753Smm#endif
138207753Smm
139207753Smm#endif
140207753Smm
141207753Smm#endif
142