///////////////////////////////////////////////////////////////////////////////
//
/// \file       fastpos.h
/// \brief      Kind of two-bit version of bit scan reverse
///
//  Authors:    Igor Pavlov
//              Lasse Collin
//
//  This file has been put into the public domain.
//  You can do whatever you want with this file.
//
///////////////////////////////////////////////////////////////////////////////

#ifndef LZMA_FASTPOS_H
#define LZMA_FASTPOS_H

// LZMA encodes match distances (positions) by storing the highest two
// bits using a six-bit value [0, 63], and then the missing lower bits.
// Dictionary size is also stored using this encoding in the new .lzma
// file format header.
//
// fastpos.h provides a way to quickly find out the correct six-bit
// values. The following table gives some examples of this encoding:
//
//      pos   return
//       0       0
//       1       1
//       2       2
//       3       3
//       4       4
//       5       4
//       6       5
//       7       5
//       8       6
//      11       6
//      12       7
//     ...      ...
//      15       7
//      16       8
//      17       8
//     ...      ...
//      23       8
//      24       9
//      25       9
//     ...      ...
//
//
// Provided functions or macros
// ----------------------------
//
// get_pos_slot(pos) is the basic version. get_pos_slot_2(pos)
// assumes that pos >= FULL_DISTANCES, thus the result is at least
// FULL_DISTANCES_BITS * 2. Using get_pos_slot(pos) instead of
// get_pos_slot_2(pos) would give the same result, but get_pos_slot_2(pos)
// should be a tiny bit faster due to the assumption being made.
//
//
// Size vs. speed
// --------------
//
// With some CPUs that have a fast BSR (bit scan reverse) instruction, the
// size-optimized version is slightly faster than the bigger table-based
// approach. Such CPUs include Intel Pentium Pro, Pentium II, Pentium III
// and Core 2 (possibly others). AMD K7 seems to have a slower BSR, but that
// would still have speed roughly comparable to the table version. Older
// x86 CPUs like the original Pentium have very slow BSR; on those systems
// the table version is a lot faster.
//
// On some CPUs, the table version is a lot faster when using position
// dependent code, but with position independent code the size-optimized
// version is slightly faster. This occurs at least on 32-bit SPARC (no
// ASM optimizations).
//
// I'm making the table version the default, because that has good speed
// on all systems I have tried. The size-optimized version is sometimes
// slightly faster, but sometimes it is a lot slower.
78207753Smm#ifdef HAVE_SMALL
79207753Smm#	define get_pos_slot(pos) ((pos) <= 4 ? (pos) : get_pos_slot_2(pos))
80207753Smm
81207753Smmstatic inline uint32_t
82207753Smmget_pos_slot_2(uint32_t pos)
83207753Smm{
84207753Smm	const uint32_t i = bsr32(pos);
85207753Smm	return (i + i) + ((pos >> (i - 1)) & 1);
86207753Smm}


#else

#define FASTPOS_BITS 13

// Precalculated slot table for positions [0, 2^FASTPOS_BITS);
// defined in a separate translation unit.
extern const uint8_t lzma_fastpos[1 << FASTPOS_BITS];


// How much pos must be shifted right to fall into the table's range,
// given n extra "rounds" of FASTPOS_BITS - 1 bits each.
#define fastpos_shift(extra, n) \
	((extra) + (n) * (FASTPOS_BITS - 1))

// First position that is NOT covered when shifting by fastpos_shift().
#define fastpos_limit(extra, n) \
	(UINT32_C(1) << (FASTPOS_BITS + fastpos_shift(extra, n)))

// Slot for a shifted position: each bit shifted out adds two slots.
// The whole expansion is parenthesized so it binds correctly when the
// macro is used inside a larger expression.
#define fastpos_result(pos, extra, n) \
	(lzma_fastpos[(pos) >> fastpos_shift(extra, n)] \
			+ 2 * fastpos_shift(extra, n))


107207753Smmstatic inline uint32_t
108207753Smmget_pos_slot(uint32_t pos)
109207753Smm{
110207753Smm	// If it is small enough, we can pick the result directly from
111207753Smm	// the precalculated table.
112207753Smm	if (pos < fastpos_limit(0, 0))
113207753Smm		return lzma_fastpos[pos];
114207753Smm
115207753Smm	if (pos < fastpos_limit(0, 1))
116207753Smm		return fastpos_result(pos, 0, 1);
117207753Smm
118207753Smm	return fastpos_result(pos, 0, 2);
119207753Smm}
120207753Smm
121207753Smm
#ifdef FULL_DISTANCES_BITS
// Like get_pos_slot(), but the caller guarantees pos >= FULL_DISTANCES,
// so the smallest table ranges can be skipped.
static inline uint32_t
get_pos_slot_2(uint32_t pos)
{
	assert(pos >= FULL_DISTANCES);

	if (pos < fastpos_limit(FULL_DISTANCES_BITS - 1, 0))
		return fastpos_result(pos, FULL_DISTANCES_BITS - 1, 0);

	return pos < fastpos_limit(FULL_DISTANCES_BITS - 1, 1)
			? fastpos_result(pos, FULL_DISTANCES_BITS - 1, 1)
			: fastpos_result(pos, FULL_DISTANCES_BITS - 1, 2);
}
#endif

#endif

#endif