1/* mpfr_rec_sqrt -- inverse square root 2 3Copyright 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc. 4Contributed by the AriC and Caramel projects, INRIA. 5 6This file is part of the GNU MPFR Library. 7 8The GNU MPFR Library is free software; you can redistribute it and/or modify 9it under the terms of the GNU Lesser General Public License as published by 10the Free Software Foundation; either version 3 of the License, or (at your 11option) any later version. 12 13The GNU MPFR Library is distributed in the hope that it will be useful, but 14WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 15or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public 16License for more details. 17 18You should have received a copy of the GNU Lesser General Public License 19along with the GNU MPFR Library; see the file COPYING.LESSER. If not, see 20http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc., 2151 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ 22 23#include <stdio.h> 24#include <stdlib.h> 25 26#define MPFR_NEED_LONGLONG_H /* for umul_ppmm */ 27#include "mpfr-impl.h" 28 29#define LIMB_SIZE(x) ((((x)-1)>>MPFR_LOG2_GMP_NUMB_BITS) + 1) 30 31#define MPFR_COM_N(x,y,n) \ 32 { \ 33 mp_size_t i; \ 34 for (i = 0; i < n; i++) \ 35 *((x)+i) = ~*((y)+i); \ 36 } 37 38/* Put in X a p-bit approximation of 1/sqrt(A), 39 where X = {x, n}/B^n, n = ceil(p/GMP_NUMB_BITS), 40 A = 2^(1+as)*{a, an}/B^an, as is 0 or 1, an = ceil(ap/GMP_NUMB_BITS), 41 where B = 2^GMP_NUMB_BITS. 42 43 We have 1 <= A < 4 and 1/2 <= X < 1. 44 45 The error in the approximate result with respect to the true 46 value 1/sqrt(A) is bounded by 1 ulp(X), i.e., 2^{-p} since 1/2 <= X < 1. 47 48 Note: x and a are left-aligned, i.e., the most significant bit of 49 a[an-1] is set, and so is the most significant bit of the output x[n-1]. 50 51 If p is not a multiple of GMP_NUMB_BITS, the extra low bits of the input 52 A are taken into account to compute the approximation of 1/sqrt(A), but 53 whether or not they are zero, the error between X and 1/sqrt(A) is bounded 54 by 1 ulp(X) [in precision p]. 55 The extra low bits of the output X (if p is not a multiple of GMP_NUMB_BITS) 56 are set to 0. 57 58 Assumptions: 59 (1) A should be normalized, i.e., the most significant bit of a[an-1] 60 should be 1. If as=0, we have 1 <= A < 2; if as=1, we have 2 <= A < 4. 61 (2) p >= 12 62 (3) {a, an} and {x, n} should not overlap 63 (4) GMP_NUMB_BITS >= 12 and is even 64 65 Note: this routine is much more efficient when ap is small compared to p, 66 including the case where ap <= GMP_NUMB_BITS, thus it can be used to 67 implement an efficient mpfr_rec_sqrt_ui function. 68 69 References: 70 [1] Modern Computer Algebra, Richard Brent and Paul Zimmermann, 71 http://www.loria.fr/~zimmerma/mca/pub226.html 72*/ 73static void 74mpfr_mpn_rec_sqrt (mpfr_limb_ptr x, mpfr_prec_t p, 75 mpfr_limb_srcptr a, mpfr_prec_t ap, int as) 76 77{ 78 /* the following T1 and T2 are bipartite tables giving initial 79 approximation for the inverse square root, with 13-bit input split in 80 5+4+4, and 11-bit output. More precisely, if 2048 <= i < 8192, 81 with i = a*2^8 + b*2^4 + c, we use for approximation of 82 2048/sqrt(i/2048) the value x = T1[16*(a-8)+b] + T2[16*(a-8)+c]. 83 The largest error is obtained for i = 2054, where x = 2044, 84 and 2048/sqrt(i/2048) = 2045.006576... 85 */ 86 static short int T1[384] = { 872040, 2033, 2025, 2017, 2009, 2002, 1994, 1987, 1980, 1972, 1965, 1958, 1951, 881944, 1938, 1931, /* a=8 */ 891925, 1918, 1912, 1905, 1899, 1892, 1886, 1880, 1874, 1867, 1861, 1855, 1849, 901844, 1838, 1832, /* a=9 */ 911827, 1821, 1815, 1810, 1804, 1799, 1793, 1788, 1783, 1777, 1772, 1767, 1762, 921757, 1752, 1747, /* a=10 */ 931742, 1737, 1733, 1728, 1723, 1718, 1713, 1709, 1704, 1699, 1695, 1690, 1686, 941681, 1677, 1673, /* a=11 */ 951669, 1664, 1660, 1656, 1652, 1647, 1643, 1639, 1635, 1631, 1627, 1623, 1619, 961615, 1611, 1607, /* a=12 */ 971603, 1600, 1596, 1592, 1588, 1585, 1581, 1577, 1574, 1570, 1566, 1563, 1559, 981556, 1552, 1549, /* a=13 */ 991545, 1542, 1538, 1535, 1532, 1528, 1525, 1522, 1518, 1515, 1512, 1509, 1505, 1001502, 1499, 1496, /* a=14 */ 1011493, 1490, 1487, 1484, 1481, 1478, 1475, 1472, 1469, 1466, 1463, 1460, 1457, 1021454, 1451, 1449, /* a=15 */ 1031446, 1443, 1440, 1438, 1435, 1432, 1429, 1427, 1424, 1421, 1419, 1416, 1413, 1041411, 1408, 1405, /* a=16 */ 1051403, 1400, 1398, 1395, 1393, 1390, 1388, 1385, 1383, 1380, 1378, 1375, 1373, 1061371, 1368, 1366, /* a=17 */ 1071363, 1360, 1358, 1356, 1353, 1351, 1349, 1346, 1344, 1342, 1340, 1337, 1335, 1081333, 1331, 1329, /* a=18 */ 1091327, 1325, 1323, 1321, 1319, 1316, 1314, 1312, 1310, 1308, 1306, 1304, 1302, 1101300, 1298, 1296, /* a=19 */ 1111294, 1292, 1290, 1288, 1286, 1284, 1282, 1280, 1278, 1276, 1274, 1272, 1270, 1121268, 1266, 1265, /* a=20 */ 1131263, 1261, 1259, 1257, 1255, 1253, 1251, 1250, 1248, 1246, 1244, 1242, 1241, 1141239, 1237, 1235, /* a=21 */ 1151234, 1232, 1230, 1229, 1227, 1225, 1223, 1222, 1220, 1218, 1217, 1215, 1213, 1161212, 1210, 1208, /* a=22 */ 1171206, 1204, 1203, 1201, 1199, 1198, 1196, 1195, 1193, 1191, 1190, 1188, 1187, 1181185, 1184, 1182, /* a=23 */ 1191181, 1180, 1178, 1177, 1175, 1174, 1172, 1171, 1169, 1168, 1166, 1165, 1163, 1201162, 1160, 1159, /* a=24 */ 1211157, 1156, 1154, 1153, 1151, 1150, 1149, 1147, 1146, 1144, 1143, 1142, 1140, 1221139, 1137, 1136, /* a=25 */ 1231135, 1133, 1132, 1131, 1129, 1128, 1127, 1125, 1124, 1123, 1121, 1120, 1119, 1241117, 1116, 1115, /* a=26 */ 1251114, 1113, 1111, 1110, 1109, 1108, 1106, 1105, 1104, 1103, 1101, 1100, 1099, 1261098, 1096, 1095, /* a=27 */ 1271093, 1092, 1091, 1090, 1089, 1087, 1086, 1085, 1084, 1083, 1081, 1080, 1079, 1281078, 1077, 1076, /* a=28 */ 1291075, 1073, 1072, 1071, 1070, 1069, 1068, 1067, 1065, 1064, 1063, 1062, 1061, 1301060, 1059, 1058, /* a=29 */ 1311057, 1056, 1055, 1054, 1052, 1051, 1050, 1049, 1048, 1047, 1046, 1045, 1044, 1321043, 1042, 1041, /* a=30 */ 1331040, 1039, 1038, 1037, 1036, 1035, 1034, 1033, 1032, 1031, 1030, 1029, 1028, 1341027, 1026, 1025 /* a=31 */ 135}; 136 static unsigned char T2[384] = { 137 7, 7, 6, 6, 5, 5, 4, 4, 4, 3, 3, 2, 2, 1, 1, 0, /* a=8 */ 138 6, 5, 5, 5, 4, 4, 3, 3, 3, 2, 2, 2, 1, 1, 0, 0, /* a=9 */ 139 5, 5, 4, 4, 4, 3, 3, 3, 2, 2, 2, 1, 1, 1, 0, 0, /* a=10 */ 140 4, 4, 3, 3, 3, 3, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, /* a=11 */ 141 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, /* a=12 */ 142 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* a=13 */ 143 3, 3, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, /* a=14 */ 144 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* a=15 */ 145 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* a=16 */ 146 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* a=17 */ 147 3, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 0, /* a=18 */ 148 2, 2, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, /* a=19 */ 149 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, /* a=20 */ 150 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, /* a=21 */ 151 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* a=22 */ 152 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, /* a=23 */ 153 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* a=24 */ 154 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* a=25 */ 155 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* a=26 */ 156 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* a=27 */ 157 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* a=28 */ 158 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, /* a=29 */ 159 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* a=30 */ 160 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* a=31 */ 161}; 162 mp_size_t n = LIMB_SIZE(p); /* number of limbs of X */ 163 mp_size_t an = LIMB_SIZE(ap); /* number of limbs of A */ 164 165 /* A should be normalized */ 166 MPFR_ASSERTD((a[an - 1] & MPFR_LIMB_HIGHBIT) != 0); 167 /* We should have enough bits in one limb and GMP_NUMB_BITS should be even. 168 Since that does not depend on MPFR, we always check this. */ 169 MPFR_ASSERTN((GMP_NUMB_BITS >= 12) && ((GMP_NUMB_BITS & 1) == 0)); 170 /* {a, an} and {x, n} should not overlap */ 171 MPFR_ASSERTD((a + an <= x) || (x + n <= a)); 172 MPFR_ASSERTD(p >= 11); 173 174 if (MPFR_UNLIKELY(an > n)) /* we can cut the input to n limbs */ 175 { 176 a += an - n; 177 an = n; 178 } 179 180 if (p == 11) /* should happen only from recursive calls */ 181 { 182 unsigned long i, ab, ac; 183 mp_limb_t t; 184 185 /* take the 12+as most significant bits of A */ 186 i = a[an - 1] >> (GMP_NUMB_BITS - (12 + as)); 187 /* if one wants faithful rounding for p=11, replace #if 0 by #if 1 */ 188 ab = i >> 4; 189 ac = (ab & 0x3F0) | (i & 0x0F); 190 t = (mp_limb_t) T1[ab - 0x80] + (mp_limb_t) T2[ac - 0x80]; 191 x[0] = t << (GMP_NUMB_BITS - p); 192 } 193 else /* p >= 12 */ 194 { 195 mpfr_prec_t h, pl; 196 mpfr_limb_ptr r, s, t, u; 197 mp_size_t xn, rn, th, ln, tn, sn, ahn, un; 198 mp_limb_t neg, cy, cu; 199 MPFR_TMP_DECL(marker); 200 201 /* compared to Algorithm 3.9 of [1], we have {a, an} = A/2 if as=0, 202 and A/4 if as=1. */ 203 204 /* h = max(11, ceil((p+3)/2)) is the bitsize of the recursive call */ 205 h = (p < 18) ? 11 : (p >> 1) + 2; 206 207 xn = LIMB_SIZE(h); /* limb size of the recursive Xh */ 208 rn = LIMB_SIZE(2 * h); /* a priori limb size of Xh^2 */ 209 ln = n - xn; /* remaining limbs to be computed */ 210 211 /* Since |Xh - A^{-1/2}| <= 2^{-h}, then by multiplying by Xh + A^{-1/2} 212 we get |Xh^2 - 1/A| <= 2^{-h+1}, thus |A*Xh^2 - 1| <= 2^{-h+3}, 213 thus the h-3 most significant bits of t should be zero, 214 which is in fact h+1+as-3 because of the normalization of A. 215 This corresponds to th=floor((h+1+as-3)/GMP_NUMB_BITS) limbs. 216 217 More precisely we have |Xh^2 - 1/A| <= 2^{-h} * (Xh + A^{-1/2}) 218 <= 2^{-h} * (2 A^{-1/2} + 2^{-h}) <= 2.001 * 2^{-h} * A^{-1/2} 219 since A < 4 and h >= 11, thus 220 |A*Xh^2 - 1| <= 2.001 * 2^{-h} * A^{1/2} <= 1.001 * 2^{2-h}. 221 This is sufficient to prove that the upper limb of {t,tn} below is 222 less that 0.501 * 2^GMP_NUMB_BITS, thus cu = 0 below. 223 */ 224 th = (h + 1 + as - 3) >> MPFR_LOG2_GMP_NUMB_BITS; 225 tn = LIMB_SIZE(2 * h + 1 + as); 226 227 /* we need h+1+as bits of a */ 228 ahn = LIMB_SIZE(h + 1 + as); /* number of high limbs of A 229 needed for the recursive call*/ 230 if (MPFR_UNLIKELY(ahn > an)) 231 ahn = an; 232 mpfr_mpn_rec_sqrt (x + ln, h, a + an - ahn, ahn * GMP_NUMB_BITS, as); 233 /* the most h significant bits of X are set, X has ceil(h/GMP_NUMB_BITS) 234 limbs, the low (-h) % GMP_NUMB_BITS bits are zero */ 235 236 /* compared to Algorithm 3.9 of [1], we have {x+ln,xn} = X_h */ 237 238 MPFR_TMP_MARK (marker); 239 /* first step: square X in r, result is exact */ 240 un = xn + (tn - th); 241 /* We use the same temporary buffer to store r and u: r needs 2*xn 242 limbs where u needs xn+(tn-th) limbs. Since tn can store at least 243 2h bits, and th at most h bits, then tn-th can store at least h bits, 244 thus tn - th >= xn, and reserving the space for u is enough. */ 245 MPFR_ASSERTD(2 * xn <= un); 246 u = r = MPFR_TMP_LIMBS_ALLOC (un); 247 if (2 * h <= GMP_NUMB_BITS) /* xn=rn=1, and since p <= 2h-3, n=1, 248 thus ln = 0 */ 249 { 250 MPFR_ASSERTD(ln == 0); 251 cy = x[0] >> (GMP_NUMB_BITS >> 1); 252 r ++; 253 r[0] = cy * cy; 254 } 255 else if (xn == 1) /* xn=1, rn=2 */ 256 umul_ppmm(r[1], r[0], x[ln], x[ln]); 257 else 258 { 259 mpn_mul_n (r, x + ln, x + ln, xn); 260 /* we have {r, 2*xn} = X_h^2 */ 261 if (rn < 2 * xn) 262 r ++; 263 } 264 /* now the 2h most significant bits of {r, rn} contains X^2, r has rn 265 limbs, and the low (-2h) % GMP_NUMB_BITS bits are zero */ 266 267 /* Second step: s <- A * (r^2), and truncate the low ap bits, 268 i.e., at weight 2^{-2h} (s is aligned to the low significant bits) 269 */ 270 sn = an + rn; 271 s = MPFR_TMP_LIMBS_ALLOC (sn); 272 if (rn == 1) /* rn=1 implies n=1, since rn*GMP_NUMB_BITS >= 2h, 273 and 2h >= p+3 */ 274 { 275 /* necessarily p <= GMP_NUMB_BITS-3: we can ignore the two low 276 bits from A */ 277 /* since n=1, and we ensured an <= n, we also have an=1 */ 278 MPFR_ASSERTD(an == 1); 279 umul_ppmm (s[1], s[0], r[0], a[0]); 280 } 281 else 282 { 283 /* we have p <= n * GMP_NUMB_BITS 284 2h <= rn * GMP_NUMB_BITS with p+3 <= 2h <= p+4 285 thus n <= rn <= n + 1 */ 286 MPFR_ASSERTD(rn <= n + 1); 287 /* since we ensured an <= n, we have an <= rn */ 288 MPFR_ASSERTD(an <= rn); 289 mpn_mul (s, r, rn, a, an); 290 /* s should be near B^sn/2^(1+as), thus s[sn-1] is either 291 100000... or 011111... if as=0, or 292 010000... or 001111... if as=1. 293 We ignore the bits of s after the first 2h+1+as ones. 294 We have {s, rn+an} = A*X_h^2/2 if as=0, A*X_h^2/4 if as=1. */ 295 } 296 297 /* We ignore the bits of s after the first 2h+1+as ones: s has rn + an 298 limbs, where rn = LIMBS(2h), an=LIMBS(a), and tn = LIMBS(2h+1+as). */ 299 t = s + sn - tn; /* pointer to low limb of the high part of t */ 300 /* the upper h-3 bits of 1-t should be zero, 301 where 1 corresponds to the most significant bit of t[tn-1] if as=0, 302 and to the 2nd most significant bit of t[tn-1] if as=1 */ 303 304 /* compute t <- 1 - t, which is B^tn - {t, tn+1}, 305 with rounding toward -Inf, i.e., rounding the input t toward +Inf. 306 We could only modify the low tn - th limbs from t, but it gives only 307 a small speedup, and would make the code more complex. 308 */ 309 neg = t[tn - 1] & (MPFR_LIMB_HIGHBIT >> as); 310 if (neg == 0) /* Ax^2 < 1: we have t = th + eps, where 0 <= eps < ulp(th) 311 is the part truncated above, thus 1 - t rounded to -Inf 312 is 1 - th - ulp(th) */ 313 { 314 /* since the 1+as most significant bits of t are zero, set them 315 to 1 before the one-complement */ 316 t[tn - 1] |= MPFR_LIMB_HIGHBIT | (MPFR_LIMB_HIGHBIT >> as); 317 MPFR_COM_N (t, t, tn); 318 /* we should add 1 here to get 1-th complement, and subtract 1 for 319 -ulp(th), thus we do nothing */ 320 } 321 else /* negative case: we want 1 - t rounded toward -Inf, i.e., 322 th + eps rounded toward +Inf, which is th + ulp(th): 323 we discard the bit corresponding to 1, 324 and we add 1 to the least significant bit of t */ 325 { 326 t[tn - 1] ^= neg; 327 mpn_add_1 (t, t, tn, 1); 328 } 329 tn -= th; /* we know at least th = floor((h+1+as-3)/GMP_NUMB_LIMBS) of 330 the high limbs of {t, tn} are zero */ 331 332 /* tn = rn - th, where rn * GMP_NUMB_BITS >= 2*h and 333 th * GMP_NUMB_BITS <= h+1+as-3, thus tn > 0 */ 334 MPFR_ASSERTD(tn > 0); 335 336 /* u <- x * t, where {t, tn} contains at least h+3 bits, 337 and {x, xn} contains h bits, thus tn >= xn */ 338 MPFR_ASSERTD(tn >= xn); 339 if (tn == 1) /* necessarily xn=1 */ 340 umul_ppmm (u[1], u[0], t[0], x[ln]); 341 else 342 mpn_mul (u, t, tn, x + ln, xn); 343 344 /* we have {u, tn+xn} = T_l X_h/2 if as=0, T_l X_h/4 if as=1 */ 345 346 /* we have already discarded the upper th high limbs of t, thus we only 347 have to consider the upper n - th limbs of u */ 348 un = n - th; /* un cannot be zero, since p <= n*GMP_NUMB_BITS, 349 h = ceil((p+3)/2) <= (p+4)/2, 350 th*GMP_NUMB_BITS <= h-1 <= p/2+1, 351 thus (n-th)*GMP_NUMB_BITS >= p/2-1. 352 */ 353 MPFR_ASSERTD(un > 0); 354 u += (tn + xn) - un; /* xn + tn - un = xn + (original_tn - th) - (n - th) 355 = xn + original_tn - n 356 = LIMBS(h) + LIMBS(2h+1+as) - LIMBS(p) > 0 357 since 2h >= p+3 */ 358 MPFR_ASSERTD(tn + xn > un); /* will allow to access u[-1] below */ 359 360 /* In case as=0, u contains |x*(1-Ax^2)/2|, which is exactly what we 361 need to add or subtract. 362 In case as=1, u contains |x*(1-Ax^2)/4|, thus we need to multiply 363 u by 2. */ 364 365 if (as == 1) 366 /* shift on un+1 limbs to get most significant bit of u[-1] into 367 least significant bit of u[0] */ 368 mpn_lshift (u - 1, u - 1, un + 1, 1); 369 370 /* now {u,un} represents U / 2 from Algorithm 3.9 */ 371 372 pl = n * GMP_NUMB_BITS - p; /* low bits from x */ 373 /* We want that the low pl bits are zero after rounding to nearest, 374 thus we round u to nearest at bit pl-1 of u[0] */ 375 if (pl > 0) 376 { 377 cu = mpn_add_1 (u, u, un, u[0] & (MPFR_LIMB_ONE << (pl - 1))); 378 /* mask bits 0..pl-1 of u[0] */ 379 u[0] &= ~MPFR_LIMB_MASK(pl); 380 } 381 else /* round bit is in u[-1] */ 382 cu = mpn_add_1 (u, u, un, u[-1] >> (GMP_NUMB_BITS - 1)); 383 MPFR_ASSERTN(cu == 0); 384 385 /* We already have filled {x + ln, xn = n - ln}, and we want to add or 386 subtract {u, un} at position x. 387 un = n - th, where th contains <= h+1+as-3<=h-1 bits 388 ln = n - xn, where xn contains >= h bits 389 thus un > ln. 390 Warning: ln might be zero. 391 */ 392 MPFR_ASSERTD(un > ln); 393 /* we can have un = ln + 2, for example with GMP_NUMB_BITS=32 and 394 p=62, as=0, then h=33, n=2, th=0, xn=2, thus un=2 and ln=0. */ 395 MPFR_ASSERTD(un == ln + 1 || un == ln + 2); 396 /* the high un-ln limbs of u will overlap the low part of {x+ln,xn}, 397 we need to add or subtract the overlapping part {u + ln, un - ln} */ 398 /* Warning! th may be 0, in which case the mpn_add_1 and mpn_sub_1 399 below (with size = th) mustn't be used. */ 400 if (neg == 0) 401 { 402 if (ln > 0) 403 MPN_COPY (x, u, ln); 404 cy = mpn_add (x + ln, x + ln, xn, u + ln, un - ln); 405 /* cy is the carry at x + (ln + xn) = x + n */ 406 } 407 else /* negative case */ 408 { 409 /* subtract {u+ln, un-ln} from {x+ln,un} */ 410 cy = mpn_sub (x + ln, x + ln, xn, u + ln, un - ln); 411 /* cy is the borrow at x + (ln + xn) = x + n */ 412 413 /* cy cannot be non-zero, since the most significant bit of Xh is 1, 414 and the correction is bounded by 2^{-h+3} */ 415 MPFR_ASSERTD(cy == 0); 416 if (ln > 0) 417 { 418 MPFR_COM_N (x, u, ln); 419 /* we must add one for the 2-complement ... */ 420 cy = mpn_add_1 (x, x, n, MPFR_LIMB_ONE); 421 /* ... and subtract 1 at x[ln], where n = ln + xn */ 422 cy -= mpn_sub_1 (x + ln, x + ln, xn, MPFR_LIMB_ONE); 423 } 424 } 425 426 /* cy can be 1 when A=1, i.e., {a, n} = B^n. In that case we should 427 have X = B^n, and setting X to 1-2^{-p} satisties the error bound 428 of 1 ulp. */ 429 if (MPFR_UNLIKELY(cy != 0)) 430 { 431 cy -= mpn_sub_1 (x, x, n, MPFR_LIMB_ONE << pl); 432 MPFR_ASSERTD(cy == 0); 433 } 434 435 MPFR_TMP_FREE (marker); 436 } 437} 438 439int 440mpfr_rec_sqrt (mpfr_ptr r, mpfr_srcptr u, mpfr_rnd_t rnd_mode) 441{ 442 mpfr_prec_t rp, up, wp; 443 mp_size_t rn, wn; 444 int s, cy, inex; 445 mpfr_limb_ptr x; 446 MPFR_TMP_DECL(marker); 447 448 MPFR_LOG_FUNC 449 (("x[%Pu]=%.*Rg rnd=%d", mpfr_get_prec (u), mpfr_log_prec, u, rnd_mode), 450 ("y[%Pu]=%.*Rg inexact=%d", mpfr_get_prec (r), mpfr_log_prec, r, inex)); 451 452 /* special values */ 453 if (MPFR_UNLIKELY(MPFR_IS_SINGULAR(u))) 454 { 455 if (MPFR_IS_NAN(u)) 456 { 457 MPFR_SET_NAN(r); 458 MPFR_RET_NAN; 459 } 460 else if (MPFR_IS_ZERO(u)) /* 1/sqrt(+0) = 1/sqrt(-0) = +Inf */ 461 { 462 /* 0+ or 0- */ 463 MPFR_SET_INF(r); 464 MPFR_SET_POS(r); 465 mpfr_set_divby0 (); 466 MPFR_RET(0); /* Inf is exact */ 467 } 468 else 469 { 470 MPFR_ASSERTD(MPFR_IS_INF(u)); 471 /* 1/sqrt(-Inf) = NAN */ 472 if (MPFR_IS_NEG(u)) 473 { 474 MPFR_SET_NAN(r); 475 MPFR_RET_NAN; 476 } 477 /* 1/sqrt(+Inf) = +0 */ 478 MPFR_SET_POS(r); 479 MPFR_SET_ZERO(r); 480 MPFR_RET(0); 481 } 482 } 483 484 /* if u < 0, 1/sqrt(u) is NaN */ 485 if (MPFR_UNLIKELY(MPFR_IS_NEG(u))) 486 { 487 MPFR_SET_NAN(r); 488 MPFR_RET_NAN; 489 } 490 491 MPFR_SET_POS(r); 492 493 rp = MPFR_PREC(r); /* output precision */ 494 up = MPFR_PREC(u); /* input precision */ 495 wp = rp + 11; /* initial working precision */ 496 497 /* Let u = U*2^e, where e = EXP(u), and 1/2 <= U < 1. 498 If e is even, we compute an approximation of X of (4U)^{-1/2}, 499 and the result is X*2^(-(e-2)/2) [case s=1]. 500 If e is odd, we compute an approximation of X of (2U)^{-1/2}, 501 and the result is X*2^(-(e-1)/2) [case s=0]. */ 502 503 /* parity of the exponent of u */ 504 s = 1 - ((mpfr_uexp_t) MPFR_GET_EXP (u) & 1); 505 506 rn = LIMB_SIZE(rp); 507 508 /* for the first iteration, if rp + 11 fits into rn limbs, we round up 509 up to a full limb to maximize the chance of rounding, while avoiding 510 to allocate extra space */ 511 wp = rp + 11; 512 if (wp < rn * GMP_NUMB_BITS) 513 wp = rn * GMP_NUMB_BITS; 514 for (;;) 515 { 516 MPFR_TMP_MARK (marker); 517 wn = LIMB_SIZE(wp); 518 if (r == u || wn > rn) /* out of place, i.e., we cannot write to r */ 519 x = MPFR_TMP_LIMBS_ALLOC (wn); 520 else 521 x = MPFR_MANT(r); 522 mpfr_mpn_rec_sqrt (x, wp, MPFR_MANT(u), up, s); 523 /* If the input was not truncated, the error is at most one ulp; 524 if the input was truncated, the error is at most two ulps 525 (see algorithms.tex). */ 526 if (MPFR_LIKELY (mpfr_round_p (x, wn, wp - (wp < up), 527 rp + (rnd_mode == MPFR_RNDN)))) 528 break; 529 530 /* We detect only now the exact case where u=2^(2e), to avoid 531 slowing down the average case. This can happen only when the 532 mantissa is exactly 1/2 and the exponent is odd. */ 533 if (s == 0 && mpfr_cmp_ui_2exp (u, 1, MPFR_EXP(u) - 1) == 0) 534 { 535 mpfr_prec_t pl = wn * GMP_NUMB_BITS - wp; 536 537 /* we should have x=111...111 */ 538 mpn_add_1 (x, x, wn, MPFR_LIMB_ONE << pl); 539 x[wn - 1] = MPFR_LIMB_HIGHBIT; 540 s += 2; 541 break; /* go through */ 542 } 543 MPFR_TMP_FREE(marker); 544 545 wp += GMP_NUMB_BITS; 546 } 547 cy = mpfr_round_raw (MPFR_MANT(r), x, wp, 0, rp, rnd_mode, &inex); 548 MPFR_EXP(r) = - (MPFR_EXP(u) - 1 - s) / 2; 549 if (MPFR_UNLIKELY(cy != 0)) 550 { 551 MPFR_EXP(r) ++; 552 MPFR_MANT(r)[rn - 1] = MPFR_LIMB_HIGHBIT; 553 } 554 MPFR_TMP_FREE(marker); 555 return mpfr_check_range (r, inex, rnd_mode); 556} 557