1/* Extended regular expression matching and search library. 2 Copyright (C) 2002-2006, 2010, 2011 Free Software Foundation, Inc. 3 This file is part of the GNU C Library. 4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. 5 6 The GNU C Library is free software; you can redistribute it and/or 7 modify it under the terms of the GNU Lesser General Public 8 License as published by the Free Software Foundation; either 9 version 2.1 of the License, or (at your option) any later version. 10 11 The GNU C Library is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 Lesser General Public License for more details. 15 16 You should have received a copy of the GNU Lesser General Public 17 License along with the GNU C Library; if not, see 18 <http://www.gnu.org/licenses/>. */ 19 20static void re_string_construct_common (const char *str, int len, 21 re_string_t *pstr, 22 RE_TRANSLATE_TYPE trans, int icase, 23 const re_dfa_t *dfa) internal_function; 24static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa, 25 const re_node_set *nodes, 26 unsigned int hash) internal_function; 27static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa, 28 const re_node_set *nodes, 29 unsigned int context, 30 unsigned int hash) internal_function; 31 32/* Functions for string operation. */ 33 34/* This function allocate the buffers. It is necessary to call 35 re_string_reconstruct before using the object. */ 36 37static reg_errcode_t 38internal_function __attribute_warn_unused_result__ 39re_string_allocate (re_string_t *pstr, const char *str, int len, int init_len, 40 RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa) 41{ 42 reg_errcode_t ret; 43 int init_buf_len; 44 45 /* Ensure at least one character fits into the buffers. */ 46 if (init_len < dfa->mb_cur_max) 47 init_len = dfa->mb_cur_max; 48 init_buf_len = (len + 1 < init_len) ? len + 1: init_len; 49 re_string_construct_common (str, len, pstr, trans, icase, dfa); 50 51 ret = re_string_realloc_buffers (pstr, init_buf_len); 52 if (BE (ret != REG_NOERROR, 0)) 53 return ret; 54 55 pstr->word_char = dfa->word_char; 56 pstr->word_ops_used = dfa->word_ops_used; 57 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str; 58 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len; 59 pstr->valid_raw_len = pstr->valid_len; 60 return REG_NOERROR; 61} 62 63/* This function allocate the buffers, and initialize them. */ 64 65static reg_errcode_t 66internal_function __attribute_warn_unused_result__ 67re_string_construct (re_string_t *pstr, const char *str, int len, 68 RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa) 69{ 70 reg_errcode_t ret; 71 memset (pstr, '\0', sizeof (re_string_t)); 72 re_string_construct_common (str, len, pstr, trans, icase, dfa); 73 74 if (len > 0) 75 { 76 ret = re_string_realloc_buffers (pstr, len + 1); 77 if (BE (ret != REG_NOERROR, 0)) 78 return ret; 79 } 80 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str; 81 82 if (icase) 83 { 84#ifdef RE_ENABLE_I18N 85 if (dfa->mb_cur_max > 1) 86 { 87 while (1) 88 { 89 ret = build_wcs_upper_buffer (pstr); 90 if (BE (ret != REG_NOERROR, 0)) 91 return ret; 92 if (pstr->valid_raw_len >= len) 93 break; 94 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max) 95 break; 96 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2); 97 if (BE (ret != REG_NOERROR, 0)) 98 return ret; 99 } 100 } 101 else 102#endif /* RE_ENABLE_I18N */ 103 build_upper_buffer (pstr); 104 } 105 else 106 { 107#ifdef RE_ENABLE_I18N 108 if (dfa->mb_cur_max > 1) 109 build_wcs_buffer (pstr); 110 else 111#endif /* RE_ENABLE_I18N */ 112 { 113 if (trans != NULL) 114 re_string_translate_buffer (pstr); 115 else 116 { 117 pstr->valid_len = pstr->bufs_len; 118 pstr->valid_raw_len = pstr->bufs_len; 119 } 120 } 121 } 122 123 return REG_NOERROR; 124} 125 126/* Helper functions for re_string_allocate, and re_string_construct. */ 127 128static reg_errcode_t 129internal_function __attribute_warn_unused_result__ 130re_string_realloc_buffers (re_string_t *pstr, int new_buf_len) 131{ 132#ifdef RE_ENABLE_I18N 133 if (pstr->mb_cur_max > 1) 134 { 135 wint_t *new_wcs; 136 137 /* Avoid overflow in realloc. */ 138 const size_t max_object_size = MAX (sizeof (wint_t), sizeof (int)); 139 if (BE (SIZE_MAX / max_object_size < new_buf_len, 0)) 140 return REG_ESPACE; 141 142 new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len); 143 if (BE (new_wcs == NULL, 0)) 144 return REG_ESPACE; 145 pstr->wcs = new_wcs; 146 if (pstr->offsets != NULL) 147 { 148 int *new_offsets = re_realloc (pstr->offsets, int, new_buf_len); 149 if (BE (new_offsets == NULL, 0)) 150 return REG_ESPACE; 151 pstr->offsets = new_offsets; 152 } 153 } 154#endif /* RE_ENABLE_I18N */ 155 if (pstr->mbs_allocated) 156 { 157 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char, 158 new_buf_len); 159 if (BE (new_mbs == NULL, 0)) 160 return REG_ESPACE; 161 pstr->mbs = new_mbs; 162 } 163 pstr->bufs_len = new_buf_len; 164 return REG_NOERROR; 165} 166 167 168static void 169internal_function 170re_string_construct_common (const char *str, int len, re_string_t *pstr, 171 RE_TRANSLATE_TYPE trans, int icase, 172 const re_dfa_t *dfa) 173{ 174 pstr->raw_mbs = (const unsigned char *) str; 175 pstr->len = len; 176 pstr->raw_len = len; 177 pstr->trans = trans; 178 pstr->icase = icase ? 1 : 0; 179 pstr->mbs_allocated = (trans != NULL || icase); 180 pstr->mb_cur_max = dfa->mb_cur_max; 181 pstr->is_utf8 = dfa->is_utf8; 182 pstr->map_notascii = dfa->map_notascii; 183 pstr->stop = pstr->len; 184 pstr->raw_stop = pstr->stop; 185} 186 187#ifdef RE_ENABLE_I18N 188 189/* Build wide character buffer PSTR->WCS. 190 If the byte sequence of the string are: 191 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3> 192 Then wide character buffer will be: 193 <wc1> , WEOF , <wc2> , WEOF , <wc3> 194 We use WEOF for padding, they indicate that the position isn't 195 a first byte of a multibyte character. 196 197 Note that this function assumes PSTR->VALID_LEN elements are already 198 built and starts from PSTR->VALID_LEN. */ 199 200static void 201internal_function 202build_wcs_buffer (re_string_t *pstr) 203{ 204#ifdef _LIBC 205 unsigned char buf[MB_LEN_MAX]; 206 assert (MB_LEN_MAX >= pstr->mb_cur_max); 207#else 208 unsigned char buf[64]; 209#endif 210 mbstate_t prev_st; 211 int byte_idx, end_idx, remain_len; 212 size_t mbclen; 213 214 /* Build the buffers from pstr->valid_len to either pstr->len or 215 pstr->bufs_len. */ 216 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 217 for (byte_idx = pstr->valid_len; byte_idx < end_idx;) 218 { 219 wchar_t wc; 220 const char *p; 221 222 remain_len = end_idx - byte_idx; 223 prev_st = pstr->cur_state; 224 /* Apply the translation if we need. */ 225 if (BE (pstr->trans != NULL, 0)) 226 { 227 int i, ch; 228 229 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i) 230 { 231 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i]; 232 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch]; 233 } 234 p = (const char *) buf; 235 } 236 else 237 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx; 238 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state); 239 if (BE (mbclen == (size_t) -1 || mbclen == 0 240 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len), 0)) 241 { 242 /* We treat these cases as a singlebyte character. */ 243 mbclen = 1; 244 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]; 245 if (BE (pstr->trans != NULL, 0)) 246 wc = pstr->trans[wc]; 247 pstr->cur_state = prev_st; 248 } 249 else if (BE (mbclen == (size_t) -2, 0)) 250 { 251 /* The buffer doesn't have enough space, finish to build. */ 252 pstr->cur_state = prev_st; 253 break; 254 } 255 256 /* Write wide character and padding. */ 257 pstr->wcs[byte_idx++] = wc; 258 /* Write paddings. */ 259 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) 260 pstr->wcs[byte_idx++] = WEOF; 261 } 262 pstr->valid_len = byte_idx; 263 pstr->valid_raw_len = byte_idx; 264} 265 266/* Build wide character buffer PSTR->WCS like build_wcs_buffer, 267 but for REG_ICASE. */ 268 269static reg_errcode_t 270internal_function __attribute_warn_unused_result__ 271build_wcs_upper_buffer (re_string_t *pstr) 272{ 273 mbstate_t prev_st; 274 int src_idx, byte_idx, end_idx, remain_len; 275 size_t mbclen; 276#ifdef _LIBC 277 char buf[MB_LEN_MAX]; 278 assert (MB_LEN_MAX >= pstr->mb_cur_max); 279#else 280 char buf[64]; 281#endif 282 283 byte_idx = pstr->valid_len; 284 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 285 286 /* The following optimization assumes that ASCII characters can be 287 mapped to wide characters with a simple cast. */ 288 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed) 289 { 290 while (byte_idx < end_idx) 291 { 292 wchar_t wc; 293 294 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]) 295 && mbsinit (&pstr->cur_state)) 296 { 297 /* In case of a singlebyte character. */ 298 pstr->mbs[byte_idx] 299 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]); 300 /* The next step uses the assumption that wchar_t is encoded 301 ASCII-safe: all ASCII values can be converted like this. */ 302 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx]; 303 ++byte_idx; 304 continue; 305 } 306 307 remain_len = end_idx - byte_idx; 308 prev_st = pstr->cur_state; 309 mbclen = __mbrtowc (&wc, 310 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx 311 + byte_idx), remain_len, &pstr->cur_state); 312 if (BE (mbclen + 2 > 2, 1)) 313 { 314 wchar_t wcu = wc; 315 if (iswlower (wc)) 316 { 317 size_t mbcdlen; 318 319 wcu = towupper (wc); 320 mbcdlen = wcrtomb (buf, wcu, &prev_st); 321 if (BE (mbclen == mbcdlen, 1)) 322 memcpy (pstr->mbs + byte_idx, buf, mbclen); 323 else 324 { 325 src_idx = byte_idx; 326 goto offsets_needed; 327 } 328 } 329 else 330 memcpy (pstr->mbs + byte_idx, 331 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen); 332 pstr->wcs[byte_idx++] = wcu; 333 /* Write paddings. */ 334 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) 335 pstr->wcs[byte_idx++] = WEOF; 336 } 337 else if (mbclen == (size_t) -1 || mbclen == 0 338 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len)) 339 { 340 /* It is an invalid character, an incomplete character 341 at the end of the string, or '\0'. Just use the byte. */ 342 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]; 343 pstr->mbs[byte_idx] = ch; 344 /* And also cast it to wide char. */ 345 pstr->wcs[byte_idx++] = (wchar_t) ch; 346 if (BE (mbclen == (size_t) -1, 0)) 347 pstr->cur_state = prev_st; 348 } 349 else 350 { 351 /* The buffer doesn't have enough space, finish to build. */ 352 pstr->cur_state = prev_st; 353 break; 354 } 355 } 356 pstr->valid_len = byte_idx; 357 pstr->valid_raw_len = byte_idx; 358 return REG_NOERROR; 359 } 360 else 361 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;) 362 { 363 wchar_t wc; 364 const char *p; 365 offsets_needed: 366 remain_len = end_idx - byte_idx; 367 prev_st = pstr->cur_state; 368 if (BE (pstr->trans != NULL, 0)) 369 { 370 int i, ch; 371 372 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i) 373 { 374 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i]; 375 buf[i] = pstr->trans[ch]; 376 } 377 p = (const char *) buf; 378 } 379 else 380 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx; 381 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state); 382 if (BE (mbclen + 2 > 2, 1)) 383 { 384 wchar_t wcu = wc; 385 if (iswlower (wc)) 386 { 387 size_t mbcdlen; 388 389 wcu = towupper (wc); 390 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st); 391 if (BE (mbclen == mbcdlen, 1)) 392 memcpy (pstr->mbs + byte_idx, buf, mbclen); 393 else if (mbcdlen != (size_t) -1) 394 { 395 size_t i; 396 397 if (byte_idx + mbcdlen > pstr->bufs_len) 398 { 399 pstr->cur_state = prev_st; 400 break; 401 } 402 403 if (pstr->offsets == NULL) 404 { 405 pstr->offsets = re_malloc (int, pstr->bufs_len); 406 407 if (pstr->offsets == NULL) 408 return REG_ESPACE; 409 } 410 if (!pstr->offsets_needed) 411 { 412 for (i = 0; i < (size_t) byte_idx; ++i) 413 pstr->offsets[i] = i; 414 pstr->offsets_needed = 1; 415 } 416 417 memcpy (pstr->mbs + byte_idx, buf, mbcdlen); 418 pstr->wcs[byte_idx] = wcu; 419 pstr->offsets[byte_idx] = src_idx; 420 for (i = 1; i < mbcdlen; ++i) 421 { 422 pstr->offsets[byte_idx + i] 423 = src_idx + (i < mbclen ? i : mbclen - 1); 424 pstr->wcs[byte_idx + i] = WEOF; 425 } 426 pstr->len += mbcdlen - mbclen; 427 if (pstr->raw_stop > src_idx) 428 pstr->stop += mbcdlen - mbclen; 429 end_idx = (pstr->bufs_len > pstr->len) 430 ? pstr->len : pstr->bufs_len; 431 byte_idx += mbcdlen; 432 src_idx += mbclen; 433 continue; 434 } 435 else 436 memcpy (pstr->mbs + byte_idx, p, mbclen); 437 } 438 else 439 memcpy (pstr->mbs + byte_idx, p, mbclen); 440 441 if (BE (pstr->offsets_needed != 0, 0)) 442 { 443 size_t i; 444 for (i = 0; i < mbclen; ++i) 445 pstr->offsets[byte_idx + i] = src_idx + i; 446 } 447 src_idx += mbclen; 448 449 pstr->wcs[byte_idx++] = wcu; 450 /* Write paddings. */ 451 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) 452 pstr->wcs[byte_idx++] = WEOF; 453 } 454 else if (mbclen == (size_t) -1 || mbclen == 0 455 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len)) 456 { 457 /* It is an invalid character or '\0'. Just use the byte. */ 458 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx]; 459 460 if (BE (pstr->trans != NULL, 0)) 461 ch = pstr->trans [ch]; 462 pstr->mbs[byte_idx] = ch; 463 464 if (BE (pstr->offsets_needed != 0, 0)) 465 pstr->offsets[byte_idx] = src_idx; 466 ++src_idx; 467 468 /* And also cast it to wide char. */ 469 pstr->wcs[byte_idx++] = (wchar_t) ch; 470 if (BE (mbclen == (size_t) -1, 0)) 471 pstr->cur_state = prev_st; 472 } 473 else 474 { 475 /* The buffer doesn't have enough space, finish to build. */ 476 pstr->cur_state = prev_st; 477 break; 478 } 479 } 480 pstr->valid_len = byte_idx; 481 pstr->valid_raw_len = src_idx; 482 return REG_NOERROR; 483} 484 485/* Skip characters until the index becomes greater than NEW_RAW_IDX. 486 Return the index. */ 487 488static int 489internal_function 490re_string_skip_chars (re_string_t *pstr, int new_raw_idx, wint_t *last_wc) 491{ 492 mbstate_t prev_st; 493 int rawbuf_idx; 494 size_t mbclen; 495 wint_t wc = WEOF; 496 497 /* Skip the characters which are not necessary to check. */ 498 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len; 499 rawbuf_idx < new_raw_idx;) 500 { 501 wchar_t wc2; 502 int remain_len = pstr->raw_len - rawbuf_idx; 503 prev_st = pstr->cur_state; 504 mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx, 505 remain_len, &pstr->cur_state); 506 if (BE ((ssize_t) mbclen <= 0, 0)) 507 { 508 /* We treat these cases as a single byte character. */ 509 if (mbclen == 0 || remain_len == 0) 510 wc = L'\0'; 511 else 512 wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx); 513 mbclen = 1; 514 pstr->cur_state = prev_st; 515 } 516 else 517 wc = (wint_t) wc2; 518 /* Then proceed the next character. */ 519 rawbuf_idx += mbclen; 520 } 521 *last_wc = wc; 522 return rawbuf_idx; 523} 524#endif /* RE_ENABLE_I18N */ 525 526/* Build the buffer PSTR->MBS, and apply the translation if we need. 527 This function is used in case of REG_ICASE. */ 528 529static void 530internal_function 531build_upper_buffer (re_string_t *pstr) 532{ 533 int char_idx, end_idx; 534 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 535 536 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx) 537 { 538 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx]; 539 if (BE (pstr->trans != NULL, 0)) 540 ch = pstr->trans[ch]; 541 if (islower (ch)) 542 pstr->mbs[char_idx] = toupper (ch); 543 else 544 pstr->mbs[char_idx] = ch; 545 } 546 pstr->valid_len = char_idx; 547 pstr->valid_raw_len = char_idx; 548} 549 550/* Apply TRANS to the buffer in PSTR. */ 551 552static void 553internal_function 554re_string_translate_buffer (re_string_t *pstr) 555{ 556 int buf_idx, end_idx; 557 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 558 559 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx) 560 { 561 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx]; 562 pstr->mbs[buf_idx] = pstr->trans[ch]; 563 } 564 565 pstr->valid_len = buf_idx; 566 pstr->valid_raw_len = buf_idx; 567} 568 569/* This function re-construct the buffers. 570 Concretely, convert to wide character in case of pstr->mb_cur_max > 1, 571 convert to upper case in case of REG_ICASE, apply translation. */ 572 573static reg_errcode_t 574internal_function __attribute_warn_unused_result__ 575re_string_reconstruct (re_string_t *pstr, int idx, int eflags) 576{ 577 int offset = idx - pstr->raw_mbs_idx; 578 if (BE (offset < 0, 0)) 579 { 580 /* Reset buffer. */ 581#ifdef RE_ENABLE_I18N 582 if (pstr->mb_cur_max > 1) 583 memset (&pstr->cur_state, '\0', sizeof (mbstate_t)); 584#endif /* RE_ENABLE_I18N */ 585 pstr->len = pstr->raw_len; 586 pstr->stop = pstr->raw_stop; 587 pstr->valid_len = 0; 588 pstr->raw_mbs_idx = 0; 589 pstr->valid_raw_len = 0; 590 pstr->offsets_needed = 0; 591 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF 592 : CONTEXT_NEWLINE | CONTEXT_BEGBUF); 593 if (!pstr->mbs_allocated) 594 pstr->mbs = (unsigned char *) pstr->raw_mbs; 595 offset = idx; 596 } 597 598 if (BE (offset != 0, 1)) 599 { 600 /* Should the already checked characters be kept? */ 601 if (BE (offset < pstr->valid_raw_len, 1)) 602 { 603 /* Yes, move them to the front of the buffer. */ 604#ifdef RE_ENABLE_I18N 605 if (BE (pstr->offsets_needed, 0)) 606 { 607 int low = 0, high = pstr->valid_len, mid; 608 do 609 { 610 mid = (high + low) / 2; 611 if (pstr->offsets[mid] > offset) 612 high = mid; 613 else if (pstr->offsets[mid] < offset) 614 low = mid + 1; 615 else 616 break; 617 } 618 while (low < high); 619 if (pstr->offsets[mid] < offset) 620 ++mid; 621 pstr->tip_context = re_string_context_at (pstr, mid - 1, 622 eflags); 623 /* This can be quite complicated, so handle specially 624 only the common and easy case where the character with 625 different length representation of lower and upper 626 case is present at or after offset. */ 627 if (pstr->valid_len > offset 628 && mid == offset && pstr->offsets[mid] == offset) 629 { 630 memmove (pstr->wcs, pstr->wcs + offset, 631 (pstr->valid_len - offset) * sizeof (wint_t)); 632 memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset); 633 pstr->valid_len -= offset; 634 pstr->valid_raw_len -= offset; 635 for (low = 0; low < pstr->valid_len; low++) 636 pstr->offsets[low] = pstr->offsets[low + offset] - offset; 637 } 638 else 639 { 640 /* Otherwise, just find out how long the partial multibyte 641 character at offset is and fill it with WEOF/255. */ 642 pstr->len = pstr->raw_len - idx + offset; 643 pstr->stop = pstr->raw_stop - idx + offset; 644 pstr->offsets_needed = 0; 645 while (mid > 0 && pstr->offsets[mid - 1] == offset) 646 --mid; 647 while (mid < pstr->valid_len) 648 if (pstr->wcs[mid] != WEOF) 649 break; 650 else 651 ++mid; 652 if (mid == pstr->valid_len) 653 pstr->valid_len = 0; 654 else 655 { 656 pstr->valid_len = pstr->offsets[mid] - offset; 657 if (pstr->valid_len) 658 { 659 for (low = 0; low < pstr->valid_len; ++low) 660 pstr->wcs[low] = WEOF; 661 memset (pstr->mbs, 255, pstr->valid_len); 662 } 663 } 664 pstr->valid_raw_len = pstr->valid_len; 665 } 666 } 667 else 668#endif 669 { 670 pstr->tip_context = re_string_context_at (pstr, offset - 1, 671 eflags); 672#ifdef RE_ENABLE_I18N 673 if (pstr->mb_cur_max > 1) 674 memmove (pstr->wcs, pstr->wcs + offset, 675 (pstr->valid_len - offset) * sizeof (wint_t)); 676#endif /* RE_ENABLE_I18N */ 677 if (BE (pstr->mbs_allocated, 0)) 678 memmove (pstr->mbs, pstr->mbs + offset, 679 pstr->valid_len - offset); 680 pstr->valid_len -= offset; 681 pstr->valid_raw_len -= offset; 682#if DEBUG 683 assert (pstr->valid_len > 0); 684#endif 685 } 686 } 687 else 688 { 689 /* No, skip all characters until IDX. */ 690 int prev_valid_len = pstr->valid_len; 691 692#ifdef RE_ENABLE_I18N 693 if (BE (pstr->offsets_needed, 0)) 694 { 695 pstr->len = pstr->raw_len - idx + offset; 696 pstr->stop = pstr->raw_stop - idx + offset; 697 pstr->offsets_needed = 0; 698 } 699#endif 700 pstr->valid_len = 0; 701#ifdef RE_ENABLE_I18N 702 if (pstr->mb_cur_max > 1) 703 { 704 int wcs_idx; 705 wint_t wc = WEOF; 706 707 if (pstr->is_utf8) 708 { 709 const unsigned char *raw, *p, *end; 710 711 /* Special case UTF-8. Multi-byte chars start with any 712 byte other than 0x80 - 0xbf. */ 713 raw = pstr->raw_mbs + pstr->raw_mbs_idx; 714 end = raw + (offset - pstr->mb_cur_max); 715 if (end < pstr->raw_mbs) 716 end = pstr->raw_mbs; 717 p = raw + offset - 1; 718#ifdef _LIBC 719 /* We know the wchar_t encoding is UCS4, so for the simple 720 case, ASCII characters, skip the conversion step. */ 721 if (isascii (*p) && BE (pstr->trans == NULL, 1)) 722 { 723 memset (&pstr->cur_state, '\0', sizeof (mbstate_t)); 724 /* pstr->valid_len = 0; */ 725 wc = (wchar_t) *p; 726 } 727 else 728#endif 729 for (; p >= end; --p) 730 if ((*p & 0xc0) != 0x80) 731 { 732 mbstate_t cur_state; 733 wchar_t wc2; 734 int mlen = raw + pstr->len - p; 735 unsigned char buf[6]; 736 size_t mbclen; 737 738 const unsigned char *pp = p; 739 if (BE (pstr->trans != NULL, 0)) 740 { 741 int i = mlen < 6 ? mlen : 6; 742 while (--i >= 0) 743 buf[i] = pstr->trans[p[i]]; 744 pp = buf; 745 } 746 /* XXX Don't use mbrtowc, we know which conversion 747 to use (UTF-8 -> UCS4). */ 748 memset (&cur_state, 0, sizeof (cur_state)); 749 mbclen = __mbrtowc (&wc2, (const char *) pp, mlen, 750 &cur_state); 751 if (raw + offset - p <= mbclen 752 && mbclen < (size_t) -2) 753 { 754 memset (&pstr->cur_state, '\0', 755 sizeof (mbstate_t)); 756 pstr->valid_len = mbclen - (raw + offset - p); 757 wc = wc2; 758 } 759 break; 760 } 761 } 762 763 if (wc == WEOF) 764 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx; 765 if (wc == WEOF) 766 pstr->tip_context 767 = re_string_context_at (pstr, prev_valid_len - 1, eflags); 768 else 769 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0) 770 && IS_WIDE_WORD_CHAR (wc)) 771 ? CONTEXT_WORD 772 : ((IS_WIDE_NEWLINE (wc) 773 && pstr->newline_anchor) 774 ? CONTEXT_NEWLINE : 0)); 775 if (BE (pstr->valid_len, 0)) 776 { 777 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx) 778 pstr->wcs[wcs_idx] = WEOF; 779 if (pstr->mbs_allocated) 780 memset (pstr->mbs, 255, pstr->valid_len); 781 } 782 pstr->valid_raw_len = pstr->valid_len; 783 } 784 else 785#endif /* RE_ENABLE_I18N */ 786 { 787 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1]; 788 pstr->valid_raw_len = 0; 789 if (pstr->trans) 790 c = pstr->trans[c]; 791 pstr->tip_context = (bitset_contain (pstr->word_char, c) 792 ? CONTEXT_WORD 793 : ((IS_NEWLINE (c) && pstr->newline_anchor) 794 ? CONTEXT_NEWLINE : 0)); 795 } 796 } 797 if (!BE (pstr->mbs_allocated, 0)) 798 pstr->mbs += offset; 799 } 800 pstr->raw_mbs_idx = idx; 801 pstr->len -= offset; 802 pstr->stop -= offset; 803 804 /* Then build the buffers. */ 805#ifdef RE_ENABLE_I18N 806 if (pstr->mb_cur_max > 1) 807 { 808 if (pstr->icase) 809 { 810 reg_errcode_t ret = build_wcs_upper_buffer (pstr); 811 if (BE (ret != REG_NOERROR, 0)) 812 return ret; 813 } 814 else 815 build_wcs_buffer (pstr); 816 } 817 else 818#endif /* RE_ENABLE_I18N */ 819 if (BE (pstr->mbs_allocated, 0)) 820 { 821 if (pstr->icase) 822 build_upper_buffer (pstr); 823 else if (pstr->trans != NULL) 824 re_string_translate_buffer (pstr); 825 } 826 else 827 pstr->valid_len = pstr->len; 828 829 pstr->cur_idx = 0; 830 return REG_NOERROR; 831} 832 833static unsigned char 834internal_function __attribute ((pure)) 835re_string_peek_byte_case (const re_string_t *pstr, int idx) 836{ 837 int ch, off; 838 839 /* Handle the common (easiest) cases first. */ 840 if (BE (!pstr->mbs_allocated, 1)) 841 return re_string_peek_byte (pstr, idx); 842 843#ifdef RE_ENABLE_I18N 844 if (pstr->mb_cur_max > 1 845 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx)) 846 return re_string_peek_byte (pstr, idx); 847#endif 848 849 off = pstr->cur_idx + idx; 850#ifdef RE_ENABLE_I18N 851 if (pstr->offsets_needed) 852 off = pstr->offsets[off]; 853#endif 854 855 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off]; 856 857#ifdef RE_ENABLE_I18N 858 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I 859 this function returns CAPITAL LETTER I instead of first byte of 860 DOTLESS SMALL LETTER I. The latter would confuse the parser, 861 since peek_byte_case doesn't advance cur_idx in any way. */ 862 if (pstr->offsets_needed && !isascii (ch)) 863 return re_string_peek_byte (pstr, idx); 864#endif 865 866 return ch; 867} 868 869static unsigned char 870internal_function 871re_string_fetch_byte_case (re_string_t *pstr) 872{ 873 if (BE (!pstr->mbs_allocated, 1)) 874 return re_string_fetch_byte (pstr); 875 876#ifdef RE_ENABLE_I18N 877 if (pstr->offsets_needed) 878 { 879 int off, ch; 880 881 /* For tr_TR.UTF-8 [[:islower:]] there is 882 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip 883 in that case the whole multi-byte character and return 884 the original letter. On the other side, with 885 [[: DOTLESS SMALL LETTER I return [[:I, as doing 886 anything else would complicate things too much. */ 887 888 if (!re_string_first_byte (pstr, pstr->cur_idx)) 889 return re_string_fetch_byte (pstr); 890 891 off = pstr->offsets[pstr->cur_idx]; 892 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off]; 893 894 if (! isascii (ch)) 895 return re_string_fetch_byte (pstr); 896 897 re_string_skip_bytes (pstr, 898 re_string_char_size_at (pstr, pstr->cur_idx)); 899 return ch; 900 } 901#endif 902 903 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++]; 904} 905 906static void 907internal_function 908re_string_destruct (re_string_t *pstr) 909{ 910#ifdef RE_ENABLE_I18N 911 re_free (pstr->wcs); 912 re_free (pstr->offsets); 913#endif /* RE_ENABLE_I18N */ 914 if (pstr->mbs_allocated) 915 re_free (pstr->mbs); 916} 917 918/* Return the context at IDX in INPUT. */ 919 920static unsigned int 921internal_function 922re_string_context_at (const re_string_t *input, int idx, int eflags) 923{ 924 int c; 925 if (BE (idx < 0, 0)) 926 /* In this case, we use the value stored in input->tip_context, 927 since we can't know the character in input->mbs[-1] here. */ 928 return input->tip_context; 929 if (BE (idx == input->len, 0)) 930 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF 931 : CONTEXT_NEWLINE | CONTEXT_ENDBUF); 932#ifdef RE_ENABLE_I18N 933 if (input->mb_cur_max > 1) 934 { 935 wint_t wc; 936 int wc_idx = idx; 937 while(input->wcs[wc_idx] == WEOF) 938 { 939#ifdef DEBUG 940 /* It must not happen. */ 941 assert (wc_idx >= 0); 942#endif 943 --wc_idx; 944 if (wc_idx < 0) 945 return input->tip_context; 946 } 947 wc = input->wcs[wc_idx]; 948 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc)) 949 return CONTEXT_WORD; 950 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor 951 ? CONTEXT_NEWLINE : 0); 952 } 953 else 954#endif 955 { 956 c = re_string_byte_at (input, idx); 957 if (bitset_contain (input->word_char, c)) 958 return CONTEXT_WORD; 959 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0; 960 } 961} 962 963/* Functions for set operation. */ 964 965static reg_errcode_t 966internal_function __attribute_warn_unused_result__ 967re_node_set_alloc (re_node_set *set, int size) 968{ 969 set->alloc = size; 970 set->nelem = 0; 971 set->elems = re_malloc (int, size); 972 if (BE (set->elems == NULL, 0)) 973 return REG_ESPACE; 974 return REG_NOERROR; 975} 976 977static reg_errcode_t 978internal_function __attribute_warn_unused_result__ 979re_node_set_init_1 (re_node_set *set, int elem) 980{ 981 set->alloc = 1; 982 set->nelem = 1; 983 set->elems = re_malloc (int, 1); 984 if (BE (set->elems == NULL, 0)) 985 { 986 set->alloc = set->nelem = 0; 987 return REG_ESPACE; 988 } 989 set->elems[0] = elem; 990 return REG_NOERROR; 991} 992 993static reg_errcode_t 994internal_function __attribute_warn_unused_result__ 995re_node_set_init_2 (re_node_set *set, int elem1, int elem2) 996{ 997 set->alloc = 2; 998 set->elems = re_malloc (int, 2); 999 if (BE (set->elems == NULL, 0)) 1000 return REG_ESPACE; 1001 if (elem1 == elem2) 1002 { 1003 set->nelem = 1; 1004 set->elems[0] = elem1; 1005 } 1006 else 1007 { 1008 set->nelem = 2; 1009 if (elem1 < elem2) 1010 { 1011 set->elems[0] = elem1; 1012 set->elems[1] = elem2; 1013 } 1014 else 1015 { 1016 set->elems[0] = elem2; 1017 set->elems[1] = elem1; 1018 } 1019 } 1020 return REG_NOERROR; 1021} 1022 1023static reg_errcode_t 1024internal_function __attribute_warn_unused_result__ 1025re_node_set_init_copy (re_node_set *dest, const re_node_set *src) 1026{ 1027 dest->nelem = src->nelem; 1028 if (src->nelem > 0) 1029 { 1030 dest->alloc = dest->nelem; 1031 dest->elems = re_malloc (int, dest->alloc); 1032 if (BE (dest->elems == NULL, 0)) 1033 { 1034 dest->alloc = dest->nelem = 0; 1035 return REG_ESPACE; 1036 } 1037 memcpy (dest->elems, src->elems, src->nelem * sizeof (int)); 1038 } 1039 else 1040 re_node_set_init_empty (dest); 1041 return REG_NOERROR; 1042} 1043 1044/* Calculate the intersection of the sets SRC1 and SRC2. And merge it to 1045 DEST. Return value indicate the error code or REG_NOERROR if succeeded. 1046 Note: We assume dest->elems is NULL, when dest->alloc is 0. */ 1047 1048static reg_errcode_t 1049internal_function __attribute_warn_unused_result__ 1050re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1, 1051 const re_node_set *src2) 1052{ 1053 int i1, i2, is, id, delta, sbase; 1054 if (src1->nelem == 0 || src2->nelem == 0) 1055 return REG_NOERROR; 1056 1057 /* We need dest->nelem + 2 * elems_in_intersection; this is a 1058 conservative estimate. */ 1059 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc) 1060 { 1061 int new_alloc = src1->nelem + src2->nelem + dest->alloc; 1062 int *new_elems = re_realloc (dest->elems, int, new_alloc); 1063 if (BE (new_elems == NULL, 0)) 1064 return REG_ESPACE; 1065 dest->elems = new_elems; 1066 dest->alloc = new_alloc; 1067 } 1068 1069 /* Find the items in the intersection of SRC1 and SRC2, and copy 1070 into the top of DEST those that are not already in DEST itself. */ 1071 sbase = dest->nelem + src1->nelem + src2->nelem; 1072 i1 = src1->nelem - 1; 1073 i2 = src2->nelem - 1; 1074 id = dest->nelem - 1; 1075 for (;;) 1076 { 1077 if (src1->elems[i1] == src2->elems[i2]) 1078 { 1079 /* Try to find the item in DEST. Maybe we could binary search? */ 1080 while (id >= 0 && dest->elems[id] > src1->elems[i1]) 1081 --id; 1082 1083 if (id < 0 || dest->elems[id] != src1->elems[i1]) 1084 dest->elems[--sbase] = src1->elems[i1]; 1085 1086 if (--i1 < 0 || --i2 < 0) 1087 break; 1088 } 1089 1090 /* Lower the highest of the two items. */ 1091 else if (src1->elems[i1] < src2->elems[i2]) 1092 { 1093 if (--i2 < 0) 1094 break; 1095 } 1096 else 1097 { 1098 if (--i1 < 0) 1099 break; 1100 } 1101 } 1102 1103 id = dest->nelem - 1; 1104 is = dest->nelem + src1->nelem + src2->nelem - 1; 1105 delta = is - sbase + 1; 1106 1107 /* Now copy. When DELTA becomes zero, the remaining 1108 DEST elements are already in place; this is more or 1109 less the same loop that is in re_node_set_merge. */ 1110 dest->nelem += delta; 1111 if (delta > 0 && id >= 0) 1112 for (;;) 1113 { 1114 if (dest->elems[is] > dest->elems[id]) 1115 { 1116 /* Copy from the top. */ 1117 dest->elems[id + delta--] = dest->elems[is--]; 1118 if (delta == 0) 1119 break; 1120 } 1121 else 1122 { 1123 /* Slide from the bottom. */ 1124 dest->elems[id + delta] = dest->elems[id]; 1125 if (--id < 0) 1126 break; 1127 } 1128 } 1129 1130 /* Copy remaining SRC elements. */ 1131 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int)); 1132 1133 return REG_NOERROR; 1134} 1135 1136/* Calculate the union set of the sets SRC1 and SRC2. And store it to 1137 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */ 1138 1139static reg_errcode_t 1140internal_function __attribute_warn_unused_result__ 1141re_node_set_init_union (re_node_set *dest, const re_node_set *src1, 1142 const re_node_set *src2) 1143{ 1144 int i1, i2, id; 1145 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0) 1146 { 1147 dest->alloc = src1->nelem + src2->nelem; 1148 dest->elems = re_malloc (int, dest->alloc); 1149 if (BE (dest->elems == NULL, 0)) 1150 return REG_ESPACE; 1151 } 1152 else 1153 { 1154 if (src1 != NULL && src1->nelem > 0) 1155 return re_node_set_init_copy (dest, src1); 1156 else if (src2 != NULL && src2->nelem > 0) 1157 return re_node_set_init_copy (dest, src2); 1158 else 1159 re_node_set_init_empty (dest); 1160 return REG_NOERROR; 1161 } 1162 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;) 1163 { 1164 if (src1->elems[i1] > src2->elems[i2]) 1165 { 1166 dest->elems[id++] = src2->elems[i2++]; 1167 continue; 1168 } 1169 if (src1->elems[i1] == src2->elems[i2]) 1170 ++i2; 1171 dest->elems[id++] = src1->elems[i1++]; 1172 } 1173 if (i1 < src1->nelem) 1174 { 1175 memcpy (dest->elems + id, src1->elems + i1, 1176 (src1->nelem - i1) * sizeof (int)); 1177 id += src1->nelem - i1; 1178 } 1179 else if (i2 < src2->nelem) 1180 { 1181 memcpy (dest->elems + id, src2->elems + i2, 1182 (src2->nelem - i2) * sizeof (int)); 1183 id += src2->nelem - i2; 1184 } 1185 dest->nelem = id; 1186 return REG_NOERROR; 1187} 1188 1189/* Calculate the union set of the sets DEST and SRC. And store it to 1190 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */ 1191 1192static reg_errcode_t 1193internal_function __attribute_warn_unused_result__ 1194re_node_set_merge (re_node_set *dest, const re_node_set *src) 1195{ 1196 int is, id, sbase, delta; 1197 if (src == NULL || src->nelem == 0) 1198 return REG_NOERROR; 1199 if (dest->alloc < 2 * src->nelem + dest->nelem) 1200 { 1201 int new_alloc = 2 * (src->nelem + dest->alloc); 1202 int *new_buffer = re_realloc (dest->elems, int, new_alloc); 1203 if (BE (new_buffer == NULL, 0)) 1204 return REG_ESPACE; 1205 dest->elems = new_buffer; 1206 dest->alloc = new_alloc; 1207 } 1208 1209 if (BE (dest->nelem == 0, 0)) 1210 { 1211 dest->nelem = src->nelem; 1212 memcpy (dest->elems, src->elems, src->nelem * sizeof (int)); 1213 return REG_NOERROR; 1214 } 1215 1216 /* Copy into the top of DEST the items of SRC that are not 1217 found in DEST. Maybe we could binary search in DEST? */ 1218 for (sbase = dest->nelem + 2 * src->nelem, 1219 is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; ) 1220 { 1221 if (dest->elems[id] == src->elems[is]) 1222 is--, id--; 1223 else if (dest->elems[id] < src->elems[is]) 1224 dest->elems[--sbase] = src->elems[is--]; 1225 else /* if (dest->elems[id] > src->elems[is]) */ 1226 --id; 1227 } 1228 1229 if (is >= 0) 1230 { 1231 /* If DEST is exhausted, the remaining items of SRC must be unique. */ 1232 sbase -= is + 1; 1233 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int)); 1234 } 1235 1236 id = dest->nelem - 1; 1237 is = dest->nelem + 2 * src->nelem - 1; 1238 delta = is - sbase + 1; 1239 if (delta == 0) 1240 return REG_NOERROR; 1241 1242 /* Now copy. When DELTA becomes zero, the remaining 1243 DEST elements are already in place. */ 1244 dest->nelem += delta; 1245 for (;;) 1246 { 1247 if (dest->elems[is] > dest->elems[id]) 1248 { 1249 /* Copy from the top. */ 1250 dest->elems[id + delta--] = dest->elems[is--]; 1251 if (delta == 0) 1252 break; 1253 } 1254 else 1255 { 1256 /* Slide from the bottom. */ 1257 dest->elems[id + delta] = dest->elems[id]; 1258 if (--id < 0) 1259 { 1260 /* Copy remaining SRC elements. */ 1261 memcpy (dest->elems, dest->elems + sbase, 1262 delta * sizeof (int)); 1263 break; 1264 } 1265 } 1266 } 1267 1268 return REG_NOERROR; 1269} 1270 1271/* Insert the new element ELEM to the re_node_set* SET. 1272 SET should not already have ELEM. 1273 return -1 if an error is occured, return 1 otherwise. */ 1274 1275static int 1276internal_function __attribute_warn_unused_result__ 1277re_node_set_insert (re_node_set *set, int elem) 1278{ 1279 int idx; 1280 /* In case the set is empty. */ 1281 if (set->alloc == 0) 1282 { 1283 if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1)) 1284 return 1; 1285 else 1286 return -1; 1287 } 1288 1289 if (BE (set->nelem, 0) == 0) 1290 { 1291 /* We already guaranteed above that set->alloc != 0. */ 1292 set->elems[0] = elem; 1293 ++set->nelem; 1294 return 1; 1295 } 1296 1297 /* Realloc if we need. */ 1298 if (set->alloc == set->nelem) 1299 { 1300 int *new_elems; 1301 set->alloc = set->alloc * 2; 1302 new_elems = re_realloc (set->elems, int, set->alloc); 1303 if (BE (new_elems == NULL, 0)) 1304 return -1; 1305 set->elems = new_elems; 1306 } 1307 1308 /* Move the elements which follows the new element. Test the 1309 first element separately to skip a check in the inner loop. */ 1310 if (elem < set->elems[0]) 1311 { 1312 idx = 0; 1313 for (idx = set->nelem; idx > 0; idx--) 1314 set->elems[idx] = set->elems[idx - 1]; 1315 } 1316 else 1317 { 1318 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--) 1319 set->elems[idx] = set->elems[idx - 1]; 1320 } 1321 1322 /* Insert the new element. */ 1323 set->elems[idx] = elem; 1324 ++set->nelem; 1325 return 1; 1326} 1327 1328/* Insert the new element ELEM to the re_node_set* SET. 1329 SET should not already have any element greater than or equal to ELEM. 1330 Return -1 if an error is occured, return 1 otherwise. */ 1331 1332static int 1333internal_function __attribute_warn_unused_result__ 1334re_node_set_insert_last (re_node_set *set, int elem) 1335{ 1336 /* Realloc if we need. */ 1337 if (set->alloc == set->nelem) 1338 { 1339 int *new_elems; 1340 set->alloc = (set->alloc + 1) * 2; 1341 new_elems = re_realloc (set->elems, int, set->alloc); 1342 if (BE (new_elems == NULL, 0)) 1343 return -1; 1344 set->elems = new_elems; 1345 } 1346 1347 /* Insert the new element. */ 1348 set->elems[set->nelem++] = elem; 1349 return 1; 1350} 1351 1352/* Compare two node sets SET1 and SET2. 1353 return 1 if SET1 and SET2 are equivalent, return 0 otherwise. */ 1354 1355static int 1356internal_function __attribute ((pure)) 1357re_node_set_compare (const re_node_set *set1, const re_node_set *set2) 1358{ 1359 int i; 1360 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem) 1361 return 0; 1362 for (i = set1->nelem ; --i >= 0 ; ) 1363 if (set1->elems[i] != set2->elems[i]) 1364 return 0; 1365 return 1; 1366} 1367 1368/* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */ 1369 1370static int 1371internal_function __attribute ((pure)) 1372re_node_set_contains (const re_node_set *set, int elem) 1373{ 1374 unsigned int idx, right, mid; 1375 if (set->nelem <= 0) 1376 return 0; 1377 1378 /* Binary search the element. */ 1379 idx = 0; 1380 right = set->nelem - 1; 1381 while (idx < right) 1382 { 1383 mid = (idx + right) / 2; 1384 if (set->elems[mid] < elem) 1385 idx = mid + 1; 1386 else 1387 right = mid; 1388 } 1389 return set->elems[idx] == elem ? idx + 1 : 0; 1390} 1391 1392static void 1393internal_function 1394re_node_set_remove_at (re_node_set *set, int idx) 1395{ 1396 if (idx < 0 || idx >= set->nelem) 1397 return; 1398 --set->nelem; 1399 for (; idx < set->nelem; idx++) 1400 set->elems[idx] = set->elems[idx + 1]; 1401} 1402 1403 1404/* Add the token TOKEN to dfa->nodes, and return the index of the token. 1405 Or return -1, if an error will be occured. */ 1406 1407static int 1408internal_function 1409re_dfa_add_node (re_dfa_t *dfa, re_token_t token) 1410{ 1411 int type = token.type; 1412 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0)) 1413 { 1414 size_t new_nodes_alloc = dfa->nodes_alloc * 2; 1415 int *new_nexts, *new_indices; 1416 re_node_set *new_edests, *new_eclosures; 1417 re_token_t *new_nodes; 1418 1419 /* Avoid overflows in realloc. */ 1420 const size_t max_object_size = MAX (sizeof (re_token_t), 1421 MAX (sizeof (re_node_set), 1422 sizeof (int))); 1423 if (BE (SIZE_MAX / max_object_size < new_nodes_alloc, 0)) 1424 return -1; 1425 1426 new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc); 1427 if (BE (new_nodes == NULL, 0)) 1428 return -1; 1429 dfa->nodes = new_nodes; 1430 new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc); 1431 new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc); 1432 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc); 1433 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc); 1434 if (BE (new_nexts == NULL || new_indices == NULL 1435 || new_edests == NULL || new_eclosures == NULL, 0)) 1436 return -1; 1437 dfa->nexts = new_nexts; 1438 dfa->org_indices = new_indices; 1439 dfa->edests = new_edests; 1440 dfa->eclosures = new_eclosures; 1441 dfa->nodes_alloc = new_nodes_alloc; 1442 } 1443 dfa->nodes[dfa->nodes_len] = token; 1444 dfa->nodes[dfa->nodes_len].constraint = 0; 1445#ifdef RE_ENABLE_I18N 1446 dfa->nodes[dfa->nodes_len].accept_mb = 1447 (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET; 1448#endif 1449 dfa->nexts[dfa->nodes_len] = -1; 1450 re_node_set_init_empty (dfa->edests + dfa->nodes_len); 1451 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len); 1452 return dfa->nodes_len++; 1453} 1454 1455static inline unsigned int 1456internal_function 1457calc_state_hash (const re_node_set *nodes, unsigned int context) 1458{ 1459 unsigned int hash = nodes->nelem + context; 1460 int i; 1461 for (i = 0 ; i < nodes->nelem ; i++) 1462 hash += nodes->elems[i]; 1463 return hash; 1464} 1465 1466/* Search for the state whose node_set is equivalent to NODES. 1467 Return the pointer to the state, if we found it in the DFA. 1468 Otherwise create the new one and return it. In case of an error 1469 return NULL and set the error code in ERR. 1470 Note: - We assume NULL as the invalid state, then it is possible that 1471 return value is NULL and ERR is REG_NOERROR. 1472 - We never return non-NULL value in case of any errors, it is for 1473 optimization. */ 1474 1475static re_dfastate_t * 1476internal_function __attribute_warn_unused_result__ 1477re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa, 1478 const re_node_set *nodes) 1479{ 1480 unsigned int hash; 1481 re_dfastate_t *new_state; 1482 struct re_state_table_entry *spot; 1483 int i; 1484 if (BE (nodes->nelem == 0, 0)) 1485 { 1486 *err = REG_NOERROR; 1487 return NULL; 1488 } 1489 hash = calc_state_hash (nodes, 0); 1490 spot = dfa->state_table + (hash & dfa->state_hash_mask); 1491 1492 for (i = 0 ; i < spot->num ; i++) 1493 { 1494 re_dfastate_t *state = spot->array[i]; 1495 if (hash != state->hash) 1496 continue; 1497 if (re_node_set_compare (&state->nodes, nodes)) 1498 return state; 1499 } 1500 1501 /* There are no appropriate state in the dfa, create the new one. */ 1502 new_state = create_ci_newstate (dfa, nodes, hash); 1503 if (BE (new_state == NULL, 0)) 1504 *err = REG_ESPACE; 1505 1506 return new_state; 1507} 1508 1509/* Search for the state whose node_set is equivalent to NODES and 1510 whose context is equivalent to CONTEXT. 1511 Return the pointer to the state, if we found it in the DFA. 1512 Otherwise create the new one and return it. In case of an error 1513 return NULL and set the error code in ERR. 1514 Note: - We assume NULL as the invalid state, then it is possible that 1515 return value is NULL and ERR is REG_NOERROR. 1516 - We never return non-NULL value in case of any errors, it is for 1517 optimization. */ 1518 1519static re_dfastate_t * 1520internal_function __attribute_warn_unused_result__ 1521re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa, 1522 const re_node_set *nodes, unsigned int context) 1523{ 1524 unsigned int hash; 1525 re_dfastate_t *new_state; 1526 struct re_state_table_entry *spot; 1527 int i; 1528 if (nodes->nelem == 0) 1529 { 1530 *err = REG_NOERROR; 1531 return NULL; 1532 } 1533 hash = calc_state_hash (nodes, context); 1534 spot = dfa->state_table + (hash & dfa->state_hash_mask); 1535 1536 for (i = 0 ; i < spot->num ; i++) 1537 { 1538 re_dfastate_t *state = spot->array[i]; 1539 if (state->hash == hash 1540 && state->context == context 1541 && re_node_set_compare (state->entrance_nodes, nodes)) 1542 return state; 1543 } 1544 /* There are no appropriate state in `dfa', create the new one. */ 1545 new_state = create_cd_newstate (dfa, nodes, context, hash); 1546 if (BE (new_state == NULL, 0)) 1547 *err = REG_ESPACE; 1548 1549 return new_state; 1550} 1551 1552/* Finish initialization of the new state NEWSTATE, and using its hash value 1553 HASH put in the appropriate bucket of DFA's state table. Return value 1554 indicates the error code if failed. */ 1555 1556static reg_errcode_t 1557__attribute_warn_unused_result__ 1558register_state (const re_dfa_t *dfa, re_dfastate_t *newstate, 1559 unsigned int hash) 1560{ 1561 struct re_state_table_entry *spot; 1562 reg_errcode_t err; 1563 int i; 1564 1565 newstate->hash = hash; 1566 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem); 1567 if (BE (err != REG_NOERROR, 0)) 1568 return REG_ESPACE; 1569 for (i = 0; i < newstate->nodes.nelem; i++) 1570 { 1571 int elem = newstate->nodes.elems[i]; 1572 if (!IS_EPSILON_NODE (dfa->nodes[elem].type)) 1573 if (re_node_set_insert_last (&newstate->non_eps_nodes, elem) < 0) 1574 return REG_ESPACE; 1575 } 1576 1577 spot = dfa->state_table + (hash & dfa->state_hash_mask); 1578 if (BE (spot->alloc <= spot->num, 0)) 1579 { 1580 int new_alloc = 2 * spot->num + 2; 1581 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *, 1582 new_alloc); 1583 if (BE (new_array == NULL, 0)) 1584 return REG_ESPACE; 1585 spot->array = new_array; 1586 spot->alloc = new_alloc; 1587 } 1588 spot->array[spot->num++] = newstate; 1589 return REG_NOERROR; 1590} 1591 1592static void 1593free_state (re_dfastate_t *state) 1594{ 1595 re_node_set_free (&state->non_eps_nodes); 1596 re_node_set_free (&state->inveclosure); 1597 if (state->entrance_nodes != &state->nodes) 1598 { 1599 re_node_set_free (state->entrance_nodes); 1600 re_free (state->entrance_nodes); 1601 } 1602 re_node_set_free (&state->nodes); 1603 re_free (state->word_trtable); 1604 re_free (state->trtable); 1605 re_free (state); 1606} 1607 1608/* Create the new state which is independ of contexts. 1609 Return the new state if succeeded, otherwise return NULL. */ 1610 1611static re_dfastate_t * 1612internal_function __attribute_warn_unused_result__ 1613create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes, 1614 unsigned int hash) 1615{ 1616 int i; 1617 reg_errcode_t err; 1618 re_dfastate_t *newstate; 1619 1620 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1); 1621 if (BE (newstate == NULL, 0)) 1622 return NULL; 1623 err = re_node_set_init_copy (&newstate->nodes, nodes); 1624 if (BE (err != REG_NOERROR, 0)) 1625 { 1626 re_free (newstate); 1627 return NULL; 1628 } 1629 1630 newstate->entrance_nodes = &newstate->nodes; 1631 for (i = 0 ; i < nodes->nelem ; i++) 1632 { 1633 re_token_t *node = dfa->nodes + nodes->elems[i]; 1634 re_token_type_t type = node->type; 1635 if (type == CHARACTER && !node->constraint) 1636 continue; 1637#ifdef RE_ENABLE_I18N 1638 newstate->accept_mb |= node->accept_mb; 1639#endif /* RE_ENABLE_I18N */ 1640 1641 /* If the state has the halt node, the state is a halt state. */ 1642 if (type == END_OF_RE) 1643 newstate->halt = 1; 1644 else if (type == OP_BACK_REF) 1645 newstate->has_backref = 1; 1646 else if (type == ANCHOR || node->constraint) 1647 newstate->has_constraint = 1; 1648 } 1649 err = register_state (dfa, newstate, hash); 1650 if (BE (err != REG_NOERROR, 0)) 1651 { 1652 free_state (newstate); 1653 newstate = NULL; 1654 } 1655 return newstate; 1656} 1657 1658/* Create the new state which is depend on the context CONTEXT. 1659 Return the new state if succeeded, otherwise return NULL. */ 1660 1661static re_dfastate_t * 1662internal_function __attribute_warn_unused_result__ 1663create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes, 1664 unsigned int context, unsigned int hash) 1665{ 1666 int i, nctx_nodes = 0; 1667 reg_errcode_t err; 1668 re_dfastate_t *newstate; 1669 1670 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1); 1671 if (BE (newstate == NULL, 0)) 1672 return NULL; 1673 err = re_node_set_init_copy (&newstate->nodes, nodes); 1674 if (BE (err != REG_NOERROR, 0)) 1675 { 1676 re_free (newstate); 1677 return NULL; 1678 } 1679 1680 newstate->context = context; 1681 newstate->entrance_nodes = &newstate->nodes; 1682 1683 for (i = 0 ; i < nodes->nelem ; i++) 1684 { 1685 re_token_t *node = dfa->nodes + nodes->elems[i]; 1686 re_token_type_t type = node->type; 1687 unsigned int constraint = node->constraint; 1688 1689 if (type == CHARACTER && !constraint) 1690 continue; 1691#ifdef RE_ENABLE_I18N 1692 newstate->accept_mb |= node->accept_mb; 1693#endif /* RE_ENABLE_I18N */ 1694 1695 /* If the state has the halt node, the state is a halt state. */ 1696 if (type == END_OF_RE) 1697 newstate->halt = 1; 1698 else if (type == OP_BACK_REF) 1699 newstate->has_backref = 1; 1700 1701 if (constraint) 1702 { 1703 if (newstate->entrance_nodes == &newstate->nodes) 1704 { 1705 newstate->entrance_nodes = re_malloc (re_node_set, 1); 1706 if (BE (newstate->entrance_nodes == NULL, 0)) 1707 { 1708 free_state (newstate); 1709 return NULL; 1710 } 1711 if (re_node_set_init_copy (newstate->entrance_nodes, nodes) 1712 != REG_NOERROR) 1713 return NULL; 1714 nctx_nodes = 0; 1715 newstate->has_constraint = 1; 1716 } 1717 1718 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context)) 1719 { 1720 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes); 1721 ++nctx_nodes; 1722 } 1723 } 1724 } 1725 err = register_state (dfa, newstate, hash); 1726 if (BE (err != REG_NOERROR, 0)) 1727 { 1728 free_state (newstate); 1729 newstate = NULL; 1730 } 1731 return newstate; 1732} 1733