1146040Stjr/* Extended regular expression matching and search library. 2250724Sjkim Copyright (C) 2002-2006, 2010, 2011 Free Software Foundation, Inc. 3146040Stjr This file is part of the GNU C Library. 4146040Stjr Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. 5146040Stjr 6146040Stjr The GNU C Library is free software; you can redistribute it and/or 7146040Stjr modify it under the terms of the GNU Lesser General Public 8146040Stjr License as published by the Free Software Foundation; either 9146040Stjr version 2.1 of the License, or (at your option) any later version. 10146040Stjr 11146040Stjr The GNU C Library is distributed in the hope that it will be useful, 12146040Stjr but WITHOUT ANY WARRANTY; without even the implied warranty of 13146040Stjr MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14146040Stjr Lesser General Public License for more details. 15146040Stjr 16146040Stjr You should have received a copy of the GNU Lesser General Public 17250724Sjkim License along with the GNU C Library; if not, see 18250724Sjkim <http://www.gnu.org/licenses/>. */ 19146040Stjr 20146040Stjrstatic void re_string_construct_common (const char *str, int len, 21146040Stjr re_string_t *pstr, 22146040Stjr RE_TRANSLATE_TYPE trans, int icase, 23146040Stjr const re_dfa_t *dfa) internal_function; 24250724Sjkimstatic re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa, 25146040Stjr const re_node_set *nodes, 26146040Stjr unsigned int hash) internal_function; 27250724Sjkimstatic re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa, 28146040Stjr const re_node_set *nodes, 29146040Stjr unsigned int context, 30146040Stjr unsigned int hash) internal_function; 31146040Stjr 32146040Stjr/* Functions for string operation. */ 33146040Stjr 34146040Stjr/* This function allocate the buffers. It is necessary to call 35146040Stjr re_string_reconstruct before using the object. */ 36146040Stjr 37146040Stjrstatic reg_errcode_t 38250724Sjkiminternal_function __attribute_warn_unused_result__ 39250724Sjkimre_string_allocate (re_string_t *pstr, const char *str, int len, int init_len, 40250724Sjkim RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa) 41146040Stjr{ 42146040Stjr reg_errcode_t ret; 43146040Stjr int init_buf_len; 44146040Stjr 45146040Stjr /* Ensure at least one character fits into the buffers. */ 46146040Stjr if (init_len < dfa->mb_cur_max) 47146040Stjr init_len = dfa->mb_cur_max; 48146040Stjr init_buf_len = (len + 1 < init_len) ? len + 1: init_len; 49146040Stjr re_string_construct_common (str, len, pstr, trans, icase, dfa); 50146040Stjr 51146040Stjr ret = re_string_realloc_buffers (pstr, init_buf_len); 52146040Stjr if (BE (ret != REG_NOERROR, 0)) 53146040Stjr return ret; 54146040Stjr 55146040Stjr pstr->word_char = dfa->word_char; 56146040Stjr pstr->word_ops_used = dfa->word_ops_used; 57146040Stjr pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str; 58146040Stjr pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len; 59146040Stjr pstr->valid_raw_len = pstr->valid_len; 60146040Stjr return REG_NOERROR; 61146040Stjr} 62146040Stjr 63146040Stjr/* This function allocate the buffers, and initialize them. */ 64146040Stjr 65146040Stjrstatic reg_errcode_t 66250724Sjkiminternal_function __attribute_warn_unused_result__ 67250724Sjkimre_string_construct (re_string_t *pstr, const char *str, int len, 68250724Sjkim RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa) 69146040Stjr{ 70146040Stjr reg_errcode_t ret; 71146040Stjr memset (pstr, '\0', sizeof (re_string_t)); 72146040Stjr re_string_construct_common (str, len, pstr, trans, icase, dfa); 73146040Stjr 74146040Stjr if (len > 0) 75146040Stjr { 76146040Stjr ret = re_string_realloc_buffers (pstr, len + 1); 77146040Stjr if (BE (ret != REG_NOERROR, 0)) 78146040Stjr return ret; 79146040Stjr } 80146040Stjr pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str; 81146040Stjr 82146040Stjr if (icase) 83146040Stjr { 84146040Stjr#ifdef RE_ENABLE_I18N 85146040Stjr if (dfa->mb_cur_max > 1) 86146040Stjr { 87146040Stjr while (1) 88146040Stjr { 89146040Stjr ret = build_wcs_upper_buffer (pstr); 90146040Stjr if (BE (ret != REG_NOERROR, 0)) 91146040Stjr return ret; 92146040Stjr if (pstr->valid_raw_len >= len) 93146040Stjr break; 94146040Stjr if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max) 95146040Stjr break; 96146040Stjr ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2); 97146040Stjr if (BE (ret != REG_NOERROR, 0)) 98146040Stjr return ret; 99146040Stjr } 100146040Stjr } 101146040Stjr else 102146040Stjr#endif /* RE_ENABLE_I18N */ 103146040Stjr build_upper_buffer (pstr); 104146040Stjr } 105146040Stjr else 106146040Stjr { 107146040Stjr#ifdef RE_ENABLE_I18N 108146040Stjr if (dfa->mb_cur_max > 1) 109146040Stjr build_wcs_buffer (pstr); 110146040Stjr else 111146040Stjr#endif /* RE_ENABLE_I18N */ 112146040Stjr { 113146040Stjr if (trans != NULL) 114146040Stjr re_string_translate_buffer (pstr); 115146040Stjr else 116146040Stjr { 117146040Stjr pstr->valid_len = pstr->bufs_len; 118146040Stjr pstr->valid_raw_len = pstr->bufs_len; 119146040Stjr } 120146040Stjr } 121146040Stjr } 122146040Stjr 123146040Stjr return REG_NOERROR; 124146040Stjr} 125146040Stjr 126146040Stjr/* Helper functions for re_string_allocate, and re_string_construct. */ 127146040Stjr 128146040Stjrstatic reg_errcode_t 129250724Sjkiminternal_function __attribute_warn_unused_result__ 130250724Sjkimre_string_realloc_buffers (re_string_t *pstr, int new_buf_len) 131146040Stjr{ 132146040Stjr#ifdef RE_ENABLE_I18N 133146040Stjr if (pstr->mb_cur_max > 1) 134146040Stjr { 135250724Sjkim wint_t *new_wcs; 136250724Sjkim 137250724Sjkim /* Avoid overflow in realloc. */ 138250724Sjkim const size_t max_object_size = MAX (sizeof (wint_t), sizeof (int)); 139250724Sjkim if (BE (SIZE_MAX / max_object_size < new_buf_len, 0)) 140146040Stjr return REG_ESPACE; 141250724Sjkim 142250724Sjkim new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len); 143250724Sjkim if (BE (new_wcs == NULL, 0)) 144250724Sjkim return REG_ESPACE; 145250724Sjkim pstr->wcs = new_wcs; 146146040Stjr if (pstr->offsets != NULL) 147146040Stjr { 148250724Sjkim int *new_offsets = re_realloc (pstr->offsets, int, new_buf_len); 149250724Sjkim if (BE (new_offsets == NULL, 0)) 150146040Stjr return REG_ESPACE; 151250724Sjkim pstr->offsets = new_offsets; 152146040Stjr } 153146040Stjr } 154146040Stjr#endif /* RE_ENABLE_I18N */ 155146040Stjr if (pstr->mbs_allocated) 156146040Stjr { 157250724Sjkim unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char, 158250724Sjkim new_buf_len); 159250724Sjkim if (BE (new_mbs == NULL, 0)) 160146040Stjr return REG_ESPACE; 161250724Sjkim pstr->mbs = new_mbs; 162146040Stjr } 163146040Stjr pstr->bufs_len = new_buf_len; 164146040Stjr return REG_NOERROR; 165146040Stjr} 166146040Stjr 167146040Stjr 168146040Stjrstatic void 169250724Sjkiminternal_function 170250724Sjkimre_string_construct_common (const char *str, int len, re_string_t *pstr, 171250724Sjkim RE_TRANSLATE_TYPE trans, int icase, 172250724Sjkim const re_dfa_t *dfa) 173146040Stjr{ 174146040Stjr pstr->raw_mbs = (const unsigned char *) str; 175146040Stjr pstr->len = len; 176146040Stjr pstr->raw_len = len; 177250724Sjkim pstr->trans = trans; 178146040Stjr pstr->icase = icase ? 1 : 0; 179146040Stjr pstr->mbs_allocated = (trans != NULL || icase); 180146040Stjr pstr->mb_cur_max = dfa->mb_cur_max; 181146040Stjr pstr->is_utf8 = dfa->is_utf8; 182146040Stjr pstr->map_notascii = dfa->map_notascii; 183146040Stjr pstr->stop = pstr->len; 184146040Stjr pstr->raw_stop = pstr->stop; 185146040Stjr} 186146040Stjr 187146040Stjr#ifdef RE_ENABLE_I18N 188146040Stjr 189146040Stjr/* Build wide character buffer PSTR->WCS. 190146040Stjr If the byte sequence of the string are: 191146040Stjr <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3> 192146040Stjr Then wide character buffer will be: 193146040Stjr <wc1> , WEOF , <wc2> , WEOF , <wc3> 194146040Stjr We use WEOF for padding, they indicate that the position isn't 195146040Stjr a first byte of a multibyte character. 196146040Stjr 197146040Stjr Note that this function assumes PSTR->VALID_LEN elements are already 198146040Stjr built and starts from PSTR->VALID_LEN. */ 199146040Stjr 200146040Stjrstatic void 201250724Sjkiminternal_function 202250724Sjkimbuild_wcs_buffer (re_string_t *pstr) 203146040Stjr{ 204146040Stjr#ifdef _LIBC 205250724Sjkim unsigned char buf[MB_LEN_MAX]; 206250724Sjkim assert (MB_LEN_MAX >= pstr->mb_cur_max); 207146040Stjr#else 208146040Stjr unsigned char buf[64]; 209146040Stjr#endif 210146040Stjr mbstate_t prev_st; 211146040Stjr int byte_idx, end_idx, remain_len; 212146040Stjr size_t mbclen; 213146040Stjr 214146040Stjr /* Build the buffers from pstr->valid_len to either pstr->len or 215146040Stjr pstr->bufs_len. */ 216146040Stjr end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 217146040Stjr for (byte_idx = pstr->valid_len; byte_idx < end_idx;) 218146040Stjr { 219146040Stjr wchar_t wc; 220146040Stjr const char *p; 221146040Stjr 222146040Stjr remain_len = end_idx - byte_idx; 223146040Stjr prev_st = pstr->cur_state; 224146040Stjr /* Apply the translation if we need. */ 225146040Stjr if (BE (pstr->trans != NULL, 0)) 226146040Stjr { 227146040Stjr int i, ch; 228146040Stjr 229146040Stjr for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i) 230146040Stjr { 231146040Stjr ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i]; 232146040Stjr buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch]; 233146040Stjr } 234146040Stjr p = (const char *) buf; 235146040Stjr } 236146040Stjr else 237146040Stjr p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx; 238250724Sjkim mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state); 239250724Sjkim if (BE (mbclen == (size_t) -1 || mbclen == 0 240250724Sjkim || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len), 0)) 241146040Stjr { 242146040Stjr /* We treat these cases as a singlebyte character. */ 243146040Stjr mbclen = 1; 244146040Stjr wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]; 245146040Stjr if (BE (pstr->trans != NULL, 0)) 246146040Stjr wc = pstr->trans[wc]; 247146040Stjr pstr->cur_state = prev_st; 248146040Stjr } 249250724Sjkim else if (BE (mbclen == (size_t) -2, 0)) 250250724Sjkim { 251250724Sjkim /* The buffer doesn't have enough space, finish to build. */ 252250724Sjkim pstr->cur_state = prev_st; 253250724Sjkim break; 254250724Sjkim } 255146040Stjr 256146040Stjr /* Write wide character and padding. */ 257146040Stjr pstr->wcs[byte_idx++] = wc; 258146040Stjr /* Write paddings. */ 259146040Stjr for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) 260146040Stjr pstr->wcs[byte_idx++] = WEOF; 261146040Stjr } 262146040Stjr pstr->valid_len = byte_idx; 263146040Stjr pstr->valid_raw_len = byte_idx; 264146040Stjr} 265146040Stjr 266146040Stjr/* Build wide character buffer PSTR->WCS like build_wcs_buffer, 267146040Stjr but for REG_ICASE. */ 268146040Stjr 269250724Sjkimstatic reg_errcode_t 270250724Sjkiminternal_function __attribute_warn_unused_result__ 271250724Sjkimbuild_wcs_upper_buffer (re_string_t *pstr) 272146040Stjr{ 273146040Stjr mbstate_t prev_st; 274146040Stjr int src_idx, byte_idx, end_idx, remain_len; 275146040Stjr size_t mbclen; 276146040Stjr#ifdef _LIBC 277250724Sjkim char buf[MB_LEN_MAX]; 278250724Sjkim assert (MB_LEN_MAX >= pstr->mb_cur_max); 279146040Stjr#else 280146040Stjr char buf[64]; 281146040Stjr#endif 282146040Stjr 283146040Stjr byte_idx = pstr->valid_len; 284146040Stjr end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 285146040Stjr 286146040Stjr /* The following optimization assumes that ASCII characters can be 287146040Stjr mapped to wide characters with a simple cast. */ 288146040Stjr if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed) 289146040Stjr { 290146040Stjr while (byte_idx < end_idx) 291146040Stjr { 292146040Stjr wchar_t wc; 293146040Stjr 294146040Stjr if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]) 295146040Stjr && mbsinit (&pstr->cur_state)) 296146040Stjr { 297146040Stjr /* In case of a singlebyte character. */ 298146040Stjr pstr->mbs[byte_idx] 299146040Stjr = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]); 300146040Stjr /* The next step uses the assumption that wchar_t is encoded 301146040Stjr ASCII-safe: all ASCII values can be converted like this. */ 302146040Stjr pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx]; 303146040Stjr ++byte_idx; 304146040Stjr continue; 305146040Stjr } 306146040Stjr 307146040Stjr remain_len = end_idx - byte_idx; 308146040Stjr prev_st = pstr->cur_state; 309250724Sjkim mbclen = __mbrtowc (&wc, 310250724Sjkim ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx 311250724Sjkim + byte_idx), remain_len, &pstr->cur_state); 312146040Stjr if (BE (mbclen + 2 > 2, 1)) 313146040Stjr { 314146040Stjr wchar_t wcu = wc; 315146040Stjr if (iswlower (wc)) 316146040Stjr { 317146040Stjr size_t mbcdlen; 318146040Stjr 319146040Stjr wcu = towupper (wc); 320146040Stjr mbcdlen = wcrtomb (buf, wcu, &prev_st); 321146040Stjr if (BE (mbclen == mbcdlen, 1)) 322146040Stjr memcpy (pstr->mbs + byte_idx, buf, mbclen); 323146040Stjr else 324146040Stjr { 325146040Stjr src_idx = byte_idx; 326146040Stjr goto offsets_needed; 327146040Stjr } 328146040Stjr } 329146040Stjr else 330146040Stjr memcpy (pstr->mbs + byte_idx, 331146040Stjr pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen); 332146040Stjr pstr->wcs[byte_idx++] = wcu; 333146040Stjr /* Write paddings. */ 334146040Stjr for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) 335146040Stjr pstr->wcs[byte_idx++] = WEOF; 336146040Stjr } 337250724Sjkim else if (mbclen == (size_t) -1 || mbclen == 0 338250724Sjkim || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len)) 339146040Stjr { 340250724Sjkim /* It is an invalid character, an incomplete character 341250724Sjkim at the end of the string, or '\0'. Just use the byte. */ 342146040Stjr int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]; 343146040Stjr pstr->mbs[byte_idx] = ch; 344146040Stjr /* And also cast it to wide char. */ 345146040Stjr pstr->wcs[byte_idx++] = (wchar_t) ch; 346146040Stjr if (BE (mbclen == (size_t) -1, 0)) 347146040Stjr pstr->cur_state = prev_st; 348146040Stjr } 349146040Stjr else 350146040Stjr { 351146040Stjr /* The buffer doesn't have enough space, finish to build. */ 352146040Stjr pstr->cur_state = prev_st; 353146040Stjr break; 354146040Stjr } 355146040Stjr } 356146040Stjr pstr->valid_len = byte_idx; 357146040Stjr pstr->valid_raw_len = byte_idx; 358146040Stjr return REG_NOERROR; 359146040Stjr } 360146040Stjr else 361146040Stjr for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;) 362146040Stjr { 363146040Stjr wchar_t wc; 364146040Stjr const char *p; 365146040Stjr offsets_needed: 366146040Stjr remain_len = end_idx - byte_idx; 367146040Stjr prev_st = pstr->cur_state; 368146040Stjr if (BE (pstr->trans != NULL, 0)) 369146040Stjr { 370146040Stjr int i, ch; 371146040Stjr 372146040Stjr for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i) 373146040Stjr { 374146040Stjr ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i]; 375146040Stjr buf[i] = pstr->trans[ch]; 376146040Stjr } 377146040Stjr p = (const char *) buf; 378146040Stjr } 379146040Stjr else 380146040Stjr p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx; 381250724Sjkim mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state); 382146040Stjr if (BE (mbclen + 2 > 2, 1)) 383146040Stjr { 384146040Stjr wchar_t wcu = wc; 385146040Stjr if (iswlower (wc)) 386146040Stjr { 387146040Stjr size_t mbcdlen; 388146040Stjr 389146040Stjr wcu = towupper (wc); 390146040Stjr mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st); 391146040Stjr if (BE (mbclen == mbcdlen, 1)) 392146040Stjr memcpy (pstr->mbs + byte_idx, buf, mbclen); 393146040Stjr else if (mbcdlen != (size_t) -1) 394146040Stjr { 395146040Stjr size_t i; 396146040Stjr 397146040Stjr if (byte_idx + mbcdlen > pstr->bufs_len) 398146040Stjr { 399146040Stjr pstr->cur_state = prev_st; 400146040Stjr break; 401146040Stjr } 402146040Stjr 403146040Stjr if (pstr->offsets == NULL) 404146040Stjr { 405146040Stjr pstr->offsets = re_malloc (int, pstr->bufs_len); 406146040Stjr 407146040Stjr if (pstr->offsets == NULL) 408146040Stjr return REG_ESPACE; 409146040Stjr } 410146040Stjr if (!pstr->offsets_needed) 411146040Stjr { 412146040Stjr for (i = 0; i < (size_t) byte_idx; ++i) 413146040Stjr pstr->offsets[i] = i; 414146040Stjr pstr->offsets_needed = 1; 415146040Stjr } 416146040Stjr 417146040Stjr memcpy (pstr->mbs + byte_idx, buf, mbcdlen); 418146040Stjr pstr->wcs[byte_idx] = wcu; 419146040Stjr pstr->offsets[byte_idx] = src_idx; 420146040Stjr for (i = 1; i < mbcdlen; ++i) 421146040Stjr { 422146040Stjr pstr->offsets[byte_idx + i] 423146040Stjr = src_idx + (i < mbclen ? i : mbclen - 1); 424146040Stjr pstr->wcs[byte_idx + i] = WEOF; 425146040Stjr } 426146040Stjr pstr->len += mbcdlen - mbclen; 427146040Stjr if (pstr->raw_stop > src_idx) 428146040Stjr pstr->stop += mbcdlen - mbclen; 429146040Stjr end_idx = (pstr->bufs_len > pstr->len) 430146040Stjr ? pstr->len : pstr->bufs_len; 431146040Stjr byte_idx += mbcdlen; 432146040Stjr src_idx += mbclen; 433146040Stjr continue; 434146040Stjr } 435250724Sjkim else 436250724Sjkim memcpy (pstr->mbs + byte_idx, p, mbclen); 437146040Stjr } 438146040Stjr else 439146040Stjr memcpy (pstr->mbs + byte_idx, p, mbclen); 440146040Stjr 441146040Stjr if (BE (pstr->offsets_needed != 0, 0)) 442146040Stjr { 443146040Stjr size_t i; 444146040Stjr for (i = 0; i < mbclen; ++i) 445146040Stjr pstr->offsets[byte_idx + i] = src_idx + i; 446146040Stjr } 447146040Stjr src_idx += mbclen; 448146040Stjr 449146040Stjr pstr->wcs[byte_idx++] = wcu; 450146040Stjr /* Write paddings. */ 451146040Stjr for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) 452146040Stjr pstr->wcs[byte_idx++] = WEOF; 453146040Stjr } 454250724Sjkim else if (mbclen == (size_t) -1 || mbclen == 0 455250724Sjkim || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len)) 456146040Stjr { 457146040Stjr /* It is an invalid character or '\0'. Just use the byte. */ 458146040Stjr int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx]; 459146040Stjr 460146040Stjr if (BE (pstr->trans != NULL, 0)) 461146040Stjr ch = pstr->trans [ch]; 462146040Stjr pstr->mbs[byte_idx] = ch; 463146040Stjr 464146040Stjr if (BE (pstr->offsets_needed != 0, 0)) 465146040Stjr pstr->offsets[byte_idx] = src_idx; 466146040Stjr ++src_idx; 467146040Stjr 468146040Stjr /* And also cast it to wide char. */ 469146040Stjr pstr->wcs[byte_idx++] = (wchar_t) ch; 470146040Stjr if (BE (mbclen == (size_t) -1, 0)) 471146040Stjr pstr->cur_state = prev_st; 472146040Stjr } 473146040Stjr else 474146040Stjr { 475146040Stjr /* The buffer doesn't have enough space, finish to build. */ 476146040Stjr pstr->cur_state = prev_st; 477146040Stjr break; 478146040Stjr } 479146040Stjr } 480146040Stjr pstr->valid_len = byte_idx; 481146040Stjr pstr->valid_raw_len = src_idx; 482146040Stjr return REG_NOERROR; 483146040Stjr} 484146040Stjr 485146040Stjr/* Skip characters until the index becomes greater than NEW_RAW_IDX. 486146040Stjr Return the index. */ 487146040Stjr 488146040Stjrstatic int 489250724Sjkiminternal_function 490250724Sjkimre_string_skip_chars (re_string_t *pstr, int new_raw_idx, wint_t *last_wc) 491146040Stjr{ 492146040Stjr mbstate_t prev_st; 493146040Stjr int rawbuf_idx; 494146040Stjr size_t mbclen; 495250724Sjkim wint_t wc = WEOF; 496146040Stjr 497146040Stjr /* Skip the characters which are not necessary to check. */ 498146040Stjr for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len; 499146040Stjr rawbuf_idx < new_raw_idx;) 500146040Stjr { 501250724Sjkim wchar_t wc2; 502250724Sjkim int remain_len = pstr->raw_len - rawbuf_idx; 503146040Stjr prev_st = pstr->cur_state; 504250724Sjkim mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx, 505250724Sjkim remain_len, &pstr->cur_state); 506250724Sjkim if (BE ((ssize_t) mbclen <= 0, 0)) 507146040Stjr { 508250724Sjkim /* We treat these cases as a single byte character. */ 509250724Sjkim if (mbclen == 0 || remain_len == 0) 510250724Sjkim wc = L'\0'; 511250724Sjkim else 512250724Sjkim wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx); 513146040Stjr mbclen = 1; 514146040Stjr pstr->cur_state = prev_st; 515146040Stjr } 516250724Sjkim else 517250724Sjkim wc = (wint_t) wc2; 518146040Stjr /* Then proceed the next character. */ 519146040Stjr rawbuf_idx += mbclen; 520146040Stjr } 521250724Sjkim *last_wc = wc; 522146040Stjr return rawbuf_idx; 523146040Stjr} 524146040Stjr#endif /* RE_ENABLE_I18N */ 525146040Stjr 526146040Stjr/* Build the buffer PSTR->MBS, and apply the translation if we need. 527146040Stjr This function is used in case of REG_ICASE. */ 528146040Stjr 529146040Stjrstatic void 530250724Sjkiminternal_function 531250724Sjkimbuild_upper_buffer (re_string_t *pstr) 532146040Stjr{ 533146040Stjr int char_idx, end_idx; 534146040Stjr end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 535146040Stjr 536146040Stjr for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx) 537146040Stjr { 538146040Stjr int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx]; 539146040Stjr if (BE (pstr->trans != NULL, 0)) 540146040Stjr ch = pstr->trans[ch]; 541146040Stjr if (islower (ch)) 542146040Stjr pstr->mbs[char_idx] = toupper (ch); 543146040Stjr else 544146040Stjr pstr->mbs[char_idx] = ch; 545146040Stjr } 546146040Stjr pstr->valid_len = char_idx; 547146040Stjr pstr->valid_raw_len = char_idx; 548146040Stjr} 549146040Stjr 550146040Stjr/* Apply TRANS to the buffer in PSTR. */ 551146040Stjr 552146040Stjrstatic void 553250724Sjkiminternal_function 554250724Sjkimre_string_translate_buffer (re_string_t *pstr) 555146040Stjr{ 556146040Stjr int buf_idx, end_idx; 557146040Stjr end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 558146040Stjr 559146040Stjr for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx) 560146040Stjr { 561146040Stjr int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx]; 562146040Stjr pstr->mbs[buf_idx] = pstr->trans[ch]; 563146040Stjr } 564146040Stjr 565146040Stjr pstr->valid_len = buf_idx; 566146040Stjr pstr->valid_raw_len = buf_idx; 567146040Stjr} 568146040Stjr 569146040Stjr/* This function re-construct the buffers. 570146040Stjr Concretely, convert to wide character in case of pstr->mb_cur_max > 1, 571146040Stjr convert to upper case in case of REG_ICASE, apply translation. */ 572146040Stjr 573146040Stjrstatic reg_errcode_t 574250724Sjkiminternal_function __attribute_warn_unused_result__ 575250724Sjkimre_string_reconstruct (re_string_t *pstr, int idx, int eflags) 576146040Stjr{ 577146040Stjr int offset = idx - pstr->raw_mbs_idx; 578146040Stjr if (BE (offset < 0, 0)) 579146040Stjr { 580146040Stjr /* Reset buffer. */ 581146040Stjr#ifdef RE_ENABLE_I18N 582146040Stjr if (pstr->mb_cur_max > 1) 583146040Stjr memset (&pstr->cur_state, '\0', sizeof (mbstate_t)); 584146040Stjr#endif /* RE_ENABLE_I18N */ 585146040Stjr pstr->len = pstr->raw_len; 586146040Stjr pstr->stop = pstr->raw_stop; 587146040Stjr pstr->valid_len = 0; 588146040Stjr pstr->raw_mbs_idx = 0; 589146040Stjr pstr->valid_raw_len = 0; 590146040Stjr pstr->offsets_needed = 0; 591146040Stjr pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF 592146040Stjr : CONTEXT_NEWLINE | CONTEXT_BEGBUF); 593146040Stjr if (!pstr->mbs_allocated) 594146040Stjr pstr->mbs = (unsigned char *) pstr->raw_mbs; 595146040Stjr offset = idx; 596146040Stjr } 597146040Stjr 598146040Stjr if (BE (offset != 0, 1)) 599146040Stjr { 600250724Sjkim /* Should the already checked characters be kept? */ 601250724Sjkim if (BE (offset < pstr->valid_raw_len, 1)) 602146040Stjr { 603146040Stjr /* Yes, move them to the front of the buffer. */ 604146040Stjr#ifdef RE_ENABLE_I18N 605250724Sjkim if (BE (pstr->offsets_needed, 0)) 606250724Sjkim { 607250724Sjkim int low = 0, high = pstr->valid_len, mid; 608250724Sjkim do 609250724Sjkim { 610250724Sjkim mid = (high + low) / 2; 611250724Sjkim if (pstr->offsets[mid] > offset) 612250724Sjkim high = mid; 613250724Sjkim else if (pstr->offsets[mid] < offset) 614250724Sjkim low = mid + 1; 615250724Sjkim else 616250724Sjkim break; 617250724Sjkim } 618250724Sjkim while (low < high); 619250724Sjkim if (pstr->offsets[mid] < offset) 620250724Sjkim ++mid; 621250724Sjkim pstr->tip_context = re_string_context_at (pstr, mid - 1, 622250724Sjkim eflags); 623250724Sjkim /* This can be quite complicated, so handle specially 624250724Sjkim only the common and easy case where the character with 625250724Sjkim different length representation of lower and upper 626250724Sjkim case is present at or after offset. */ 627250724Sjkim if (pstr->valid_len > offset 628250724Sjkim && mid == offset && pstr->offsets[mid] == offset) 629250724Sjkim { 630250724Sjkim memmove (pstr->wcs, pstr->wcs + offset, 631250724Sjkim (pstr->valid_len - offset) * sizeof (wint_t)); 632250724Sjkim memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset); 633250724Sjkim pstr->valid_len -= offset; 634250724Sjkim pstr->valid_raw_len -= offset; 635250724Sjkim for (low = 0; low < pstr->valid_len; low++) 636250724Sjkim pstr->offsets[low] = pstr->offsets[low + offset] - offset; 637250724Sjkim } 638250724Sjkim else 639250724Sjkim { 640250724Sjkim /* Otherwise, just find out how long the partial multibyte 641250724Sjkim character at offset is and fill it with WEOF/255. */ 642250724Sjkim pstr->len = pstr->raw_len - idx + offset; 643250724Sjkim pstr->stop = pstr->raw_stop - idx + offset; 644250724Sjkim pstr->offsets_needed = 0; 645250724Sjkim while (mid > 0 && pstr->offsets[mid - 1] == offset) 646250724Sjkim --mid; 647250724Sjkim while (mid < pstr->valid_len) 648250724Sjkim if (pstr->wcs[mid] != WEOF) 649250724Sjkim break; 650250724Sjkim else 651250724Sjkim ++mid; 652250724Sjkim if (mid == pstr->valid_len) 653250724Sjkim pstr->valid_len = 0; 654250724Sjkim else 655250724Sjkim { 656250724Sjkim pstr->valid_len = pstr->offsets[mid] - offset; 657250724Sjkim if (pstr->valid_len) 658250724Sjkim { 659250724Sjkim for (low = 0; low < pstr->valid_len; ++low) 660250724Sjkim pstr->wcs[low] = WEOF; 661250724Sjkim memset (pstr->mbs, 255, pstr->valid_len); 662250724Sjkim } 663250724Sjkim } 664250724Sjkim pstr->valid_raw_len = pstr->valid_len; 665250724Sjkim } 666250724Sjkim } 667250724Sjkim else 668250724Sjkim#endif 669250724Sjkim { 670250724Sjkim pstr->tip_context = re_string_context_at (pstr, offset - 1, 671250724Sjkim eflags); 672250724Sjkim#ifdef RE_ENABLE_I18N 673250724Sjkim if (pstr->mb_cur_max > 1) 674250724Sjkim memmove (pstr->wcs, pstr->wcs + offset, 675250724Sjkim (pstr->valid_len - offset) * sizeof (wint_t)); 676146040Stjr#endif /* RE_ENABLE_I18N */ 677250724Sjkim if (BE (pstr->mbs_allocated, 0)) 678250724Sjkim memmove (pstr->mbs, pstr->mbs + offset, 679250724Sjkim pstr->valid_len - offset); 680250724Sjkim pstr->valid_len -= offset; 681250724Sjkim pstr->valid_raw_len -= offset; 682146040Stjr#if DEBUG 683250724Sjkim assert (pstr->valid_len > 0); 684146040Stjr#endif 685250724Sjkim } 686146040Stjr } 687146040Stjr else 688146040Stjr { 689146040Stjr /* No, skip all characters until IDX. */ 690250724Sjkim int prev_valid_len = pstr->valid_len; 691250724Sjkim 692146040Stjr#ifdef RE_ENABLE_I18N 693146040Stjr if (BE (pstr->offsets_needed, 0)) 694146040Stjr { 695146040Stjr pstr->len = pstr->raw_len - idx + offset; 696146040Stjr pstr->stop = pstr->raw_stop - idx + offset; 697146040Stjr pstr->offsets_needed = 0; 698146040Stjr } 699146040Stjr#endif 700146040Stjr pstr->valid_len = 0; 701146040Stjr#ifdef RE_ENABLE_I18N 702146040Stjr if (pstr->mb_cur_max > 1) 703146040Stjr { 704146040Stjr int wcs_idx; 705146040Stjr wint_t wc = WEOF; 706146040Stjr 707146040Stjr if (pstr->is_utf8) 708146040Stjr { 709250724Sjkim const unsigned char *raw, *p, *end; 710146040Stjr 711146040Stjr /* Special case UTF-8. Multi-byte chars start with any 712146040Stjr byte other than 0x80 - 0xbf. */ 713146040Stjr raw = pstr->raw_mbs + pstr->raw_mbs_idx; 714146040Stjr end = raw + (offset - pstr->mb_cur_max); 715250724Sjkim if (end < pstr->raw_mbs) 716250724Sjkim end = pstr->raw_mbs; 717250724Sjkim p = raw + offset - 1; 718250724Sjkim#ifdef _LIBC 719250724Sjkim /* We know the wchar_t encoding is UCS4, so for the simple 720250724Sjkim case, ASCII characters, skip the conversion step. */ 721250724Sjkim if (isascii (*p) && BE (pstr->trans == NULL, 1)) 722250724Sjkim { 723250724Sjkim memset (&pstr->cur_state, '\0', sizeof (mbstate_t)); 724250724Sjkim /* pstr->valid_len = 0; */ 725250724Sjkim wc = (wchar_t) *p; 726250724Sjkim } 727250724Sjkim else 728250724Sjkim#endif 729250724Sjkim for (; p >= end; --p) 730250724Sjkim if ((*p & 0xc0) != 0x80) 731250724Sjkim { 732250724Sjkim mbstate_t cur_state; 733250724Sjkim wchar_t wc2; 734250724Sjkim int mlen = raw + pstr->len - p; 735250724Sjkim unsigned char buf[6]; 736250724Sjkim size_t mbclen; 737146040Stjr 738250724Sjkim const unsigned char *pp = p; 739250724Sjkim if (BE (pstr->trans != NULL, 0)) 740250724Sjkim { 741250724Sjkim int i = mlen < 6 ? mlen : 6; 742250724Sjkim while (--i >= 0) 743250724Sjkim buf[i] = pstr->trans[p[i]]; 744250724Sjkim pp = buf; 745250724Sjkim } 746250724Sjkim /* XXX Don't use mbrtowc, we know which conversion 747250724Sjkim to use (UTF-8 -> UCS4). */ 748250724Sjkim memset (&cur_state, 0, sizeof (cur_state)); 749250724Sjkim mbclen = __mbrtowc (&wc2, (const char *) pp, mlen, 750250724Sjkim &cur_state); 751250724Sjkim if (raw + offset - p <= mbclen 752250724Sjkim && mbclen < (size_t) -2) 753250724Sjkim { 754250724Sjkim memset (&pstr->cur_state, '\0', 755250724Sjkim sizeof (mbstate_t)); 756250724Sjkim pstr->valid_len = mbclen - (raw + offset - p); 757250724Sjkim wc = wc2; 758250724Sjkim } 759250724Sjkim break; 760250724Sjkim } 761146040Stjr } 762146040Stjr 763146040Stjr if (wc == WEOF) 764146040Stjr pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx; 765250724Sjkim if (wc == WEOF) 766250724Sjkim pstr->tip_context 767250724Sjkim = re_string_context_at (pstr, prev_valid_len - 1, eflags); 768250724Sjkim else 769250724Sjkim pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0) 770250724Sjkim && IS_WIDE_WORD_CHAR (wc)) 771250724Sjkim ? CONTEXT_WORD 772250724Sjkim : ((IS_WIDE_NEWLINE (wc) 773250724Sjkim && pstr->newline_anchor) 774250724Sjkim ? CONTEXT_NEWLINE : 0)); 775146040Stjr if (BE (pstr->valid_len, 0)) 776146040Stjr { 777146040Stjr for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx) 778146040Stjr pstr->wcs[wcs_idx] = WEOF; 779146040Stjr if (pstr->mbs_allocated) 780146040Stjr memset (pstr->mbs, 255, pstr->valid_len); 781146040Stjr } 782146040Stjr pstr->valid_raw_len = pstr->valid_len; 783146040Stjr } 784146040Stjr else 785146040Stjr#endif /* RE_ENABLE_I18N */ 786146040Stjr { 787146040Stjr int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1]; 788250724Sjkim pstr->valid_raw_len = 0; 789146040Stjr if (pstr->trans) 790146040Stjr c = pstr->trans[c]; 791146040Stjr pstr->tip_context = (bitset_contain (pstr->word_char, c) 792146040Stjr ? CONTEXT_WORD 793146040Stjr : ((IS_NEWLINE (c) && pstr->newline_anchor) 794146040Stjr ? CONTEXT_NEWLINE : 0)); 795146040Stjr } 796146040Stjr } 797146040Stjr if (!BE (pstr->mbs_allocated, 0)) 798146040Stjr pstr->mbs += offset; 799146040Stjr } 800146040Stjr pstr->raw_mbs_idx = idx; 801146040Stjr pstr->len -= offset; 802146040Stjr pstr->stop -= offset; 803146040Stjr 804146040Stjr /* Then build the buffers. */ 805146040Stjr#ifdef RE_ENABLE_I18N 806146040Stjr if (pstr->mb_cur_max > 1) 807146040Stjr { 808146040Stjr if (pstr->icase) 809146040Stjr { 810250724Sjkim reg_errcode_t ret = build_wcs_upper_buffer (pstr); 811146040Stjr if (BE (ret != REG_NOERROR, 0)) 812146040Stjr return ret; 813146040Stjr } 814146040Stjr else 815146040Stjr build_wcs_buffer (pstr); 816146040Stjr } 817146040Stjr else 818146040Stjr#endif /* RE_ENABLE_I18N */ 819250724Sjkim if (BE (pstr->mbs_allocated, 0)) 820250724Sjkim { 821250724Sjkim if (pstr->icase) 822250724Sjkim build_upper_buffer (pstr); 823250724Sjkim else if (pstr->trans != NULL) 824250724Sjkim re_string_translate_buffer (pstr); 825250724Sjkim } 826250724Sjkim else 827250724Sjkim pstr->valid_len = pstr->len; 828146040Stjr 829146040Stjr pstr->cur_idx = 0; 830146040Stjr return REG_NOERROR; 831146040Stjr} 832146040Stjr 833146040Stjrstatic unsigned char 834250724Sjkiminternal_function __attribute ((pure)) 835250724Sjkimre_string_peek_byte_case (const re_string_t *pstr, int idx) 836146040Stjr{ 837146040Stjr int ch, off; 838146040Stjr 839146040Stjr /* Handle the common (easiest) cases first. */ 840146040Stjr if (BE (!pstr->mbs_allocated, 1)) 841146040Stjr return re_string_peek_byte (pstr, idx); 842146040Stjr 843146040Stjr#ifdef RE_ENABLE_I18N 844146040Stjr if (pstr->mb_cur_max > 1 845146040Stjr && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx)) 846146040Stjr return re_string_peek_byte (pstr, idx); 847146040Stjr#endif 848146040Stjr 849146040Stjr off = pstr->cur_idx + idx; 850146040Stjr#ifdef RE_ENABLE_I18N 851146040Stjr if (pstr->offsets_needed) 852146040Stjr off = pstr->offsets[off]; 853146040Stjr#endif 854146040Stjr 855146040Stjr ch = pstr->raw_mbs[pstr->raw_mbs_idx + off]; 856146040Stjr 857146040Stjr#ifdef RE_ENABLE_I18N 858146040Stjr /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I 859146040Stjr this function returns CAPITAL LETTER I instead of first byte of 860146040Stjr DOTLESS SMALL LETTER I. The latter would confuse the parser, 861146040Stjr since peek_byte_case doesn't advance cur_idx in any way. */ 862146040Stjr if (pstr->offsets_needed && !isascii (ch)) 863146040Stjr return re_string_peek_byte (pstr, idx); 864146040Stjr#endif 865146040Stjr 866146040Stjr return ch; 867146040Stjr} 868146040Stjr 869146040Stjrstatic unsigned char 870250724Sjkiminternal_function 871250724Sjkimre_string_fetch_byte_case (re_string_t *pstr) 872146040Stjr{ 873146040Stjr if (BE (!pstr->mbs_allocated, 1)) 874146040Stjr return re_string_fetch_byte (pstr); 875146040Stjr 876146040Stjr#ifdef RE_ENABLE_I18N 877146040Stjr if (pstr->offsets_needed) 878146040Stjr { 879146040Stjr int off, ch; 880146040Stjr 881146040Stjr /* For tr_TR.UTF-8 [[:islower:]] there is 882146040Stjr [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip 883146040Stjr in that case the whole multi-byte character and return 884146040Stjr the original letter. On the other side, with 885146040Stjr [[: DOTLESS SMALL LETTER I return [[:I, as doing 886146040Stjr anything else would complicate things too much. */ 887146040Stjr 888146040Stjr if (!re_string_first_byte (pstr, pstr->cur_idx)) 889146040Stjr return re_string_fetch_byte (pstr); 890146040Stjr 891146040Stjr off = pstr->offsets[pstr->cur_idx]; 892146040Stjr ch = pstr->raw_mbs[pstr->raw_mbs_idx + off]; 893146040Stjr 894146040Stjr if (! isascii (ch)) 895146040Stjr return re_string_fetch_byte (pstr); 896146040Stjr 897146040Stjr re_string_skip_bytes (pstr, 898146040Stjr re_string_char_size_at (pstr, pstr->cur_idx)); 899146040Stjr return ch; 900146040Stjr } 901146040Stjr#endif 902146040Stjr 903146040Stjr return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++]; 904146040Stjr} 905146040Stjr 906146040Stjrstatic void 907250724Sjkiminternal_function 908250724Sjkimre_string_destruct (re_string_t *pstr) 909146040Stjr{ 910146040Stjr#ifdef RE_ENABLE_I18N 911146040Stjr re_free (pstr->wcs); 912146040Stjr re_free (pstr->offsets); 913146040Stjr#endif /* RE_ENABLE_I18N */ 914146040Stjr if (pstr->mbs_allocated) 915146040Stjr re_free (pstr->mbs); 916146040Stjr} 917146040Stjr 918146040Stjr/* Return the context at IDX in INPUT. */ 919146040Stjr 920146040Stjrstatic unsigned int 921250724Sjkiminternal_function 922250724Sjkimre_string_context_at (const re_string_t *input, int idx, int eflags) 923146040Stjr{ 924146040Stjr int c; 925146040Stjr if (BE (idx < 0, 0)) 926146040Stjr /* In this case, we use the value stored in input->tip_context, 927146040Stjr since we can't know the character in input->mbs[-1] here. */ 928146040Stjr return input->tip_context; 929146040Stjr if (BE (idx == input->len, 0)) 930146040Stjr return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF 931146040Stjr : CONTEXT_NEWLINE | CONTEXT_ENDBUF); 932146040Stjr#ifdef RE_ENABLE_I18N 933146040Stjr if (input->mb_cur_max > 1) 934146040Stjr { 935146040Stjr wint_t wc; 936146040Stjr int wc_idx = idx; 937146040Stjr while(input->wcs[wc_idx] == WEOF) 938146040Stjr { 939146040Stjr#ifdef DEBUG 940146040Stjr /* It must not happen. */ 941146040Stjr assert (wc_idx >= 0); 942146040Stjr#endif 943146040Stjr --wc_idx; 944146040Stjr if (wc_idx < 0) 945146040Stjr return input->tip_context; 946146040Stjr } 947146040Stjr wc = input->wcs[wc_idx]; 948146040Stjr if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc)) 949146040Stjr return CONTEXT_WORD; 950146040Stjr return (IS_WIDE_NEWLINE (wc) && input->newline_anchor 951146040Stjr ? CONTEXT_NEWLINE : 0); 952146040Stjr } 953146040Stjr else 954146040Stjr#endif 955146040Stjr { 956146040Stjr c = re_string_byte_at (input, idx); 957146040Stjr if (bitset_contain (input->word_char, c)) 958146040Stjr return CONTEXT_WORD; 959146040Stjr return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0; 960146040Stjr } 961146040Stjr} 962146040Stjr 963146040Stjr/* Functions for set operation. */ 964146040Stjr 965146040Stjrstatic reg_errcode_t 966250724Sjkiminternal_function __attribute_warn_unused_result__ 967250724Sjkimre_node_set_alloc (re_node_set *set, int size) 968146040Stjr{ 969146040Stjr set->alloc = size; 970146040Stjr set->nelem = 0; 971146040Stjr set->elems = re_malloc (int, size); 972146040Stjr if (BE (set->elems == NULL, 0)) 973146040Stjr return REG_ESPACE; 974146040Stjr return REG_NOERROR; 975146040Stjr} 976146040Stjr 977146040Stjrstatic reg_errcode_t 978250724Sjkiminternal_function __attribute_warn_unused_result__ 979250724Sjkimre_node_set_init_1 (re_node_set *set, int elem) 980146040Stjr{ 981146040Stjr set->alloc = 1; 982146040Stjr set->nelem = 1; 983146040Stjr set->elems = re_malloc (int, 1); 984146040Stjr if (BE (set->elems == NULL, 0)) 985146040Stjr { 986146040Stjr set->alloc = set->nelem = 0; 987146040Stjr return REG_ESPACE; 988146040Stjr } 989146040Stjr set->elems[0] = elem; 990146040Stjr return REG_NOERROR; 991146040Stjr} 992146040Stjr 993146040Stjrstatic reg_errcode_t 994250724Sjkiminternal_function __attribute_warn_unused_result__ 995250724Sjkimre_node_set_init_2 (re_node_set *set, int elem1, int elem2) 996146040Stjr{ 997146040Stjr set->alloc = 2; 998146040Stjr set->elems = re_malloc (int, 2); 999146040Stjr if (BE (set->elems == NULL, 0)) 1000146040Stjr return REG_ESPACE; 1001146040Stjr if (elem1 == elem2) 1002146040Stjr { 1003146040Stjr set->nelem = 1; 1004146040Stjr set->elems[0] = elem1; 1005146040Stjr } 1006146040Stjr else 1007146040Stjr { 1008146040Stjr set->nelem = 2; 1009146040Stjr if (elem1 < elem2) 1010146040Stjr { 1011146040Stjr set->elems[0] = elem1; 1012146040Stjr set->elems[1] = elem2; 1013146040Stjr } 1014146040Stjr else 1015146040Stjr { 1016146040Stjr set->elems[0] = elem2; 1017146040Stjr set->elems[1] = elem1; 1018146040Stjr } 1019146040Stjr } 1020146040Stjr return REG_NOERROR; 1021146040Stjr} 1022146040Stjr 1023146040Stjrstatic reg_errcode_t 1024250724Sjkiminternal_function __attribute_warn_unused_result__ 1025250724Sjkimre_node_set_init_copy (re_node_set *dest, const re_node_set *src) 1026146040Stjr{ 1027146040Stjr dest->nelem = src->nelem; 1028146040Stjr if (src->nelem > 0) 1029146040Stjr { 1030146040Stjr dest->alloc = dest->nelem; 1031146040Stjr dest->elems = re_malloc (int, dest->alloc); 1032146040Stjr if (BE (dest->elems == NULL, 0)) 1033146040Stjr { 1034146040Stjr dest->alloc = dest->nelem = 0; 1035146040Stjr return REG_ESPACE; 1036146040Stjr } 1037146040Stjr memcpy (dest->elems, src->elems, src->nelem * sizeof (int)); 1038146040Stjr } 1039146040Stjr else 1040146040Stjr re_node_set_init_empty (dest); 1041146040Stjr return REG_NOERROR; 1042146040Stjr} 1043146040Stjr 1044146040Stjr/* Calculate the intersection of the sets SRC1 and SRC2. And merge it to 1045146040Stjr DEST. Return value indicate the error code or REG_NOERROR if succeeded. 1046146040Stjr Note: We assume dest->elems is NULL, when dest->alloc is 0. */ 1047146040Stjr 1048146040Stjrstatic reg_errcode_t 1049250724Sjkiminternal_function __attribute_warn_unused_result__ 1050250724Sjkimre_node_set_add_intersect (re_node_set *dest, const re_node_set *src1, 1051250724Sjkim const re_node_set *src2) 1052146040Stjr{ 1053146040Stjr int i1, i2, is, id, delta, sbase; 1054146040Stjr if (src1->nelem == 0 || src2->nelem == 0) 1055146040Stjr return REG_NOERROR; 1056146040Stjr 1057146040Stjr /* We need dest->nelem + 2 * elems_in_intersection; this is a 1058146040Stjr conservative estimate. */ 1059146040Stjr if (src1->nelem + src2->nelem + dest->nelem > dest->alloc) 1060146040Stjr { 1061146040Stjr int new_alloc = src1->nelem + src2->nelem + dest->alloc; 1062146040Stjr int *new_elems = re_realloc (dest->elems, int, new_alloc); 1063146040Stjr if (BE (new_elems == NULL, 0)) 1064250724Sjkim return REG_ESPACE; 1065146040Stjr dest->elems = new_elems; 1066146040Stjr dest->alloc = new_alloc; 1067146040Stjr } 1068146040Stjr 1069146040Stjr /* Find the items in the intersection of SRC1 and SRC2, and copy 1070146040Stjr into the top of DEST those that are not already in DEST itself. */ 1071146040Stjr sbase = dest->nelem + src1->nelem + src2->nelem; 1072146040Stjr i1 = src1->nelem - 1; 1073146040Stjr i2 = src2->nelem - 1; 1074146040Stjr id = dest->nelem - 1; 1075146040Stjr for (;;) 1076146040Stjr { 1077146040Stjr if (src1->elems[i1] == src2->elems[i2]) 1078146040Stjr { 1079146040Stjr /* Try to find the item in DEST. Maybe we could binary search? */ 1080146040Stjr while (id >= 0 && dest->elems[id] > src1->elems[i1]) 1081146040Stjr --id; 1082146040Stjr 1083250724Sjkim if (id < 0 || dest->elems[id] != src1->elems[i1]) 1084250724Sjkim dest->elems[--sbase] = src1->elems[i1]; 1085146040Stjr 1086146040Stjr if (--i1 < 0 || --i2 < 0) 1087146040Stjr break; 1088146040Stjr } 1089146040Stjr 1090146040Stjr /* Lower the highest of the two items. */ 1091146040Stjr else if (src1->elems[i1] < src2->elems[i2]) 1092146040Stjr { 1093146040Stjr if (--i2 < 0) 1094146040Stjr break; 1095146040Stjr } 1096146040Stjr else 1097146040Stjr { 1098146040Stjr if (--i1 < 0) 1099146040Stjr break; 1100146040Stjr } 1101146040Stjr } 1102146040Stjr 1103146040Stjr id = dest->nelem - 1; 1104146040Stjr is = dest->nelem + src1->nelem + src2->nelem - 1; 1105146040Stjr delta = is - sbase + 1; 1106146040Stjr 1107146040Stjr /* Now copy. When DELTA becomes zero, the remaining 1108146040Stjr DEST elements are already in place; this is more or 1109146040Stjr less the same loop that is in re_node_set_merge. */ 1110146040Stjr dest->nelem += delta; 1111146040Stjr if (delta > 0 && id >= 0) 1112146040Stjr for (;;) 1113146040Stjr { 1114250724Sjkim if (dest->elems[is] > dest->elems[id]) 1115250724Sjkim { 1116250724Sjkim /* Copy from the top. */ 1117250724Sjkim dest->elems[id + delta--] = dest->elems[is--]; 1118250724Sjkim if (delta == 0) 1119250724Sjkim break; 1120250724Sjkim } 1121250724Sjkim else 1122250724Sjkim { 1123250724Sjkim /* Slide from the bottom. */ 1124250724Sjkim dest->elems[id + delta] = dest->elems[id]; 1125250724Sjkim if (--id < 0) 1126250724Sjkim break; 1127250724Sjkim } 1128146040Stjr } 1129146040Stjr 1130146040Stjr /* Copy remaining SRC elements. */ 1131146040Stjr memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int)); 1132146040Stjr 1133146040Stjr return REG_NOERROR; 1134146040Stjr} 1135146040Stjr 1136146040Stjr/* Calculate the union set of the sets SRC1 and SRC2. And store it to 1137146040Stjr DEST. Return value indicate the error code or REG_NOERROR if succeeded. */ 1138146040Stjr 1139146040Stjrstatic reg_errcode_t 1140250724Sjkiminternal_function __attribute_warn_unused_result__ 1141250724Sjkimre_node_set_init_union (re_node_set *dest, const re_node_set *src1, 1142250724Sjkim const re_node_set *src2) 1143146040Stjr{ 1144146040Stjr int i1, i2, id; 1145146040Stjr if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0) 1146146040Stjr { 1147146040Stjr dest->alloc = src1->nelem + src2->nelem; 1148146040Stjr dest->elems = re_malloc (int, dest->alloc); 1149146040Stjr if (BE (dest->elems == NULL, 0)) 1150146040Stjr return REG_ESPACE; 1151146040Stjr } 1152146040Stjr else 1153146040Stjr { 1154146040Stjr if (src1 != NULL && src1->nelem > 0) 1155146040Stjr return re_node_set_init_copy (dest, src1); 1156146040Stjr else if (src2 != NULL && src2->nelem > 0) 1157146040Stjr return re_node_set_init_copy (dest, src2); 1158146040Stjr else 1159146040Stjr re_node_set_init_empty (dest); 1160146040Stjr return REG_NOERROR; 1161146040Stjr } 1162146040Stjr for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;) 1163146040Stjr { 1164146040Stjr if (src1->elems[i1] > src2->elems[i2]) 1165146040Stjr { 1166146040Stjr dest->elems[id++] = src2->elems[i2++]; 1167146040Stjr continue; 1168146040Stjr } 1169146040Stjr if (src1->elems[i1] == src2->elems[i2]) 1170146040Stjr ++i2; 1171146040Stjr dest->elems[id++] = src1->elems[i1++]; 1172146040Stjr } 1173146040Stjr if (i1 < src1->nelem) 1174146040Stjr { 1175146040Stjr memcpy (dest->elems + id, src1->elems + i1, 1176146040Stjr (src1->nelem - i1) * sizeof (int)); 1177146040Stjr id += src1->nelem - i1; 1178146040Stjr } 1179146040Stjr else if (i2 < src2->nelem) 1180146040Stjr { 1181146040Stjr memcpy (dest->elems + id, src2->elems + i2, 1182146040Stjr (src2->nelem - i2) * sizeof (int)); 1183146040Stjr id += src2->nelem - i2; 1184146040Stjr } 1185146040Stjr dest->nelem = id; 1186146040Stjr return REG_NOERROR; 1187146040Stjr} 1188146040Stjr 1189146040Stjr/* Calculate the union set of the sets DEST and SRC. And store it to 1190146040Stjr DEST. Return value indicate the error code or REG_NOERROR if succeeded. */ 1191146040Stjr 1192146040Stjrstatic reg_errcode_t 1193250724Sjkiminternal_function __attribute_warn_unused_result__ 1194250724Sjkimre_node_set_merge (re_node_set *dest, const re_node_set *src) 1195146040Stjr{ 1196146040Stjr int is, id, sbase, delta; 1197146040Stjr if (src == NULL || src->nelem == 0) 1198146040Stjr return REG_NOERROR; 1199146040Stjr if (dest->alloc < 2 * src->nelem + dest->nelem) 1200146040Stjr { 1201146040Stjr int new_alloc = 2 * (src->nelem + dest->alloc); 1202146040Stjr int *new_buffer = re_realloc (dest->elems, int, new_alloc); 1203146040Stjr if (BE (new_buffer == NULL, 0)) 1204146040Stjr return REG_ESPACE; 1205146040Stjr dest->elems = new_buffer; 1206146040Stjr dest->alloc = new_alloc; 1207146040Stjr } 1208146040Stjr 1209146040Stjr if (BE (dest->nelem == 0, 0)) 1210146040Stjr { 1211146040Stjr dest->nelem = src->nelem; 1212146040Stjr memcpy (dest->elems, src->elems, src->nelem * sizeof (int)); 1213146040Stjr return REG_NOERROR; 1214146040Stjr } 1215146040Stjr 1216146040Stjr /* Copy into the top of DEST the items of SRC that are not 1217146040Stjr found in DEST. Maybe we could binary search in DEST? */ 1218146040Stjr for (sbase = dest->nelem + 2 * src->nelem, 1219146040Stjr is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; ) 1220146040Stjr { 1221146040Stjr if (dest->elems[id] == src->elems[is]) 1222250724Sjkim is--, id--; 1223146040Stjr else if (dest->elems[id] < src->elems[is]) 1224250724Sjkim dest->elems[--sbase] = src->elems[is--]; 1225146040Stjr else /* if (dest->elems[id] > src->elems[is]) */ 1226250724Sjkim --id; 1227146040Stjr } 1228146040Stjr 1229146040Stjr if (is >= 0) 1230146040Stjr { 1231146040Stjr /* If DEST is exhausted, the remaining items of SRC must be unique. */ 1232146040Stjr sbase -= is + 1; 1233146040Stjr memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int)); 1234146040Stjr } 1235146040Stjr 1236146040Stjr id = dest->nelem - 1; 1237146040Stjr is = dest->nelem + 2 * src->nelem - 1; 1238146040Stjr delta = is - sbase + 1; 1239146040Stjr if (delta == 0) 1240146040Stjr return REG_NOERROR; 1241146040Stjr 1242146040Stjr /* Now copy. When DELTA becomes zero, the remaining 1243146040Stjr DEST elements are already in place. */ 1244146040Stjr dest->nelem += delta; 1245146040Stjr for (;;) 1246146040Stjr { 1247146040Stjr if (dest->elems[is] > dest->elems[id]) 1248250724Sjkim { 1249146040Stjr /* Copy from the top. */ 1250250724Sjkim dest->elems[id + delta--] = dest->elems[is--]; 1251146040Stjr if (delta == 0) 1252146040Stjr break; 1253146040Stjr } 1254146040Stjr else 1255250724Sjkim { 1256250724Sjkim /* Slide from the bottom. */ 1257250724Sjkim dest->elems[id + delta] = dest->elems[id]; 1258146040Stjr if (--id < 0) 1259146040Stjr { 1260146040Stjr /* Copy remaining SRC elements. */ 1261146040Stjr memcpy (dest->elems, dest->elems + sbase, 1262250724Sjkim delta * sizeof (int)); 1263146040Stjr break; 1264146040Stjr } 1265146040Stjr } 1266146040Stjr } 1267146040Stjr 1268146040Stjr return REG_NOERROR; 1269146040Stjr} 1270146040Stjr 1271146040Stjr/* Insert the new element ELEM to the re_node_set* SET. 1272146040Stjr SET should not already have ELEM. 1273146040Stjr return -1 if an error is occured, return 1 otherwise. */ 1274146040Stjr 1275146040Stjrstatic int 1276250724Sjkiminternal_function __attribute_warn_unused_result__ 1277250724Sjkimre_node_set_insert (re_node_set *set, int elem) 1278146040Stjr{ 1279146040Stjr int idx; 1280146040Stjr /* In case the set is empty. */ 1281146040Stjr if (set->alloc == 0) 1282146040Stjr { 1283146040Stjr if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1)) 1284146040Stjr return 1; 1285146040Stjr else 1286146040Stjr return -1; 1287146040Stjr } 1288146040Stjr 1289146040Stjr if (BE (set->nelem, 0) == 0) 1290146040Stjr { 1291146040Stjr /* We already guaranteed above that set->alloc != 0. */ 1292146040Stjr set->elems[0] = elem; 1293146040Stjr ++set->nelem; 1294146040Stjr return 1; 1295146040Stjr } 1296146040Stjr 1297146040Stjr /* Realloc if we need. */ 1298146040Stjr if (set->alloc == set->nelem) 1299146040Stjr { 1300250724Sjkim int *new_elems; 1301146040Stjr set->alloc = set->alloc * 2; 1302250724Sjkim new_elems = re_realloc (set->elems, int, set->alloc); 1303250724Sjkim if (BE (new_elems == NULL, 0)) 1304146040Stjr return -1; 1305250724Sjkim set->elems = new_elems; 1306146040Stjr } 1307146040Stjr 1308146040Stjr /* Move the elements which follows the new element. Test the 1309146040Stjr first element separately to skip a check in the inner loop. */ 1310146040Stjr if (elem < set->elems[0]) 1311146040Stjr { 1312146040Stjr idx = 0; 1313146040Stjr for (idx = set->nelem; idx > 0; idx--) 1314250724Sjkim set->elems[idx] = set->elems[idx - 1]; 1315146040Stjr } 1316146040Stjr else 1317146040Stjr { 1318146040Stjr for (idx = set->nelem; set->elems[idx - 1] > elem; idx--) 1319250724Sjkim set->elems[idx] = set->elems[idx - 1]; 1320146040Stjr } 1321146040Stjr 1322146040Stjr /* Insert the new element. */ 1323146040Stjr set->elems[idx] = elem; 1324146040Stjr ++set->nelem; 1325146040Stjr return 1; 1326146040Stjr} 1327146040Stjr 1328146040Stjr/* Insert the new element ELEM to the re_node_set* SET. 1329146040Stjr SET should not already have any element greater than or equal to ELEM. 1330146040Stjr Return -1 if an error is occured, return 1 otherwise. */ 1331146040Stjr 1332146040Stjrstatic int 1333250724Sjkiminternal_function __attribute_warn_unused_result__ 1334250724Sjkimre_node_set_insert_last (re_node_set *set, int elem) 1335146040Stjr{ 1336146040Stjr /* Realloc if we need. */ 1337146040Stjr if (set->alloc == set->nelem) 1338146040Stjr { 1339250724Sjkim int *new_elems; 1340146040Stjr set->alloc = (set->alloc + 1) * 2; 1341250724Sjkim new_elems = re_realloc (set->elems, int, set->alloc); 1342250724Sjkim if (BE (new_elems == NULL, 0)) 1343146040Stjr return -1; 1344250724Sjkim set->elems = new_elems; 1345146040Stjr } 1346146040Stjr 1347146040Stjr /* Insert the new element. */ 1348146040Stjr set->elems[set->nelem++] = elem; 1349146040Stjr return 1; 1350146040Stjr} 1351146040Stjr 1352146040Stjr/* Compare two node sets SET1 and SET2. 1353146040Stjr return 1 if SET1 and SET2 are equivalent, return 0 otherwise. */ 1354146040Stjr 1355146040Stjrstatic int 1356250724Sjkiminternal_function __attribute ((pure)) 1357250724Sjkimre_node_set_compare (const re_node_set *set1, const re_node_set *set2) 1358146040Stjr{ 1359146040Stjr int i; 1360146040Stjr if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem) 1361146040Stjr return 0; 1362146040Stjr for (i = set1->nelem ; --i >= 0 ; ) 1363146040Stjr if (set1->elems[i] != set2->elems[i]) 1364146040Stjr return 0; 1365146040Stjr return 1; 1366146040Stjr} 1367146040Stjr 1368146040Stjr/* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */ 1369146040Stjr 1370146040Stjrstatic int 1371250724Sjkiminternal_function __attribute ((pure)) 1372250724Sjkimre_node_set_contains (const re_node_set *set, int elem) 1373146040Stjr{ 1374146040Stjr unsigned int idx, right, mid; 1375146040Stjr if (set->nelem <= 0) 1376146040Stjr return 0; 1377146040Stjr 1378146040Stjr /* Binary search the element. */ 1379146040Stjr idx = 0; 1380146040Stjr right = set->nelem - 1; 1381146040Stjr while (idx < right) 1382146040Stjr { 1383146040Stjr mid = (idx + right) / 2; 1384146040Stjr if (set->elems[mid] < elem) 1385146040Stjr idx = mid + 1; 1386146040Stjr else 1387146040Stjr right = mid; 1388146040Stjr } 1389146040Stjr return set->elems[idx] == elem ? idx + 1 : 0; 1390146040Stjr} 1391146040Stjr 1392146040Stjrstatic void 1393250724Sjkiminternal_function 1394250724Sjkimre_node_set_remove_at (re_node_set *set, int idx) 1395146040Stjr{ 1396146040Stjr if (idx < 0 || idx >= set->nelem) 1397146040Stjr return; 1398146040Stjr --set->nelem; 1399146040Stjr for (; idx < set->nelem; idx++) 1400146040Stjr set->elems[idx] = set->elems[idx + 1]; 1401146040Stjr} 1402146040Stjr 1403146040Stjr 1404146040Stjr/* Add the token TOKEN to dfa->nodes, and return the index of the token. 1405146040Stjr Or return -1, if an error will be occured. */ 1406146040Stjr 1407146040Stjrstatic int 1408250724Sjkiminternal_function 1409250724Sjkimre_dfa_add_node (re_dfa_t *dfa, re_token_t token) 1410146040Stjr{ 1411146040Stjr int type = token.type; 1412146040Stjr if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0)) 1413146040Stjr { 1414250724Sjkim size_t new_nodes_alloc = dfa->nodes_alloc * 2; 1415146040Stjr int *new_nexts, *new_indices; 1416146040Stjr re_node_set *new_edests, *new_eclosures; 1417250724Sjkim re_token_t *new_nodes; 1418146040Stjr 1419250724Sjkim /* Avoid overflows in realloc. */ 1420250724Sjkim const size_t max_object_size = MAX (sizeof (re_token_t), 1421250724Sjkim MAX (sizeof (re_node_set), 1422250724Sjkim sizeof (int))); 1423250724Sjkim if (BE (SIZE_MAX / max_object_size < new_nodes_alloc, 0)) 1424146040Stjr return -1; 1425250724Sjkim 1426250724Sjkim new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc); 1427250724Sjkim if (BE (new_nodes == NULL, 0)) 1428250724Sjkim return -1; 1429250724Sjkim dfa->nodes = new_nodes; 1430146040Stjr new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc); 1431146040Stjr new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc); 1432146040Stjr new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc); 1433146040Stjr new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc); 1434146040Stjr if (BE (new_nexts == NULL || new_indices == NULL 1435146040Stjr || new_edests == NULL || new_eclosures == NULL, 0)) 1436146040Stjr return -1; 1437146040Stjr dfa->nexts = new_nexts; 1438146040Stjr dfa->org_indices = new_indices; 1439146040Stjr dfa->edests = new_edests; 1440146040Stjr dfa->eclosures = new_eclosures; 1441146040Stjr dfa->nodes_alloc = new_nodes_alloc; 1442146040Stjr } 1443146040Stjr dfa->nodes[dfa->nodes_len] = token; 1444146040Stjr dfa->nodes[dfa->nodes_len].constraint = 0; 1445146040Stjr#ifdef RE_ENABLE_I18N 1446146040Stjr dfa->nodes[dfa->nodes_len].accept_mb = 1447146040Stjr (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET; 1448146040Stjr#endif 1449146040Stjr dfa->nexts[dfa->nodes_len] = -1; 1450146040Stjr re_node_set_init_empty (dfa->edests + dfa->nodes_len); 1451146040Stjr re_node_set_init_empty (dfa->eclosures + dfa->nodes_len); 1452146040Stjr return dfa->nodes_len++; 1453146040Stjr} 1454146040Stjr 1455250724Sjkimstatic inline unsigned int 1456250724Sjkiminternal_function 1457250724Sjkimcalc_state_hash (const re_node_set *nodes, unsigned int context) 1458146040Stjr{ 1459146040Stjr unsigned int hash = nodes->nelem + context; 1460146040Stjr int i; 1461146040Stjr for (i = 0 ; i < nodes->nelem ; i++) 1462146040Stjr hash += nodes->elems[i]; 1463146040Stjr return hash; 1464146040Stjr} 1465146040Stjr 1466146040Stjr/* Search for the state whose node_set is equivalent to NODES. 1467146040Stjr Return the pointer to the state, if we found it in the DFA. 1468146040Stjr Otherwise create the new one and return it. In case of an error 1469146040Stjr return NULL and set the error code in ERR. 1470146040Stjr Note: - We assume NULL as the invalid state, then it is possible that 1471146040Stjr return value is NULL and ERR is REG_NOERROR. 1472146040Stjr - We never return non-NULL value in case of any errors, it is for 1473146040Stjr optimization. */ 1474146040Stjr 1475250724Sjkimstatic re_dfastate_t * 1476250724Sjkiminternal_function __attribute_warn_unused_result__ 1477250724Sjkimre_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa, 1478250724Sjkim const re_node_set *nodes) 1479146040Stjr{ 1480146040Stjr unsigned int hash; 1481146040Stjr re_dfastate_t *new_state; 1482146040Stjr struct re_state_table_entry *spot; 1483146040Stjr int i; 1484146040Stjr if (BE (nodes->nelem == 0, 0)) 1485146040Stjr { 1486146040Stjr *err = REG_NOERROR; 1487146040Stjr return NULL; 1488146040Stjr } 1489146040Stjr hash = calc_state_hash (nodes, 0); 1490146040Stjr spot = dfa->state_table + (hash & dfa->state_hash_mask); 1491146040Stjr 1492146040Stjr for (i = 0 ; i < spot->num ; i++) 1493146040Stjr { 1494146040Stjr re_dfastate_t *state = spot->array[i]; 1495146040Stjr if (hash != state->hash) 1496146040Stjr continue; 1497146040Stjr if (re_node_set_compare (&state->nodes, nodes)) 1498146040Stjr return state; 1499146040Stjr } 1500146040Stjr 1501146040Stjr /* There are no appropriate state in the dfa, create the new one. */ 1502146040Stjr new_state = create_ci_newstate (dfa, nodes, hash); 1503250724Sjkim if (BE (new_state == NULL, 0)) 1504250724Sjkim *err = REG_ESPACE; 1505250724Sjkim 1506250724Sjkim return new_state; 1507146040Stjr} 1508146040Stjr 1509146040Stjr/* Search for the state whose node_set is equivalent to NODES and 1510146040Stjr whose context is equivalent to CONTEXT. 1511146040Stjr Return the pointer to the state, if we found it in the DFA. 1512146040Stjr Otherwise create the new one and return it. In case of an error 1513146040Stjr return NULL and set the error code in ERR. 1514146040Stjr Note: - We assume NULL as the invalid state, then it is possible that 1515146040Stjr return value is NULL and ERR is REG_NOERROR. 1516146040Stjr - We never return non-NULL value in case of any errors, it is for 1517146040Stjr optimization. */ 1518146040Stjr 1519250724Sjkimstatic re_dfastate_t * 1520250724Sjkiminternal_function __attribute_warn_unused_result__ 1521250724Sjkimre_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa, 1522250724Sjkim const re_node_set *nodes, unsigned int context) 1523146040Stjr{ 1524146040Stjr unsigned int hash; 1525146040Stjr re_dfastate_t *new_state; 1526146040Stjr struct re_state_table_entry *spot; 1527146040Stjr int i; 1528146040Stjr if (nodes->nelem == 0) 1529146040Stjr { 1530146040Stjr *err = REG_NOERROR; 1531146040Stjr return NULL; 1532146040Stjr } 1533146040Stjr hash = calc_state_hash (nodes, context); 1534146040Stjr spot = dfa->state_table + (hash & dfa->state_hash_mask); 1535146040Stjr 1536146040Stjr for (i = 0 ; i < spot->num ; i++) 1537146040Stjr { 1538146040Stjr re_dfastate_t *state = spot->array[i]; 1539146040Stjr if (state->hash == hash 1540146040Stjr && state->context == context 1541146040Stjr && re_node_set_compare (state->entrance_nodes, nodes)) 1542146040Stjr return state; 1543146040Stjr } 1544146040Stjr /* There are no appropriate state in `dfa', create the new one. */ 1545146040Stjr new_state = create_cd_newstate (dfa, nodes, context, hash); 1546250724Sjkim if (BE (new_state == NULL, 0)) 1547250724Sjkim *err = REG_ESPACE; 1548250724Sjkim 1549250724Sjkim return new_state; 1550146040Stjr} 1551146040Stjr 1552146040Stjr/* Finish initialization of the new state NEWSTATE, and using its hash value 1553146040Stjr HASH put in the appropriate bucket of DFA's state table. Return value 1554146040Stjr indicates the error code if failed. */ 1555146040Stjr 1556146040Stjrstatic reg_errcode_t 1557250724Sjkim__attribute_warn_unused_result__ 1558250724Sjkimregister_state (const re_dfa_t *dfa, re_dfastate_t *newstate, 1559250724Sjkim unsigned int hash) 1560146040Stjr{ 1561146040Stjr struct re_state_table_entry *spot; 1562146040Stjr reg_errcode_t err; 1563146040Stjr int i; 1564146040Stjr 1565146040Stjr newstate->hash = hash; 1566146040Stjr err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem); 1567146040Stjr if (BE (err != REG_NOERROR, 0)) 1568146040Stjr return REG_ESPACE; 1569146040Stjr for (i = 0; i < newstate->nodes.nelem; i++) 1570146040Stjr { 1571146040Stjr int elem = newstate->nodes.elems[i]; 1572146040Stjr if (!IS_EPSILON_NODE (dfa->nodes[elem].type)) 1573250724Sjkim if (re_node_set_insert_last (&newstate->non_eps_nodes, elem) < 0) 1574250724Sjkim return REG_ESPACE; 1575146040Stjr } 1576146040Stjr 1577146040Stjr spot = dfa->state_table + (hash & dfa->state_hash_mask); 1578146040Stjr if (BE (spot->alloc <= spot->num, 0)) 1579146040Stjr { 1580146040Stjr int new_alloc = 2 * spot->num + 2; 1581146040Stjr re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *, 1582146040Stjr new_alloc); 1583146040Stjr if (BE (new_array == NULL, 0)) 1584146040Stjr return REG_ESPACE; 1585146040Stjr spot->array = new_array; 1586146040Stjr spot->alloc = new_alloc; 1587146040Stjr } 1588146040Stjr spot->array[spot->num++] = newstate; 1589146040Stjr return REG_NOERROR; 1590146040Stjr} 1591146040Stjr 1592250724Sjkimstatic void 1593250724Sjkimfree_state (re_dfastate_t *state) 1594250724Sjkim{ 1595250724Sjkim re_node_set_free (&state->non_eps_nodes); 1596250724Sjkim re_node_set_free (&state->inveclosure); 1597250724Sjkim if (state->entrance_nodes != &state->nodes) 1598250724Sjkim { 1599250724Sjkim re_node_set_free (state->entrance_nodes); 1600250724Sjkim re_free (state->entrance_nodes); 1601250724Sjkim } 1602250724Sjkim re_node_set_free (&state->nodes); 1603250724Sjkim re_free (state->word_trtable); 1604250724Sjkim re_free (state->trtable); 1605250724Sjkim re_free (state); 1606250724Sjkim} 1607250724Sjkim 1608146040Stjr/* Create the new state which is independ of contexts. 1609146040Stjr Return the new state if succeeded, otherwise return NULL. */ 1610146040Stjr 1611146040Stjrstatic re_dfastate_t * 1612250724Sjkiminternal_function __attribute_warn_unused_result__ 1613250724Sjkimcreate_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes, 1614250724Sjkim unsigned int hash) 1615146040Stjr{ 1616146040Stjr int i; 1617146040Stjr reg_errcode_t err; 1618146040Stjr re_dfastate_t *newstate; 1619146040Stjr 1620146040Stjr newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1); 1621146040Stjr if (BE (newstate == NULL, 0)) 1622146040Stjr return NULL; 1623146040Stjr err = re_node_set_init_copy (&newstate->nodes, nodes); 1624146040Stjr if (BE (err != REG_NOERROR, 0)) 1625146040Stjr { 1626146040Stjr re_free (newstate); 1627146040Stjr return NULL; 1628146040Stjr } 1629146040Stjr 1630146040Stjr newstate->entrance_nodes = &newstate->nodes; 1631146040Stjr for (i = 0 ; i < nodes->nelem ; i++) 1632146040Stjr { 1633146040Stjr re_token_t *node = dfa->nodes + nodes->elems[i]; 1634146040Stjr re_token_type_t type = node->type; 1635146040Stjr if (type == CHARACTER && !node->constraint) 1636146040Stjr continue; 1637146040Stjr#ifdef RE_ENABLE_I18N 1638146040Stjr newstate->accept_mb |= node->accept_mb; 1639146040Stjr#endif /* RE_ENABLE_I18N */ 1640146040Stjr 1641146040Stjr /* If the state has the halt node, the state is a halt state. */ 1642146040Stjr if (type == END_OF_RE) 1643146040Stjr newstate->halt = 1; 1644146040Stjr else if (type == OP_BACK_REF) 1645146040Stjr newstate->has_backref = 1; 1646146040Stjr else if (type == ANCHOR || node->constraint) 1647146040Stjr newstate->has_constraint = 1; 1648146040Stjr } 1649146040Stjr err = register_state (dfa, newstate, hash); 1650146040Stjr if (BE (err != REG_NOERROR, 0)) 1651146040Stjr { 1652146040Stjr free_state (newstate); 1653146040Stjr newstate = NULL; 1654146040Stjr } 1655146040Stjr return newstate; 1656146040Stjr} 1657146040Stjr 1658146040Stjr/* Create the new state which is depend on the context CONTEXT. 1659146040Stjr Return the new state if succeeded, otherwise return NULL. */ 1660146040Stjr 1661146040Stjrstatic re_dfastate_t * 1662250724Sjkiminternal_function __attribute_warn_unused_result__ 1663250724Sjkimcreate_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes, 1664250724Sjkim unsigned int context, unsigned int hash) 1665146040Stjr{ 1666146040Stjr int i, nctx_nodes = 0; 1667146040Stjr reg_errcode_t err; 1668146040Stjr re_dfastate_t *newstate; 1669146040Stjr 1670146040Stjr newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1); 1671146040Stjr if (BE (newstate == NULL, 0)) 1672146040Stjr return NULL; 1673146040Stjr err = re_node_set_init_copy (&newstate->nodes, nodes); 1674146040Stjr if (BE (err != REG_NOERROR, 0)) 1675146040Stjr { 1676146040Stjr re_free (newstate); 1677146040Stjr return NULL; 1678146040Stjr } 1679146040Stjr 1680146040Stjr newstate->context = context; 1681146040Stjr newstate->entrance_nodes = &newstate->nodes; 1682146040Stjr 1683146040Stjr for (i = 0 ; i < nodes->nelem ; i++) 1684146040Stjr { 1685146040Stjr re_token_t *node = dfa->nodes + nodes->elems[i]; 1686146040Stjr re_token_type_t type = node->type; 1687250724Sjkim unsigned int constraint = node->constraint; 1688146040Stjr 1689146040Stjr if (type == CHARACTER && !constraint) 1690146040Stjr continue; 1691146040Stjr#ifdef RE_ENABLE_I18N 1692146040Stjr newstate->accept_mb |= node->accept_mb; 1693146040Stjr#endif /* RE_ENABLE_I18N */ 1694146040Stjr 1695146040Stjr /* If the state has the halt node, the state is a halt state. */ 1696146040Stjr if (type == END_OF_RE) 1697146040Stjr newstate->halt = 1; 1698146040Stjr else if (type == OP_BACK_REF) 1699146040Stjr newstate->has_backref = 1; 1700146040Stjr 1701146040Stjr if (constraint) 1702146040Stjr { 1703146040Stjr if (newstate->entrance_nodes == &newstate->nodes) 1704146040Stjr { 1705146040Stjr newstate->entrance_nodes = re_malloc (re_node_set, 1); 1706146040Stjr if (BE (newstate->entrance_nodes == NULL, 0)) 1707146040Stjr { 1708146040Stjr free_state (newstate); 1709146040Stjr return NULL; 1710146040Stjr } 1711250724Sjkim if (re_node_set_init_copy (newstate->entrance_nodes, nodes) 1712250724Sjkim != REG_NOERROR) 1713250724Sjkim return NULL; 1714146040Stjr nctx_nodes = 0; 1715146040Stjr newstate->has_constraint = 1; 1716146040Stjr } 1717146040Stjr 1718146040Stjr if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context)) 1719146040Stjr { 1720146040Stjr re_node_set_remove_at (&newstate->nodes, i - nctx_nodes); 1721146040Stjr ++nctx_nodes; 1722146040Stjr } 1723146040Stjr } 1724146040Stjr } 1725146040Stjr err = register_state (dfa, newstate, hash); 1726146040Stjr if (BE (err != REG_NOERROR, 0)) 1727146040Stjr { 1728146040Stjr free_state (newstate); 1729146040Stjr newstate = NULL; 1730146040Stjr } 1731146040Stjr return newstate; 1732146040Stjr} 1733