1/* Extended regular expression matching and search library. 2 Copyright (C) 2002-2005,2007,2009,2010,2011 Free Software Foundation, Inc. 3 This file is part of the GNU C Library. 4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. 5 6 The GNU C Library is free software; you can redistribute it and/or 7 modify it under the terms of the GNU Lesser General Public 8 License as published by the Free Software Foundation; either 9 version 2.1 of the License, or (at your option) any later version. 10 11 The GNU C Library is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 Lesser General Public License for more details. 15 16 You should have received a copy of the GNU Lesser General Public 17 License along with the GNU C Library; if not, see 18 <http://www.gnu.org/licenses/>. */ 19 20static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags, 21 int n) internal_function; 22static void match_ctx_clean (re_match_context_t *mctx) internal_function; 23static void match_ctx_free (re_match_context_t *cache) internal_function; 24static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node, 25 int str_idx, int from, int to) 26 internal_function; 27static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx) 28 internal_function; 29static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node, 30 int str_idx) internal_function; 31static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop, 32 int node, int str_idx) 33 internal_function; 34static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts, 35 re_dfastate_t **limited_sts, int last_node, 36 int last_str_idx) 37 internal_function; 38static reg_errcode_t re_search_internal (const regex_t *preg, 39 const char *string, int length, 40 int start, int range, int stop, 41 size_t nmatch, regmatch_t pmatch[], 42 int eflags) internal_function; 43static int re_search_2_stub (struct re_pattern_buffer *bufp, 44 const char *string1, int length1, 45 const char *string2, int length2, 46 int start, int range, struct re_registers *regs, 47 int stop, int ret_len) internal_function; 48static int re_search_stub (struct re_pattern_buffer *bufp, 49 const char *string, int length, int start, 50 int range, int stop, struct re_registers *regs, 51 int ret_len) internal_function; 52static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, 53 int nregs, int regs_allocated) internal_function; 54static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx) 55 internal_function; 56static int check_matching (re_match_context_t *mctx, int fl_longest_match, 57 int *p_match_first) internal_function; 58static int check_halt_state_context (const re_match_context_t *mctx, 59 const re_dfastate_t *state, int idx) 60 internal_function; 61static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch, 62 regmatch_t *prev_idx_match, int cur_node, 63 int cur_idx, int nmatch) internal_function; 64static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs, 65 int str_idx, int dest_node, int nregs, 66 regmatch_t *regs, 67 re_node_set *eps_via_nodes) 68 internal_function; 69static reg_errcode_t set_regs (const regex_t *preg, 70 const re_match_context_t *mctx, 71 size_t nmatch, regmatch_t *pmatch, 72 int fl_backtrack) internal_function; 73static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs) 74 internal_function; 75 76#ifdef RE_ENABLE_I18N 77static int sift_states_iter_mb (const re_match_context_t *mctx, 78 re_sift_context_t *sctx, 79 int node_idx, int str_idx, int max_str_idx) 80 internal_function; 81#endif /* RE_ENABLE_I18N */ 82static reg_errcode_t sift_states_backward (const re_match_context_t *mctx, 83 re_sift_context_t *sctx) 84 internal_function; 85static reg_errcode_t build_sifted_states (const re_match_context_t *mctx, 86 re_sift_context_t *sctx, int str_idx, 87 re_node_set *cur_dest) 88 internal_function; 89static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx, 90 re_sift_context_t *sctx, 91 int str_idx, 92 re_node_set *dest_nodes) 93 internal_function; 94static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa, 95 re_node_set *dest_nodes, 96 const re_node_set *candidates) 97 internal_function; 98static int check_dst_limits (const re_match_context_t *mctx, 99 re_node_set *limits, 100 int dst_node, int dst_idx, int src_node, 101 int src_idx) internal_function; 102static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, 103 int boundaries, int subexp_idx, 104 int from_node, int bkref_idx) 105 internal_function; 106static int check_dst_limits_calc_pos (const re_match_context_t *mctx, 107 int limit, int subexp_idx, 108 int node, int str_idx, 109 int bkref_idx) internal_function; 110static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa, 111 re_node_set *dest_nodes, 112 const re_node_set *candidates, 113 re_node_set *limits, 114 struct re_backref_cache_entry *bkref_ents, 115 int str_idx) internal_function; 116static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx, 117 re_sift_context_t *sctx, 118 int str_idx, const re_node_set *candidates) 119 internal_function; 120static reg_errcode_t merge_state_array (const re_dfa_t *dfa, 121 re_dfastate_t **dst, 122 re_dfastate_t **src, int num) 123 internal_function; 124static re_dfastate_t *find_recover_state (reg_errcode_t *err, 125 re_match_context_t *mctx) internal_function; 126static re_dfastate_t *transit_state (reg_errcode_t *err, 127 re_match_context_t *mctx, 128 re_dfastate_t *state) internal_function; 129static re_dfastate_t *merge_state_with_log (reg_errcode_t *err, 130 re_match_context_t *mctx, 131 re_dfastate_t *next_state) 132 internal_function; 133static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx, 134 re_node_set *cur_nodes, 135 int str_idx) internal_function; 136#if 0 137static re_dfastate_t *transit_state_sb (reg_errcode_t *err, 138 re_match_context_t *mctx, 139 re_dfastate_t *pstate) 140 internal_function; 141#endif 142#ifdef RE_ENABLE_I18N 143static reg_errcode_t transit_state_mb (re_match_context_t *mctx, 144 re_dfastate_t *pstate) 145 internal_function; 146#endif /* RE_ENABLE_I18N */ 147static reg_errcode_t transit_state_bkref (re_match_context_t *mctx, 148 const re_node_set *nodes) 149 internal_function; 150static reg_errcode_t get_subexp (re_match_context_t *mctx, 151 int bkref_node, int bkref_str_idx) 152 internal_function; 153static reg_errcode_t get_subexp_sub (re_match_context_t *mctx, 154 const re_sub_match_top_t *sub_top, 155 re_sub_match_last_t *sub_last, 156 int bkref_node, int bkref_str) 157 internal_function; 158static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes, 159 int subexp_idx, int type) internal_function; 160static reg_errcode_t check_arrival (re_match_context_t *mctx, 161 state_array_t *path, int top_node, 162 int top_str, int last_node, int last_str, 163 int type) internal_function; 164static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx, 165 int str_idx, 166 re_node_set *cur_nodes, 167 re_node_set *next_nodes) 168 internal_function; 169static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa, 170 re_node_set *cur_nodes, 171 int ex_subexp, int type) 172 internal_function; 173static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa, 174 re_node_set *dst_nodes, 175 int target, int ex_subexp, 176 int type) internal_function; 177static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx, 178 re_node_set *cur_nodes, int cur_str, 179 int subexp_num, int type) 180 internal_function; 181static int build_trtable (const re_dfa_t *dfa, 182 re_dfastate_t *state) internal_function; 183#ifdef RE_ENABLE_I18N 184static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx, 185 const re_string_t *input, int idx) 186 internal_function; 187# ifdef _LIBC 188static unsigned int find_collation_sequence_value (const unsigned char *mbs, 189 size_t name_len) 190 internal_function; 191# endif /* _LIBC */ 192#endif /* RE_ENABLE_I18N */ 193static int group_nodes_into_DFAstates (const re_dfa_t *dfa, 194 const re_dfastate_t *state, 195 re_node_set *states_node, 196 bitset_t *states_ch) internal_function; 197static int check_node_accept (const re_match_context_t *mctx, 198 const re_token_t *node, int idx) 199 internal_function; 200static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len) 201 internal_function; 202 203/* Entry point for POSIX code. */ 204 205/* regexec searches for a given pattern, specified by PREG, in the 206 string STRING. 207 208 If NMATCH is zero or REG_NOSUB was set in the cflags argument to 209 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at 210 least NMATCH elements, and we set them to the offsets of the 211 corresponding matched substrings. 212 213 EFLAGS specifies `execution flags' which affect matching: if 214 REG_NOTBOL is set, then ^ does not match at the beginning of the 215 string; if REG_NOTEOL is set, then $ does not match at the end. 216 217 We return 0 if we find a match and REG_NOMATCH if not. */ 218 219int 220regexec (preg, string, nmatch, pmatch, eflags) 221 const regex_t *__restrict preg; 222 const char *__restrict string; 223 size_t nmatch; 224 regmatch_t pmatch[]; 225 int eflags; 226{ 227 reg_errcode_t err; 228 int start, length; 229 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 230 231 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND)) 232 return REG_BADPAT; 233 234 if (eflags & REG_STARTEND) 235 { 236 start = pmatch[0].rm_so; 237 length = pmatch[0].rm_eo; 238 } 239 else 240 { 241 start = 0; 242 length = strlen (string); 243 } 244 245 __libc_lock_lock (dfa->lock); 246 if (preg->no_sub) 247 err = re_search_internal (preg, string, length, start, length - start, 248 length, 0, NULL, eflags); 249 else 250 err = re_search_internal (preg, string, length, start, length - start, 251 length, nmatch, pmatch, eflags); 252 __libc_lock_unlock (dfa->lock); 253 return err != REG_NOERROR; 254} 255 256#ifdef _LIBC 257# include <shlib-compat.h> 258versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4); 259 260# if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4) 261__typeof__ (__regexec) __compat_regexec; 262 263int 264attribute_compat_text_section 265__compat_regexec (const regex_t *__restrict preg, 266 const char *__restrict string, size_t nmatch, 267 regmatch_t pmatch[], int eflags) 268{ 269 return regexec (preg, string, nmatch, pmatch, 270 eflags & (REG_NOTBOL | REG_NOTEOL)); 271} 272compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0); 273# endif 274#endif 275 276/* Entry points for GNU code. */ 277 278/* re_match, re_search, re_match_2, re_search_2 279 280 The former two functions operate on STRING with length LENGTH, 281 while the later two operate on concatenation of STRING1 and STRING2 282 with lengths LENGTH1 and LENGTH2, respectively. 283 284 re_match() matches the compiled pattern in BUFP against the string, 285 starting at index START. 286 287 re_search() first tries matching at index START, then it tries to match 288 starting from index START + 1, and so on. The last start position tried 289 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same 290 way as re_match().) 291 292 The parameter STOP of re_{match,search}_2 specifies that no match exceeding 293 the first STOP characters of the concatenation of the strings should be 294 concerned. 295 296 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match 297 and all groups is stroed in REGS. (For the "_2" variants, the offsets are 298 computed relative to the concatenation, not relative to the individual 299 strings.) 300 301 On success, re_match* functions return the length of the match, re_search* 302 return the position of the start of the match. Return value -1 means no 303 match was found and -2 indicates an internal error. */ 304 305int 306re_match (bufp, string, length, start, regs) 307 struct re_pattern_buffer *bufp; 308 const char *string; 309 int length, start; 310 struct re_registers *regs; 311{ 312 return re_search_stub (bufp, string, length, start, 0, length, regs, 1); 313} 314#ifdef _LIBC 315weak_alias (__re_match, re_match) 316#endif 317 318int 319re_search (bufp, string, length, start, range, regs) 320 struct re_pattern_buffer *bufp; 321 const char *string; 322 int length, start, range; 323 struct re_registers *regs; 324{ 325 return re_search_stub (bufp, string, length, start, range, length, regs, 0); 326} 327#ifdef _LIBC 328weak_alias (__re_search, re_search) 329#endif 330 331int 332re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop) 333 struct re_pattern_buffer *bufp; 334 const char *string1, *string2; 335 int length1, length2, start, stop; 336 struct re_registers *regs; 337{ 338 return re_search_2_stub (bufp, string1, length1, string2, length2, 339 start, 0, regs, stop, 1); 340} 341#ifdef _LIBC 342weak_alias (__re_match_2, re_match_2) 343#endif 344 345int 346re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop) 347 struct re_pattern_buffer *bufp; 348 const char *string1, *string2; 349 int length1, length2, start, range, stop; 350 struct re_registers *regs; 351{ 352 return re_search_2_stub (bufp, string1, length1, string2, length2, 353 start, range, regs, stop, 0); 354} 355#ifdef _LIBC 356weak_alias (__re_search_2, re_search_2) 357#endif 358 359static int 360re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs, 361 stop, ret_len) 362 struct re_pattern_buffer *bufp; 363 const char *string1, *string2; 364 int length1, length2, start, range, stop, ret_len; 365 struct re_registers *regs; 366{ 367 const char *str; 368 int rval; 369 int len = length1 + length2; 370 char *s = NULL; 371 372 if (BE (length1 < 0 || length2 < 0 || stop < 0 || len < length1, 0)) 373 return -2; 374 375 /* Concatenate the strings. */ 376 if (length2 > 0) 377 if (length1 > 0) 378 { 379 s = re_malloc (char, len); 380 381 if (BE (s == NULL, 0)) 382 return -2; 383#ifdef _LIBC 384 memcpy (__mempcpy (s, string1, length1), string2, length2); 385#else 386 memcpy (s, string1, length1); 387 memcpy (s + length1, string2, length2); 388#endif 389 str = s; 390 } 391 else 392 str = string2; 393 else 394 str = string1; 395 396 rval = re_search_stub (bufp, str, len, start, range, stop, regs, ret_len); 397 re_free (s); 398 return rval; 399} 400 401/* The parameters have the same meaning as those of re_search. 402 Additional parameters: 403 If RET_LEN is nonzero the length of the match is returned (re_match style); 404 otherwise the position of the match is returned. */ 405 406static int 407re_search_stub (bufp, string, length, start, range, stop, regs, ret_len) 408 struct re_pattern_buffer *bufp; 409 const char *string; 410 int length, start, range, stop, ret_len; 411 struct re_registers *regs; 412{ 413 reg_errcode_t result; 414 regmatch_t *pmatch; 415 int nregs, rval; 416 int eflags = 0; 417 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer; 418 419 /* Check for out-of-range. */ 420 if (BE (start < 0 || start > length, 0)) 421 return -1; 422 if (BE (start + range > length, 0)) 423 range = length - start; 424 else if (BE (start + range < 0, 0)) 425 range = -start; 426 427 __libc_lock_lock (dfa->lock); 428 429 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0; 430 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0; 431 432 /* Compile fastmap if we haven't yet. */ 433 if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate) 434 re_compile_fastmap (bufp); 435 436 if (BE (bufp->no_sub, 0)) 437 regs = NULL; 438 439 /* We need at least 1 register. */ 440 if (regs == NULL) 441 nregs = 1; 442 else if (BE (bufp->regs_allocated == REGS_FIXED && 443 regs->num_regs < bufp->re_nsub + 1, 0)) 444 { 445 nregs = regs->num_regs; 446 if (BE (nregs < 1, 0)) 447 { 448 /* Nothing can be copied to regs. */ 449 regs = NULL; 450 nregs = 1; 451 } 452 } 453 else 454 nregs = bufp->re_nsub + 1; 455 pmatch = re_malloc (regmatch_t, nregs); 456 if (BE (pmatch == NULL, 0)) 457 { 458 rval = -2; 459 goto out; 460 } 461 462 result = re_search_internal (bufp, string, length, start, range, stop, 463 nregs, pmatch, eflags); 464 465 rval = 0; 466 467 /* I hope we needn't fill ther regs with -1's when no match was found. */ 468 if (result != REG_NOERROR) 469 rval = -1; 470 else if (regs != NULL) 471 { 472 /* If caller wants register contents data back, copy them. */ 473 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs, 474 bufp->regs_allocated); 475 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0)) 476 rval = -2; 477 } 478 479 if (BE (rval == 0, 1)) 480 { 481 if (ret_len) 482 { 483 assert (pmatch[0].rm_so == start); 484 rval = pmatch[0].rm_eo - start; 485 } 486 else 487 rval = pmatch[0].rm_so; 488 } 489 re_free (pmatch); 490 out: 491 __libc_lock_unlock (dfa->lock); 492 return rval; 493} 494 495static unsigned 496re_copy_regs (regs, pmatch, nregs, regs_allocated) 497 struct re_registers *regs; 498 regmatch_t *pmatch; 499 int nregs, regs_allocated; 500{ 501 int rval = REGS_REALLOCATE; 502 int i; 503 int need_regs = nregs + 1; 504 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code 505 uses. */ 506 507 /* Have the register data arrays been allocated? */ 508 if (regs_allocated == REGS_UNALLOCATED) 509 { /* No. So allocate them with malloc. */ 510 regs->start = re_malloc (regoff_t, need_regs); 511 if (BE (regs->start == NULL, 0)) 512 return REGS_UNALLOCATED; 513 regs->end = re_malloc (regoff_t, need_regs); 514 if (BE (regs->end == NULL, 0)) 515 { 516 re_free (regs->start); 517 return REGS_UNALLOCATED; 518 } 519 regs->num_regs = need_regs; 520 } 521 else if (regs_allocated == REGS_REALLOCATE) 522 { /* Yes. If we need more elements than were already 523 allocated, reallocate them. If we need fewer, just 524 leave it alone. */ 525 if (BE (need_regs > regs->num_regs, 0)) 526 { 527 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs); 528 regoff_t *new_end; 529 if (BE (new_start == NULL, 0)) 530 return REGS_UNALLOCATED; 531 new_end = re_realloc (regs->end, regoff_t, need_regs); 532 if (BE (new_end == NULL, 0)) 533 { 534 re_free (new_start); 535 return REGS_UNALLOCATED; 536 } 537 regs->start = new_start; 538 regs->end = new_end; 539 regs->num_regs = need_regs; 540 } 541 } 542 else 543 { 544 assert (regs_allocated == REGS_FIXED); 545 /* This function may not be called with REGS_FIXED and nregs too big. */ 546 assert (regs->num_regs >= nregs); 547 rval = REGS_FIXED; 548 } 549 550 /* Copy the regs. */ 551 for (i = 0; i < nregs; ++i) 552 { 553 regs->start[i] = pmatch[i].rm_so; 554 regs->end[i] = pmatch[i].rm_eo; 555 } 556 for ( ; i < regs->num_regs; ++i) 557 regs->start[i] = regs->end[i] = -1; 558 559 return rval; 560} 561 562/* Set REGS to hold NUM_REGS registers, storing them in STARTS and 563 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use 564 this memory for recording register information. STARTS and ENDS 565 must be allocated using the malloc library routine, and must each 566 be at least NUM_REGS * sizeof (regoff_t) bytes long. 567 568 If NUM_REGS == 0, then subsequent matches should allocate their own 569 register data. 570 571 Unless this function is called, the first search or match using 572 PATTERN_BUFFER will allocate its own register data, without 573 freeing the old data. */ 574 575void 576re_set_registers (bufp, regs, num_regs, starts, ends) 577 struct re_pattern_buffer *bufp; 578 struct re_registers *regs; 579 unsigned num_regs; 580 regoff_t *starts, *ends; 581{ 582 if (num_regs) 583 { 584 bufp->regs_allocated = REGS_REALLOCATE; 585 regs->num_regs = num_regs; 586 regs->start = starts; 587 regs->end = ends; 588 } 589 else 590 { 591 bufp->regs_allocated = REGS_UNALLOCATED; 592 regs->num_regs = 0; 593 regs->start = regs->end = (regoff_t *) 0; 594 } 595} 596#ifdef _LIBC 597weak_alias (__re_set_registers, re_set_registers) 598#endif 599 600/* Entry points compatible with 4.2 BSD regex library. We don't define 601 them unless specifically requested. */ 602 603#if defined _REGEX_RE_COMP || defined _LIBC 604int 605# ifdef _LIBC 606weak_function 607# endif 608re_exec (s) 609 const char *s; 610{ 611 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0); 612} 613#endif /* _REGEX_RE_COMP */ 614 615/* Internal entry point. */ 616 617/* Searches for a compiled pattern PREG in the string STRING, whose 618 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same 619 mingings with regexec. START, and RANGE have the same meanings 620 with re_search. 621 Return REG_NOERROR if we find a match, and REG_NOMATCH if not, 622 otherwise return the error code. 623 Note: We assume front end functions already check ranges. 624 (START + RANGE >= 0 && START + RANGE <= LENGTH) */ 625 626static reg_errcode_t 627__attribute_warn_unused_result__ 628re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch, 629 eflags) 630 const regex_t *preg; 631 const char *string; 632 int length, start, range, stop, eflags; 633 size_t nmatch; 634 regmatch_t pmatch[]; 635{ 636 reg_errcode_t err; 637 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer; 638 int left_lim, right_lim, incr; 639 int fl_longest_match, match_first, match_kind, match_last = -1; 640 int extra_nmatch; 641 int sb, ch; 642#if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L) 643 re_match_context_t mctx = { .dfa = dfa }; 644#else 645 re_match_context_t mctx; 646#endif 647 char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate 648 && range && !preg->can_be_null) ? preg->fastmap : NULL; 649 RE_TRANSLATE_TYPE t = preg->translate; 650 651#if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)) 652 memset (&mctx, '\0', sizeof (re_match_context_t)); 653 mctx.dfa = dfa; 654#endif 655 656 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0; 657 nmatch -= extra_nmatch; 658 659 /* Check if the DFA haven't been compiled. */ 660 if (BE (preg->used == 0 || dfa->init_state == NULL 661 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL 662 || dfa->init_state_begbuf == NULL, 0)) 663 return REG_NOMATCH; 664 665#ifdef DEBUG 666 /* We assume front-end functions already check them. */ 667 assert (start + range >= 0 && start + range <= length); 668#endif 669 670 /* If initial states with non-begbuf contexts have no elements, 671 the regex must be anchored. If preg->newline_anchor is set, 672 we'll never use init_state_nl, so do not check it. */ 673 if (dfa->init_state->nodes.nelem == 0 674 && dfa->init_state_word->nodes.nelem == 0 675 && (dfa->init_state_nl->nodes.nelem == 0 676 || !preg->newline_anchor)) 677 { 678 if (start != 0 && start + range != 0) 679 return REG_NOMATCH; 680 start = range = 0; 681 } 682 683 /* We must check the longest matching, if nmatch > 0. */ 684 fl_longest_match = (nmatch != 0 || dfa->nbackref); 685 686 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1, 687 preg->translate, preg->syntax & RE_ICASE, dfa); 688 if (BE (err != REG_NOERROR, 0)) 689 goto free_return; 690 mctx.input.stop = stop; 691 mctx.input.raw_stop = stop; 692 mctx.input.newline_anchor = preg->newline_anchor; 693 694 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2); 695 if (BE (err != REG_NOERROR, 0)) 696 goto free_return; 697 698 /* We will log all the DFA states through which the dfa pass, 699 if nmatch > 1, or this dfa has "multibyte node", which is a 700 back-reference or a node which can accept multibyte character or 701 multi character collating element. */ 702 if (nmatch > 1 || dfa->has_mb_node) 703 { 704 /* Avoid overflow. */ 705 if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= mctx.input.bufs_len, 0)) 706 { 707 err = REG_ESPACE; 708 goto free_return; 709 } 710 711 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1); 712 if (BE (mctx.state_log == NULL, 0)) 713 { 714 err = REG_ESPACE; 715 goto free_return; 716 } 717 } 718 else 719 mctx.state_log = NULL; 720 721 match_first = start; 722 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF 723 : CONTEXT_NEWLINE | CONTEXT_BEGBUF; 724 725 /* Check incrementally whether of not the input string match. */ 726 incr = (range < 0) ? -1 : 1; 727 left_lim = (range < 0) ? start + range : start; 728 right_lim = (range < 0) ? start : start + range; 729 sb = dfa->mb_cur_max == 1; 730 match_kind = 731 (fastmap 732 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0) 733 | (range >= 0 ? 2 : 0) 734 | (t != NULL ? 1 : 0)) 735 : 8); 736 737 for (;; match_first += incr) 738 { 739 err = REG_NOMATCH; 740 if (match_first < left_lim || right_lim < match_first) 741 goto free_return; 742 743 /* Advance as rapidly as possible through the string, until we 744 find a plausible place to start matching. This may be done 745 with varying efficiency, so there are various possibilities: 746 only the most common of them are specialized, in order to 747 save on code size. We use a switch statement for speed. */ 748 switch (match_kind) 749 { 750 case 8: 751 /* No fastmap. */ 752 break; 753 754 case 7: 755 /* Fastmap with single-byte translation, match forward. */ 756 while (BE (match_first < right_lim, 1) 757 && !fastmap[t[(unsigned char) string[match_first]]]) 758 ++match_first; 759 goto forward_match_found_start_or_reached_end; 760 761 case 6: 762 /* Fastmap without translation, match forward. */ 763 while (BE (match_first < right_lim, 1) 764 && !fastmap[(unsigned char) string[match_first]]) 765 ++match_first; 766 767 forward_match_found_start_or_reached_end: 768 if (BE (match_first == right_lim, 0)) 769 { 770 ch = match_first >= length 771 ? 0 : (unsigned char) string[match_first]; 772 if (!fastmap[t ? t[ch] : ch]) 773 goto free_return; 774 } 775 break; 776 777 case 4: 778 case 5: 779 /* Fastmap without multi-byte translation, match backwards. */ 780 while (match_first >= left_lim) 781 { 782 ch = match_first >= length 783 ? 0 : (unsigned char) string[match_first]; 784 if (fastmap[t ? t[ch] : ch]) 785 break; 786 --match_first; 787 } 788 if (match_first < left_lim) 789 goto free_return; 790 break; 791 792 default: 793 /* In this case, we can't determine easily the current byte, 794 since it might be a component byte of a multibyte 795 character. Then we use the constructed buffer instead. */ 796 for (;;) 797 { 798 /* If MATCH_FIRST is out of the valid range, reconstruct the 799 buffers. */ 800 unsigned int offset = match_first - mctx.input.raw_mbs_idx; 801 if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0)) 802 { 803 err = re_string_reconstruct (&mctx.input, match_first, 804 eflags); 805 if (BE (err != REG_NOERROR, 0)) 806 goto free_return; 807 808 offset = match_first - mctx.input.raw_mbs_idx; 809 } 810 /* If MATCH_FIRST is out of the buffer, leave it as '\0'. 811 Note that MATCH_FIRST must not be smaller than 0. */ 812 ch = (match_first >= length 813 ? 0 : re_string_byte_at (&mctx.input, offset)); 814 if (fastmap[ch]) 815 break; 816 match_first += incr; 817 if (match_first < left_lim || match_first > right_lim) 818 { 819 err = REG_NOMATCH; 820 goto free_return; 821 } 822 } 823 break; 824 } 825 826 /* Reconstruct the buffers so that the matcher can assume that 827 the matching starts from the beginning of the buffer. */ 828 err = re_string_reconstruct (&mctx.input, match_first, eflags); 829 if (BE (err != REG_NOERROR, 0)) 830 goto free_return; 831 832#ifdef RE_ENABLE_I18N 833 /* Don't consider this char as a possible match start if it part, 834 yet isn't the head, of a multibyte character. */ 835 if (!sb && !re_string_first_byte (&mctx.input, 0)) 836 continue; 837#endif 838 839 /* It seems to be appropriate one, then use the matcher. */ 840 /* We assume that the matching starts from 0. */ 841 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0; 842 match_last = check_matching (&mctx, fl_longest_match, 843 range >= 0 ? &match_first : NULL); 844 if (match_last != -1) 845 { 846 if (BE (match_last == -2, 0)) 847 { 848 err = REG_ESPACE; 849 goto free_return; 850 } 851 else 852 { 853 mctx.match_last = match_last; 854 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref) 855 { 856 re_dfastate_t *pstate = mctx.state_log[match_last]; 857 mctx.last_node = check_halt_state_context (&mctx, pstate, 858 match_last); 859 } 860 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match) 861 || dfa->nbackref) 862 { 863 err = prune_impossible_nodes (&mctx); 864 if (err == REG_NOERROR) 865 break; 866 if (BE (err != REG_NOMATCH, 0)) 867 goto free_return; 868 match_last = -1; 869 } 870 else 871 break; /* We found a match. */ 872 } 873 } 874 875 match_ctx_clean (&mctx); 876 } 877 878#ifdef DEBUG 879 assert (match_last != -1); 880 assert (err == REG_NOERROR); 881#endif 882 883 /* Set pmatch[] if we need. */ 884 if (nmatch > 0) 885 { 886 int reg_idx; 887 888 /* Initialize registers. */ 889 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx) 890 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1; 891 892 /* Set the points where matching start/end. */ 893 pmatch[0].rm_so = 0; 894 pmatch[0].rm_eo = mctx.match_last; 895 896 if (!preg->no_sub && nmatch > 1) 897 { 898 err = set_regs (preg, &mctx, nmatch, pmatch, 899 dfa->has_plural_match && dfa->nbackref > 0); 900 if (BE (err != REG_NOERROR, 0)) 901 goto free_return; 902 } 903 904 /* At last, add the offset to the each registers, since we slided 905 the buffers so that we could assume that the matching starts 906 from 0. */ 907 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx) 908 if (pmatch[reg_idx].rm_so != -1) 909 { 910#ifdef RE_ENABLE_I18N 911 if (BE (mctx.input.offsets_needed != 0, 0)) 912 { 913 pmatch[reg_idx].rm_so = 914 (pmatch[reg_idx].rm_so == mctx.input.valid_len 915 ? mctx.input.valid_raw_len 916 : mctx.input.offsets[pmatch[reg_idx].rm_so]); 917 pmatch[reg_idx].rm_eo = 918 (pmatch[reg_idx].rm_eo == mctx.input.valid_len 919 ? mctx.input.valid_raw_len 920 : mctx.input.offsets[pmatch[reg_idx].rm_eo]); 921 } 922#else 923 assert (mctx.input.offsets_needed == 0); 924#endif 925 pmatch[reg_idx].rm_so += match_first; 926 pmatch[reg_idx].rm_eo += match_first; 927 } 928 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx) 929 { 930 pmatch[nmatch + reg_idx].rm_so = -1; 931 pmatch[nmatch + reg_idx].rm_eo = -1; 932 } 933 934 if (dfa->subexp_map) 935 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++) 936 if (dfa->subexp_map[reg_idx] != reg_idx) 937 { 938 pmatch[reg_idx + 1].rm_so 939 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so; 940 pmatch[reg_idx + 1].rm_eo 941 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo; 942 } 943 } 944 945 free_return: 946 re_free (mctx.state_log); 947 if (dfa->nbackref) 948 match_ctx_free (&mctx); 949 re_string_destruct (&mctx.input); 950 return err; 951} 952 953static reg_errcode_t 954__attribute_warn_unused_result__ 955prune_impossible_nodes (mctx) 956 re_match_context_t *mctx; 957{ 958 const re_dfa_t *const dfa = mctx->dfa; 959 int halt_node, match_last; 960 reg_errcode_t ret; 961 re_dfastate_t **sifted_states; 962 re_dfastate_t **lim_states = NULL; 963 re_sift_context_t sctx; 964#ifdef DEBUG 965 assert (mctx->state_log != NULL); 966#endif 967 match_last = mctx->match_last; 968 halt_node = mctx->last_node; 969 970 /* Avoid overflow. */ 971 if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= match_last, 0)) 972 return REG_ESPACE; 973 974 sifted_states = re_malloc (re_dfastate_t *, match_last + 1); 975 if (BE (sifted_states == NULL, 0)) 976 { 977 ret = REG_ESPACE; 978 goto free_return; 979 } 980 if (dfa->nbackref) 981 { 982 lim_states = re_malloc (re_dfastate_t *, match_last + 1); 983 if (BE (lim_states == NULL, 0)) 984 { 985 ret = REG_ESPACE; 986 goto free_return; 987 } 988 while (1) 989 { 990 memset (lim_states, '\0', 991 sizeof (re_dfastate_t *) * (match_last + 1)); 992 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, 993 match_last); 994 ret = sift_states_backward (mctx, &sctx); 995 re_node_set_free (&sctx.limits); 996 if (BE (ret != REG_NOERROR, 0)) 997 goto free_return; 998 if (sifted_states[0] != NULL || lim_states[0] != NULL) 999 break; 1000 do 1001 { 1002 --match_last; 1003 if (match_last < 0) 1004 { 1005 ret = REG_NOMATCH; 1006 goto free_return; 1007 } 1008 } while (mctx->state_log[match_last] == NULL 1009 || !mctx->state_log[match_last]->halt); 1010 halt_node = check_halt_state_context (mctx, 1011 mctx->state_log[match_last], 1012 match_last); 1013 } 1014 ret = merge_state_array (dfa, sifted_states, lim_states, 1015 match_last + 1); 1016 re_free (lim_states); 1017 lim_states = NULL; 1018 if (BE (ret != REG_NOERROR, 0)) 1019 goto free_return; 1020 } 1021 else 1022 { 1023 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last); 1024 ret = sift_states_backward (mctx, &sctx); 1025 re_node_set_free (&sctx.limits); 1026 if (BE (ret != REG_NOERROR, 0)) 1027 goto free_return; 1028 if (sifted_states[0] == NULL) 1029 { 1030 ret = REG_NOMATCH; 1031 goto free_return; 1032 } 1033 } 1034 re_free (mctx->state_log); 1035 mctx->state_log = sifted_states; 1036 sifted_states = NULL; 1037 mctx->last_node = halt_node; 1038 mctx->match_last = match_last; 1039 ret = REG_NOERROR; 1040 free_return: 1041 re_free (sifted_states); 1042 re_free (lim_states); 1043 return ret; 1044} 1045 1046/* Acquire an initial state and return it. 1047 We must select appropriate initial state depending on the context, 1048 since initial states may have constraints like "\<", "^", etc.. */ 1049 1050static inline re_dfastate_t * 1051__attribute ((always_inline)) internal_function 1052acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx, 1053 int idx) 1054{ 1055 const re_dfa_t *const dfa = mctx->dfa; 1056 if (dfa->init_state->has_constraint) 1057 { 1058 unsigned int context; 1059 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags); 1060 if (IS_WORD_CONTEXT (context)) 1061 return dfa->init_state_word; 1062 else if (IS_ORDINARY_CONTEXT (context)) 1063 return dfa->init_state; 1064 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context)) 1065 return dfa->init_state_begbuf; 1066 else if (IS_NEWLINE_CONTEXT (context)) 1067 return dfa->init_state_nl; 1068 else if (IS_BEGBUF_CONTEXT (context)) 1069 { 1070 /* It is relatively rare case, then calculate on demand. */ 1071 return re_acquire_state_context (err, dfa, 1072 dfa->init_state->entrance_nodes, 1073 context); 1074 } 1075 else 1076 /* Must not happen? */ 1077 return dfa->init_state; 1078 } 1079 else 1080 return dfa->init_state; 1081} 1082 1083/* Check whether the regular expression match input string INPUT or not, 1084 and return the index where the matching end, return -1 if not match, 1085 or return -2 in case of an error. 1086 FL_LONGEST_MATCH means we want the POSIX longest matching. 1087 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the 1088 next place where we may want to try matching. 1089 Note that the matcher assume that the maching starts from the current 1090 index of the buffer. */ 1091 1092static int 1093internal_function __attribute_warn_unused_result__ 1094check_matching (re_match_context_t *mctx, int fl_longest_match, 1095 int *p_match_first) 1096{ 1097 const re_dfa_t *const dfa = mctx->dfa; 1098 reg_errcode_t err; 1099 int match = 0; 1100 int match_last = -1; 1101 int cur_str_idx = re_string_cur_idx (&mctx->input); 1102 re_dfastate_t *cur_state; 1103 int at_init_state = p_match_first != NULL; 1104 int next_start_idx = cur_str_idx; 1105 1106 err = REG_NOERROR; 1107 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx); 1108 /* An initial state must not be NULL (invalid). */ 1109 if (BE (cur_state == NULL, 0)) 1110 { 1111 assert (err == REG_ESPACE); 1112 return -2; 1113 } 1114 1115 if (mctx->state_log != NULL) 1116 { 1117 mctx->state_log[cur_str_idx] = cur_state; 1118 1119 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them 1120 later. E.g. Processing back references. */ 1121 if (BE (dfa->nbackref, 0)) 1122 { 1123 at_init_state = 0; 1124 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0); 1125 if (BE (err != REG_NOERROR, 0)) 1126 return err; 1127 1128 if (cur_state->has_backref) 1129 { 1130 err = transit_state_bkref (mctx, &cur_state->nodes); 1131 if (BE (err != REG_NOERROR, 0)) 1132 return err; 1133 } 1134 } 1135 } 1136 1137 /* If the RE accepts NULL string. */ 1138 if (BE (cur_state->halt, 0)) 1139 { 1140 if (!cur_state->has_constraint 1141 || check_halt_state_context (mctx, cur_state, cur_str_idx)) 1142 { 1143 if (!fl_longest_match) 1144 return cur_str_idx; 1145 else 1146 { 1147 match_last = cur_str_idx; 1148 match = 1; 1149 } 1150 } 1151 } 1152 1153 while (!re_string_eoi (&mctx->input)) 1154 { 1155 re_dfastate_t *old_state = cur_state; 1156 int next_char_idx = re_string_cur_idx (&mctx->input) + 1; 1157 1158 if ((BE (next_char_idx >= mctx->input.bufs_len, 0) 1159 && mctx->input.bufs_len < mctx->input.len) 1160 || (BE (next_char_idx >= mctx->input.valid_len, 0) 1161 && mctx->input.valid_len < mctx->input.len)) 1162 { 1163 err = extend_buffers (mctx, next_char_idx + 1); 1164 if (BE (err != REG_NOERROR, 0)) 1165 { 1166 assert (err == REG_ESPACE); 1167 return -2; 1168 } 1169 } 1170 1171 cur_state = transit_state (&err, mctx, cur_state); 1172 if (mctx->state_log != NULL) 1173 cur_state = merge_state_with_log (&err, mctx, cur_state); 1174 1175 if (cur_state == NULL) 1176 { 1177 /* Reached the invalid state or an error. Try to recover a valid 1178 state using the state log, if available and if we have not 1179 already found a valid (even if not the longest) match. */ 1180 if (BE (err != REG_NOERROR, 0)) 1181 return -2; 1182 1183 if (mctx->state_log == NULL 1184 || (match && !fl_longest_match) 1185 || (cur_state = find_recover_state (&err, mctx)) == NULL) 1186 break; 1187 } 1188 1189 if (BE (at_init_state, 0)) 1190 { 1191 if (old_state == cur_state) 1192 next_start_idx = next_char_idx; 1193 else 1194 at_init_state = 0; 1195 } 1196 1197 if (cur_state->halt) 1198 { 1199 /* Reached a halt state. 1200 Check the halt state can satisfy the current context. */ 1201 if (!cur_state->has_constraint 1202 || check_halt_state_context (mctx, cur_state, 1203 re_string_cur_idx (&mctx->input))) 1204 { 1205 /* We found an appropriate halt state. */ 1206 match_last = re_string_cur_idx (&mctx->input); 1207 match = 1; 1208 1209 /* We found a match, do not modify match_first below. */ 1210 p_match_first = NULL; 1211 if (!fl_longest_match) 1212 break; 1213 } 1214 } 1215 } 1216 1217 if (p_match_first) 1218 *p_match_first += next_start_idx; 1219 1220 return match_last; 1221} 1222 1223/* Check NODE match the current context. */ 1224 1225static int 1226internal_function 1227check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context) 1228{ 1229 re_token_type_t type = dfa->nodes[node].type; 1230 unsigned int constraint = dfa->nodes[node].constraint; 1231 if (type != END_OF_RE) 1232 return 0; 1233 if (!constraint) 1234 return 1; 1235 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context)) 1236 return 0; 1237 return 1; 1238} 1239 1240/* Check the halt state STATE match the current context. 1241 Return 0 if not match, if the node, STATE has, is a halt node and 1242 match the context, return the node. */ 1243 1244static int 1245internal_function 1246check_halt_state_context (const re_match_context_t *mctx, 1247 const re_dfastate_t *state, int idx) 1248{ 1249 int i; 1250 unsigned int context; 1251#ifdef DEBUG 1252 assert (state->halt); 1253#endif 1254 context = re_string_context_at (&mctx->input, idx, mctx->eflags); 1255 for (i = 0; i < state->nodes.nelem; ++i) 1256 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context)) 1257 return state->nodes.elems[i]; 1258 return 0; 1259} 1260 1261/* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA 1262 corresponding to the DFA). 1263 Return the destination node, and update EPS_VIA_NODES, return -1 in case 1264 of errors. */ 1265 1266static int 1267internal_function 1268proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs, 1269 int *pidx, int node, re_node_set *eps_via_nodes, 1270 struct re_fail_stack_t *fs) 1271{ 1272 const re_dfa_t *const dfa = mctx->dfa; 1273 int i, err; 1274 if (IS_EPSILON_NODE (dfa->nodes[node].type)) 1275 { 1276 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes; 1277 re_node_set *edests = &dfa->edests[node]; 1278 int dest_node; 1279 err = re_node_set_insert (eps_via_nodes, node); 1280 if (BE (err < 0, 0)) 1281 return -2; 1282 /* Pick up a valid destination, or return -1 if none is found. */ 1283 for (dest_node = -1, i = 0; i < edests->nelem; ++i) 1284 { 1285 int candidate = edests->elems[i]; 1286 if (!re_node_set_contains (cur_nodes, candidate)) 1287 continue; 1288 if (dest_node == -1) 1289 dest_node = candidate; 1290 1291 else 1292 { 1293 /* In order to avoid infinite loop like "(a*)*", return the second 1294 epsilon-transition if the first was already considered. */ 1295 if (re_node_set_contains (eps_via_nodes, dest_node)) 1296 return candidate; 1297 1298 /* Otherwise, push the second epsilon-transition on the fail stack. */ 1299 else if (fs != NULL 1300 && push_fail_stack (fs, *pidx, candidate, nregs, regs, 1301 eps_via_nodes)) 1302 return -2; 1303 1304 /* We know we are going to exit. */ 1305 break; 1306 } 1307 } 1308 return dest_node; 1309 } 1310 else 1311 { 1312 int naccepted = 0; 1313 re_token_type_t type = dfa->nodes[node].type; 1314 1315#ifdef RE_ENABLE_I18N 1316 if (dfa->nodes[node].accept_mb) 1317 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx); 1318 else 1319#endif /* RE_ENABLE_I18N */ 1320 if (type == OP_BACK_REF) 1321 { 1322 int subexp_idx = dfa->nodes[node].opr.idx + 1; 1323 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so; 1324 if (fs != NULL) 1325 { 1326 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1) 1327 return -1; 1328 else if (naccepted) 1329 { 1330 char *buf = (char *) re_string_get_buffer (&mctx->input); 1331 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx, 1332 naccepted) != 0) 1333 return -1; 1334 } 1335 } 1336 1337 if (naccepted == 0) 1338 { 1339 int dest_node; 1340 err = re_node_set_insert (eps_via_nodes, node); 1341 if (BE (err < 0, 0)) 1342 return -2; 1343 dest_node = dfa->edests[node].elems[0]; 1344 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes, 1345 dest_node)) 1346 return dest_node; 1347 } 1348 } 1349 1350 if (naccepted != 0 1351 || check_node_accept (mctx, dfa->nodes + node, *pidx)) 1352 { 1353 int dest_node = dfa->nexts[node]; 1354 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted; 1355 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL 1356 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes, 1357 dest_node))) 1358 return -1; 1359 re_node_set_empty (eps_via_nodes); 1360 return dest_node; 1361 } 1362 } 1363 return -1; 1364} 1365 1366static reg_errcode_t 1367internal_function __attribute_warn_unused_result__ 1368push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node, 1369 int nregs, regmatch_t *regs, re_node_set *eps_via_nodes) 1370{ 1371 reg_errcode_t err; 1372 int num = fs->num++; 1373 if (fs->num == fs->alloc) 1374 { 1375 struct re_fail_stack_ent_t *new_array; 1376 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t) 1377 * fs->alloc * 2)); 1378 if (new_array == NULL) 1379 return REG_ESPACE; 1380 fs->alloc *= 2; 1381 fs->stack = new_array; 1382 } 1383 fs->stack[num].idx = str_idx; 1384 fs->stack[num].node = dest_node; 1385 fs->stack[num].regs = re_malloc (regmatch_t, nregs); 1386 if (fs->stack[num].regs == NULL) 1387 return REG_ESPACE; 1388 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs); 1389 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes); 1390 return err; 1391} 1392 1393static int 1394internal_function 1395pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs, 1396 regmatch_t *regs, re_node_set *eps_via_nodes) 1397{ 1398 int num = --fs->num; 1399 assert (num >= 0); 1400 *pidx = fs->stack[num].idx; 1401 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs); 1402 re_node_set_free (eps_via_nodes); 1403 re_free (fs->stack[num].regs); 1404 *eps_via_nodes = fs->stack[num].eps_via_nodes; 1405 return fs->stack[num].node; 1406} 1407 1408/* Set the positions where the subexpressions are starts/ends to registers 1409 PMATCH. 1410 Note: We assume that pmatch[0] is already set, and 1411 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */ 1412 1413static reg_errcode_t 1414internal_function __attribute_warn_unused_result__ 1415set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch, 1416 regmatch_t *pmatch, int fl_backtrack) 1417{ 1418 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer; 1419 int idx, cur_node; 1420 re_node_set eps_via_nodes; 1421 struct re_fail_stack_t *fs; 1422 struct re_fail_stack_t fs_body = { 0, 2, NULL }; 1423 regmatch_t *prev_idx_match; 1424 int prev_idx_match_malloced = 0; 1425 1426#ifdef DEBUG 1427 assert (nmatch > 1); 1428 assert (mctx->state_log != NULL); 1429#endif 1430 if (fl_backtrack) 1431 { 1432 fs = &fs_body; 1433 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc); 1434 if (fs->stack == NULL) 1435 return REG_ESPACE; 1436 } 1437 else 1438 fs = NULL; 1439 1440 cur_node = dfa->init_node; 1441 re_node_set_init_empty (&eps_via_nodes); 1442 1443 if (__libc_use_alloca (nmatch * sizeof (regmatch_t))) 1444 prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t)); 1445 else 1446 { 1447 prev_idx_match = re_malloc (regmatch_t, nmatch); 1448 if (prev_idx_match == NULL) 1449 { 1450 free_fail_stack_return (fs); 1451 return REG_ESPACE; 1452 } 1453 prev_idx_match_malloced = 1; 1454 } 1455 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch); 1456 1457 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;) 1458 { 1459 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch); 1460 1461 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node) 1462 { 1463 int reg_idx; 1464 if (fs) 1465 { 1466 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx) 1467 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1) 1468 break; 1469 if (reg_idx == nmatch) 1470 { 1471 re_node_set_free (&eps_via_nodes); 1472 if (prev_idx_match_malloced) 1473 re_free (prev_idx_match); 1474 return free_fail_stack_return (fs); 1475 } 1476 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch, 1477 &eps_via_nodes); 1478 } 1479 else 1480 { 1481 re_node_set_free (&eps_via_nodes); 1482 if (prev_idx_match_malloced) 1483 re_free (prev_idx_match); 1484 return REG_NOERROR; 1485 } 1486 } 1487 1488 /* Proceed to next node. */ 1489 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node, 1490 &eps_via_nodes, fs); 1491 1492 if (BE (cur_node < 0, 0)) 1493 { 1494 if (BE (cur_node == -2, 0)) 1495 { 1496 re_node_set_free (&eps_via_nodes); 1497 if (prev_idx_match_malloced) 1498 re_free (prev_idx_match); 1499 free_fail_stack_return (fs); 1500 return REG_ESPACE; 1501 } 1502 if (fs) 1503 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch, 1504 &eps_via_nodes); 1505 else 1506 { 1507 re_node_set_free (&eps_via_nodes); 1508 if (prev_idx_match_malloced) 1509 re_free (prev_idx_match); 1510 return REG_NOMATCH; 1511 } 1512 } 1513 } 1514 re_node_set_free (&eps_via_nodes); 1515 if (prev_idx_match_malloced) 1516 re_free (prev_idx_match); 1517 return free_fail_stack_return (fs); 1518} 1519 1520static reg_errcode_t 1521internal_function 1522free_fail_stack_return (struct re_fail_stack_t *fs) 1523{ 1524 if (fs) 1525 { 1526 int fs_idx; 1527 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx) 1528 { 1529 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes); 1530 re_free (fs->stack[fs_idx].regs); 1531 } 1532 re_free (fs->stack); 1533 } 1534 return REG_NOERROR; 1535} 1536 1537static void 1538internal_function 1539update_regs (const re_dfa_t *dfa, regmatch_t *pmatch, 1540 regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch) 1541{ 1542 int type = dfa->nodes[cur_node].type; 1543 if (type == OP_OPEN_SUBEXP) 1544 { 1545 int reg_num = dfa->nodes[cur_node].opr.idx + 1; 1546 1547 /* We are at the first node of this sub expression. */ 1548 if (reg_num < nmatch) 1549 { 1550 pmatch[reg_num].rm_so = cur_idx; 1551 pmatch[reg_num].rm_eo = -1; 1552 } 1553 } 1554 else if (type == OP_CLOSE_SUBEXP) 1555 { 1556 int reg_num = dfa->nodes[cur_node].opr.idx + 1; 1557 if (reg_num < nmatch) 1558 { 1559 /* We are at the last node of this sub expression. */ 1560 if (pmatch[reg_num].rm_so < cur_idx) 1561 { 1562 pmatch[reg_num].rm_eo = cur_idx; 1563 /* This is a non-empty match or we are not inside an optional 1564 subexpression. Accept this right away. */ 1565 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch); 1566 } 1567 else 1568 { 1569 if (dfa->nodes[cur_node].opt_subexp 1570 && prev_idx_match[reg_num].rm_so != -1) 1571 /* We transited through an empty match for an optional 1572 subexpression, like (a?)*, and this is not the subexp's 1573 first match. Copy back the old content of the registers 1574 so that matches of an inner subexpression are undone as 1575 well, like in ((a?))*. */ 1576 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch); 1577 else 1578 /* We completed a subexpression, but it may be part of 1579 an optional one, so do not update PREV_IDX_MATCH. */ 1580 pmatch[reg_num].rm_eo = cur_idx; 1581 } 1582 } 1583 } 1584} 1585 1586/* This function checks the STATE_LOG from the SCTX->last_str_idx to 0 1587 and sift the nodes in each states according to the following rules. 1588 Updated state_log will be wrote to STATE_LOG. 1589 1590 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if... 1591 1. When STR_IDX == MATCH_LAST(the last index in the state_log): 1592 If `a' isn't the LAST_NODE and `a' can't epsilon transit to 1593 the LAST_NODE, we throw away the node `a'. 1594 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts 1595 string `s' and transit to `b': 1596 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw 1597 away the node `a'. 1598 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is 1599 thrown away, we throw away the node `a'. 1600 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b': 1601 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the 1602 node `a'. 1603 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away, 1604 we throw away the node `a'. */ 1605 1606#define STATE_NODE_CONTAINS(state,node) \ 1607 ((state) != NULL && re_node_set_contains (&(state)->nodes, node)) 1608 1609static reg_errcode_t 1610internal_function 1611sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx) 1612{ 1613 reg_errcode_t err; 1614 int null_cnt = 0; 1615 int str_idx = sctx->last_str_idx; 1616 re_node_set cur_dest; 1617 1618#ifdef DEBUG 1619 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL); 1620#endif 1621 1622 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon 1623 transit to the last_node and the last_node itself. */ 1624 err = re_node_set_init_1 (&cur_dest, sctx->last_node); 1625 if (BE (err != REG_NOERROR, 0)) 1626 return err; 1627 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest); 1628 if (BE (err != REG_NOERROR, 0)) 1629 goto free_return; 1630 1631 /* Then check each states in the state_log. */ 1632 while (str_idx > 0) 1633 { 1634 /* Update counters. */ 1635 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0; 1636 if (null_cnt > mctx->max_mb_elem_len) 1637 { 1638 memset (sctx->sifted_states, '\0', 1639 sizeof (re_dfastate_t *) * str_idx); 1640 re_node_set_free (&cur_dest); 1641 return REG_NOERROR; 1642 } 1643 re_node_set_empty (&cur_dest); 1644 --str_idx; 1645 1646 if (mctx->state_log[str_idx]) 1647 { 1648 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest); 1649 if (BE (err != REG_NOERROR, 0)) 1650 goto free_return; 1651 } 1652 1653 /* Add all the nodes which satisfy the following conditions: 1654 - It can epsilon transit to a node in CUR_DEST. 1655 - It is in CUR_SRC. 1656 And update state_log. */ 1657 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest); 1658 if (BE (err != REG_NOERROR, 0)) 1659 goto free_return; 1660 } 1661 err = REG_NOERROR; 1662 free_return: 1663 re_node_set_free (&cur_dest); 1664 return err; 1665} 1666 1667static reg_errcode_t 1668internal_function __attribute_warn_unused_result__ 1669build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx, 1670 int str_idx, re_node_set *cur_dest) 1671{ 1672 const re_dfa_t *const dfa = mctx->dfa; 1673 const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes; 1674 int i; 1675 1676 /* Then build the next sifted state. 1677 We build the next sifted state on `cur_dest', and update 1678 `sifted_states[str_idx]' with `cur_dest'. 1679 Note: 1680 `cur_dest' is the sifted state from `state_log[str_idx + 1]'. 1681 `cur_src' points the node_set of the old `state_log[str_idx]' 1682 (with the epsilon nodes pre-filtered out). */ 1683 for (i = 0; i < cur_src->nelem; i++) 1684 { 1685 int prev_node = cur_src->elems[i]; 1686 int naccepted = 0; 1687 int ret; 1688 1689#ifdef DEBUG 1690 re_token_type_t type = dfa->nodes[prev_node].type; 1691 assert (!IS_EPSILON_NODE (type)); 1692#endif 1693#ifdef RE_ENABLE_I18N 1694 /* If the node may accept `multi byte'. */ 1695 if (dfa->nodes[prev_node].accept_mb) 1696 naccepted = sift_states_iter_mb (mctx, sctx, prev_node, 1697 str_idx, sctx->last_str_idx); 1698#endif /* RE_ENABLE_I18N */ 1699 1700 /* We don't check backreferences here. 1701 See update_cur_sifted_state(). */ 1702 if (!naccepted 1703 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx) 1704 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1], 1705 dfa->nexts[prev_node])) 1706 naccepted = 1; 1707 1708 if (naccepted == 0) 1709 continue; 1710 1711 if (sctx->limits.nelem) 1712 { 1713 int to_idx = str_idx + naccepted; 1714 if (check_dst_limits (mctx, &sctx->limits, 1715 dfa->nexts[prev_node], to_idx, 1716 prev_node, str_idx)) 1717 continue; 1718 } 1719 ret = re_node_set_insert (cur_dest, prev_node); 1720 if (BE (ret == -1, 0)) 1721 return REG_ESPACE; 1722 } 1723 1724 return REG_NOERROR; 1725} 1726 1727/* Helper functions. */ 1728 1729static reg_errcode_t 1730internal_function 1731clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx) 1732{ 1733 int top = mctx->state_log_top; 1734 1735 if ((next_state_log_idx >= mctx->input.bufs_len 1736 && mctx->input.bufs_len < mctx->input.len) 1737 || (next_state_log_idx >= mctx->input.valid_len 1738 && mctx->input.valid_len < mctx->input.len)) 1739 { 1740 reg_errcode_t err; 1741 err = extend_buffers (mctx, next_state_log_idx + 1); 1742 if (BE (err != REG_NOERROR, 0)) 1743 return err; 1744 } 1745 1746 if (top < next_state_log_idx) 1747 { 1748 memset (mctx->state_log + top + 1, '\0', 1749 sizeof (re_dfastate_t *) * (next_state_log_idx - top)); 1750 mctx->state_log_top = next_state_log_idx; 1751 } 1752 return REG_NOERROR; 1753} 1754 1755static reg_errcode_t 1756internal_function 1757merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst, 1758 re_dfastate_t **src, int num) 1759{ 1760 int st_idx; 1761 reg_errcode_t err; 1762 for (st_idx = 0; st_idx < num; ++st_idx) 1763 { 1764 if (dst[st_idx] == NULL) 1765 dst[st_idx] = src[st_idx]; 1766 else if (src[st_idx] != NULL) 1767 { 1768 re_node_set merged_set; 1769 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes, 1770 &src[st_idx]->nodes); 1771 if (BE (err != REG_NOERROR, 0)) 1772 return err; 1773 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set); 1774 re_node_set_free (&merged_set); 1775 if (BE (err != REG_NOERROR, 0)) 1776 return err; 1777 } 1778 } 1779 return REG_NOERROR; 1780} 1781 1782static reg_errcode_t 1783internal_function 1784update_cur_sifted_state (const re_match_context_t *mctx, 1785 re_sift_context_t *sctx, int str_idx, 1786 re_node_set *dest_nodes) 1787{ 1788 const re_dfa_t *const dfa = mctx->dfa; 1789 reg_errcode_t err = REG_NOERROR; 1790 const re_node_set *candidates; 1791 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL 1792 : &mctx->state_log[str_idx]->nodes); 1793 1794 if (dest_nodes->nelem == 0) 1795 sctx->sifted_states[str_idx] = NULL; 1796 else 1797 { 1798 if (candidates) 1799 { 1800 /* At first, add the nodes which can epsilon transit to a node in 1801 DEST_NODE. */ 1802 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates); 1803 if (BE (err != REG_NOERROR, 0)) 1804 return err; 1805 1806 /* Then, check the limitations in the current sift_context. */ 1807 if (sctx->limits.nelem) 1808 { 1809 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits, 1810 mctx->bkref_ents, str_idx); 1811 if (BE (err != REG_NOERROR, 0)) 1812 return err; 1813 } 1814 } 1815 1816 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes); 1817 if (BE (err != REG_NOERROR, 0)) 1818 return err; 1819 } 1820 1821 if (candidates && mctx->state_log[str_idx]->has_backref) 1822 { 1823 err = sift_states_bkref (mctx, sctx, str_idx, candidates); 1824 if (BE (err != REG_NOERROR, 0)) 1825 return err; 1826 } 1827 return REG_NOERROR; 1828} 1829 1830static reg_errcode_t 1831internal_function __attribute_warn_unused_result__ 1832add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes, 1833 const re_node_set *candidates) 1834{ 1835 reg_errcode_t err = REG_NOERROR; 1836 int i; 1837 1838 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes); 1839 if (BE (err != REG_NOERROR, 0)) 1840 return err; 1841 1842 if (!state->inveclosure.alloc) 1843 { 1844 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem); 1845 if (BE (err != REG_NOERROR, 0)) 1846 return REG_ESPACE; 1847 for (i = 0; i < dest_nodes->nelem; i++) 1848 { 1849 err = re_node_set_merge (&state->inveclosure, 1850 dfa->inveclosures + dest_nodes->elems[i]); 1851 if (BE (err != REG_NOERROR, 0)) 1852 return REG_ESPACE; 1853 } 1854 } 1855 return re_node_set_add_intersect (dest_nodes, candidates, 1856 &state->inveclosure); 1857} 1858 1859static reg_errcode_t 1860internal_function 1861sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes, 1862 const re_node_set *candidates) 1863{ 1864 int ecl_idx; 1865 reg_errcode_t err; 1866 re_node_set *inv_eclosure = dfa->inveclosures + node; 1867 re_node_set except_nodes; 1868 re_node_set_init_empty (&except_nodes); 1869 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx) 1870 { 1871 int cur_node = inv_eclosure->elems[ecl_idx]; 1872 if (cur_node == node) 1873 continue; 1874 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type)) 1875 { 1876 int edst1 = dfa->edests[cur_node].elems[0]; 1877 int edst2 = ((dfa->edests[cur_node].nelem > 1) 1878 ? dfa->edests[cur_node].elems[1] : -1); 1879 if ((!re_node_set_contains (inv_eclosure, edst1) 1880 && re_node_set_contains (dest_nodes, edst1)) 1881 || (edst2 > 0 1882 && !re_node_set_contains (inv_eclosure, edst2) 1883 && re_node_set_contains (dest_nodes, edst2))) 1884 { 1885 err = re_node_set_add_intersect (&except_nodes, candidates, 1886 dfa->inveclosures + cur_node); 1887 if (BE (err != REG_NOERROR, 0)) 1888 { 1889 re_node_set_free (&except_nodes); 1890 return err; 1891 } 1892 } 1893 } 1894 } 1895 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx) 1896 { 1897 int cur_node = inv_eclosure->elems[ecl_idx]; 1898 if (!re_node_set_contains (&except_nodes, cur_node)) 1899 { 1900 int idx = re_node_set_contains (dest_nodes, cur_node) - 1; 1901 re_node_set_remove_at (dest_nodes, idx); 1902 } 1903 } 1904 re_node_set_free (&except_nodes); 1905 return REG_NOERROR; 1906} 1907 1908static int 1909internal_function 1910check_dst_limits (const re_match_context_t *mctx, re_node_set *limits, 1911 int dst_node, int dst_idx, int src_node, int src_idx) 1912{ 1913 const re_dfa_t *const dfa = mctx->dfa; 1914 int lim_idx, src_pos, dst_pos; 1915 1916 int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx); 1917 int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx); 1918 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx) 1919 { 1920 int subexp_idx; 1921 struct re_backref_cache_entry *ent; 1922 ent = mctx->bkref_ents + limits->elems[lim_idx]; 1923 subexp_idx = dfa->nodes[ent->node].opr.idx; 1924 1925 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx], 1926 subexp_idx, dst_node, dst_idx, 1927 dst_bkref_idx); 1928 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx], 1929 subexp_idx, src_node, src_idx, 1930 src_bkref_idx); 1931 1932 /* In case of: 1933 <src> <dst> ( <subexp> ) 1934 ( <subexp> ) <src> <dst> 1935 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */ 1936 if (src_pos == dst_pos) 1937 continue; /* This is unrelated limitation. */ 1938 else 1939 return 1; 1940 } 1941 return 0; 1942} 1943 1944static int 1945internal_function 1946check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries, 1947 int subexp_idx, int from_node, int bkref_idx) 1948{ 1949 const re_dfa_t *const dfa = mctx->dfa; 1950 const re_node_set *eclosures = dfa->eclosures + from_node; 1951 int node_idx; 1952 1953 /* Else, we are on the boundary: examine the nodes on the epsilon 1954 closure. */ 1955 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx) 1956 { 1957 int node = eclosures->elems[node_idx]; 1958 switch (dfa->nodes[node].type) 1959 { 1960 case OP_BACK_REF: 1961 if (bkref_idx != -1) 1962 { 1963 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx; 1964 do 1965 { 1966 int dst, cpos; 1967 1968 if (ent->node != node) 1969 continue; 1970 1971 if (subexp_idx < BITSET_WORD_BITS 1972 && !(ent->eps_reachable_subexps_map 1973 & ((bitset_word_t) 1 << subexp_idx))) 1974 continue; 1975 1976 /* Recurse trying to reach the OP_OPEN_SUBEXP and 1977 OP_CLOSE_SUBEXP cases below. But, if the 1978 destination node is the same node as the source 1979 node, don't recurse because it would cause an 1980 infinite loop: a regex that exhibits this behavior 1981 is ()\1*\1* */ 1982 dst = dfa->edests[node].elems[0]; 1983 if (dst == from_node) 1984 { 1985 if (boundaries & 1) 1986 return -1; 1987 else /* if (boundaries & 2) */ 1988 return 0; 1989 } 1990 1991 cpos = 1992 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx, 1993 dst, bkref_idx); 1994 if (cpos == -1 /* && (boundaries & 1) */) 1995 return -1; 1996 if (cpos == 0 && (boundaries & 2)) 1997 return 0; 1998 1999 if (subexp_idx < BITSET_WORD_BITS) 2000 ent->eps_reachable_subexps_map 2001 &= ~((bitset_word_t) 1 << subexp_idx); 2002 } 2003 while (ent++->more); 2004 } 2005 break; 2006 2007 case OP_OPEN_SUBEXP: 2008 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx) 2009 return -1; 2010 break; 2011 2012 case OP_CLOSE_SUBEXP: 2013 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx) 2014 return 0; 2015 break; 2016 2017 default: 2018 break; 2019 } 2020 } 2021 2022 return (boundaries & 2) ? 1 : 0; 2023} 2024 2025static int 2026internal_function 2027check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit, 2028 int subexp_idx, int from_node, int str_idx, 2029 int bkref_idx) 2030{ 2031 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit; 2032 int boundaries; 2033 2034 /* If we are outside the range of the subexpression, return -1 or 1. */ 2035 if (str_idx < lim->subexp_from) 2036 return -1; 2037 2038 if (lim->subexp_to < str_idx) 2039 return 1; 2040 2041 /* If we are within the subexpression, return 0. */ 2042 boundaries = (str_idx == lim->subexp_from); 2043 boundaries |= (str_idx == lim->subexp_to) << 1; 2044 if (boundaries == 0) 2045 return 0; 2046 2047 /* Else, examine epsilon closure. */ 2048 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx, 2049 from_node, bkref_idx); 2050} 2051 2052/* Check the limitations of sub expressions LIMITS, and remove the nodes 2053 which are against limitations from DEST_NODES. */ 2054 2055static reg_errcode_t 2056internal_function 2057check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes, 2058 const re_node_set *candidates, re_node_set *limits, 2059 struct re_backref_cache_entry *bkref_ents, int str_idx) 2060{ 2061 reg_errcode_t err; 2062 int node_idx, lim_idx; 2063 2064 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx) 2065 { 2066 int subexp_idx; 2067 struct re_backref_cache_entry *ent; 2068 ent = bkref_ents + limits->elems[lim_idx]; 2069 2070 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx) 2071 continue; /* This is unrelated limitation. */ 2072 2073 subexp_idx = dfa->nodes[ent->node].opr.idx; 2074 if (ent->subexp_to == str_idx) 2075 { 2076 int ops_node = -1; 2077 int cls_node = -1; 2078 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx) 2079 { 2080 int node = dest_nodes->elems[node_idx]; 2081 re_token_type_t type = dfa->nodes[node].type; 2082 if (type == OP_OPEN_SUBEXP 2083 && subexp_idx == dfa->nodes[node].opr.idx) 2084 ops_node = node; 2085 else if (type == OP_CLOSE_SUBEXP 2086 && subexp_idx == dfa->nodes[node].opr.idx) 2087 cls_node = node; 2088 } 2089 2090 /* Check the limitation of the open subexpression. */ 2091 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */ 2092 if (ops_node >= 0) 2093 { 2094 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes, 2095 candidates); 2096 if (BE (err != REG_NOERROR, 0)) 2097 return err; 2098 } 2099 2100 /* Check the limitation of the close subexpression. */ 2101 if (cls_node >= 0) 2102 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx) 2103 { 2104 int node = dest_nodes->elems[node_idx]; 2105 if (!re_node_set_contains (dfa->inveclosures + node, 2106 cls_node) 2107 && !re_node_set_contains (dfa->eclosures + node, 2108 cls_node)) 2109 { 2110 /* It is against this limitation. 2111 Remove it form the current sifted state. */ 2112 err = sub_epsilon_src_nodes (dfa, node, dest_nodes, 2113 candidates); 2114 if (BE (err != REG_NOERROR, 0)) 2115 return err; 2116 --node_idx; 2117 } 2118 } 2119 } 2120 else /* (ent->subexp_to != str_idx) */ 2121 { 2122 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx) 2123 { 2124 int node = dest_nodes->elems[node_idx]; 2125 re_token_type_t type = dfa->nodes[node].type; 2126 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP) 2127 { 2128 if (subexp_idx != dfa->nodes[node].opr.idx) 2129 continue; 2130 /* It is against this limitation. 2131 Remove it form the current sifted state. */ 2132 err = sub_epsilon_src_nodes (dfa, node, dest_nodes, 2133 candidates); 2134 if (BE (err != REG_NOERROR, 0)) 2135 return err; 2136 } 2137 } 2138 } 2139 } 2140 return REG_NOERROR; 2141} 2142 2143static reg_errcode_t 2144internal_function __attribute_warn_unused_result__ 2145sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx, 2146 int str_idx, const re_node_set *candidates) 2147{ 2148 const re_dfa_t *const dfa = mctx->dfa; 2149 reg_errcode_t err; 2150 int node_idx, node; 2151 re_sift_context_t local_sctx; 2152 int first_idx = search_cur_bkref_entry (mctx, str_idx); 2153 2154 if (first_idx == -1) 2155 return REG_NOERROR; 2156 2157 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */ 2158 2159 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx) 2160 { 2161 int enabled_idx; 2162 re_token_type_t type; 2163 struct re_backref_cache_entry *entry; 2164 node = candidates->elems[node_idx]; 2165 type = dfa->nodes[node].type; 2166 /* Avoid infinite loop for the REs like "()\1+". */ 2167 if (node == sctx->last_node && str_idx == sctx->last_str_idx) 2168 continue; 2169 if (type != OP_BACK_REF) 2170 continue; 2171 2172 entry = mctx->bkref_ents + first_idx; 2173 enabled_idx = first_idx; 2174 do 2175 { 2176 int subexp_len; 2177 int to_idx; 2178 int dst_node; 2179 int ret; 2180 re_dfastate_t *cur_state; 2181 2182 if (entry->node != node) 2183 continue; 2184 subexp_len = entry->subexp_to - entry->subexp_from; 2185 to_idx = str_idx + subexp_len; 2186 dst_node = (subexp_len ? dfa->nexts[node] 2187 : dfa->edests[node].elems[0]); 2188 2189 if (to_idx > sctx->last_str_idx 2190 || sctx->sifted_states[to_idx] == NULL 2191 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node) 2192 || check_dst_limits (mctx, &sctx->limits, node, 2193 str_idx, dst_node, to_idx)) 2194 continue; 2195 2196 if (local_sctx.sifted_states == NULL) 2197 { 2198 local_sctx = *sctx; 2199 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits); 2200 if (BE (err != REG_NOERROR, 0)) 2201 goto free_return; 2202 } 2203 local_sctx.last_node = node; 2204 local_sctx.last_str_idx = str_idx; 2205 ret = re_node_set_insert (&local_sctx.limits, enabled_idx); 2206 if (BE (ret < 0, 0)) 2207 { 2208 err = REG_ESPACE; 2209 goto free_return; 2210 } 2211 cur_state = local_sctx.sifted_states[str_idx]; 2212 err = sift_states_backward (mctx, &local_sctx); 2213 if (BE (err != REG_NOERROR, 0)) 2214 goto free_return; 2215 if (sctx->limited_states != NULL) 2216 { 2217 err = merge_state_array (dfa, sctx->limited_states, 2218 local_sctx.sifted_states, 2219 str_idx + 1); 2220 if (BE (err != REG_NOERROR, 0)) 2221 goto free_return; 2222 } 2223 local_sctx.sifted_states[str_idx] = cur_state; 2224 re_node_set_remove (&local_sctx.limits, enabled_idx); 2225 2226 /* mctx->bkref_ents may have changed, reload the pointer. */ 2227 entry = mctx->bkref_ents + enabled_idx; 2228 } 2229 while (enabled_idx++, entry++->more); 2230 } 2231 err = REG_NOERROR; 2232 free_return: 2233 if (local_sctx.sifted_states != NULL) 2234 { 2235 re_node_set_free (&local_sctx.limits); 2236 } 2237 2238 return err; 2239} 2240 2241 2242#ifdef RE_ENABLE_I18N 2243static int 2244internal_function 2245sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx, 2246 int node_idx, int str_idx, int max_str_idx) 2247{ 2248 const re_dfa_t *const dfa = mctx->dfa; 2249 int naccepted; 2250 /* Check the node can accept `multi byte'. */ 2251 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx); 2252 if (naccepted > 0 && str_idx + naccepted <= max_str_idx && 2253 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted], 2254 dfa->nexts[node_idx])) 2255 /* The node can't accept the `multi byte', or the 2256 destination was already thrown away, then the node 2257 could't accept the current input `multi byte'. */ 2258 naccepted = 0; 2259 /* Otherwise, it is sure that the node could accept 2260 `naccepted' bytes input. */ 2261 return naccepted; 2262} 2263#endif /* RE_ENABLE_I18N */ 2264 2265 2266/* Functions for state transition. */ 2267 2268/* Return the next state to which the current state STATE will transit by 2269 accepting the current input byte, and update STATE_LOG if necessary. 2270 If STATE can accept a multibyte char/collating element/back reference 2271 update the destination of STATE_LOG. */ 2272 2273static re_dfastate_t * 2274internal_function __attribute_warn_unused_result__ 2275transit_state (reg_errcode_t *err, re_match_context_t *mctx, 2276 re_dfastate_t *state) 2277{ 2278 re_dfastate_t **trtable; 2279 unsigned char ch; 2280 2281#ifdef RE_ENABLE_I18N 2282 /* If the current state can accept multibyte. */ 2283 if (BE (state->accept_mb, 0)) 2284 { 2285 *err = transit_state_mb (mctx, state); 2286 if (BE (*err != REG_NOERROR, 0)) 2287 return NULL; 2288 } 2289#endif /* RE_ENABLE_I18N */ 2290 2291 /* Then decide the next state with the single byte. */ 2292#if 0 2293 if (0) 2294 /* don't use transition table */ 2295 return transit_state_sb (err, mctx, state); 2296#endif 2297 2298 /* Use transition table */ 2299 ch = re_string_fetch_byte (&mctx->input); 2300 for (;;) 2301 { 2302 trtable = state->trtable; 2303 if (BE (trtable != NULL, 1)) 2304 return trtable[ch]; 2305 2306 trtable = state->word_trtable; 2307 if (BE (trtable != NULL, 1)) 2308 { 2309 unsigned int context; 2310 context 2311 = re_string_context_at (&mctx->input, 2312 re_string_cur_idx (&mctx->input) - 1, 2313 mctx->eflags); 2314 if (IS_WORD_CONTEXT (context)) 2315 return trtable[ch + SBC_MAX]; 2316 else 2317 return trtable[ch]; 2318 } 2319 2320 if (!build_trtable (mctx->dfa, state)) 2321 { 2322 *err = REG_ESPACE; 2323 return NULL; 2324 } 2325 2326 /* Retry, we now have a transition table. */ 2327 } 2328} 2329 2330/* Update the state_log if we need */ 2331re_dfastate_t * 2332internal_function 2333merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx, 2334 re_dfastate_t *next_state) 2335{ 2336 const re_dfa_t *const dfa = mctx->dfa; 2337 int cur_idx = re_string_cur_idx (&mctx->input); 2338 2339 if (cur_idx > mctx->state_log_top) 2340 { 2341 mctx->state_log[cur_idx] = next_state; 2342 mctx->state_log_top = cur_idx; 2343 } 2344 else if (mctx->state_log[cur_idx] == 0) 2345 { 2346 mctx->state_log[cur_idx] = next_state; 2347 } 2348 else 2349 { 2350 re_dfastate_t *pstate; 2351 unsigned int context; 2352 re_node_set next_nodes, *log_nodes, *table_nodes = NULL; 2353 /* If (state_log[cur_idx] != 0), it implies that cur_idx is 2354 the destination of a multibyte char/collating element/ 2355 back reference. Then the next state is the union set of 2356 these destinations and the results of the transition table. */ 2357 pstate = mctx->state_log[cur_idx]; 2358 log_nodes = pstate->entrance_nodes; 2359 if (next_state != NULL) 2360 { 2361 table_nodes = next_state->entrance_nodes; 2362 *err = re_node_set_init_union (&next_nodes, table_nodes, 2363 log_nodes); 2364 if (BE (*err != REG_NOERROR, 0)) 2365 return NULL; 2366 } 2367 else 2368 next_nodes = *log_nodes; 2369 /* Note: We already add the nodes of the initial state, 2370 then we don't need to add them here. */ 2371 2372 context = re_string_context_at (&mctx->input, 2373 re_string_cur_idx (&mctx->input) - 1, 2374 mctx->eflags); 2375 next_state = mctx->state_log[cur_idx] 2376 = re_acquire_state_context (err, dfa, &next_nodes, context); 2377 /* We don't need to check errors here, since the return value of 2378 this function is next_state and ERR is already set. */ 2379 2380 if (table_nodes != NULL) 2381 re_node_set_free (&next_nodes); 2382 } 2383 2384 if (BE (dfa->nbackref, 0) && next_state != NULL) 2385 { 2386 /* Check OP_OPEN_SUBEXP in the current state in case that we use them 2387 later. We must check them here, since the back references in the 2388 next state might use them. */ 2389 *err = check_subexp_matching_top (mctx, &next_state->nodes, 2390 cur_idx); 2391 if (BE (*err != REG_NOERROR, 0)) 2392 return NULL; 2393 2394 /* If the next state has back references. */ 2395 if (next_state->has_backref) 2396 { 2397 *err = transit_state_bkref (mctx, &next_state->nodes); 2398 if (BE (*err != REG_NOERROR, 0)) 2399 return NULL; 2400 next_state = mctx->state_log[cur_idx]; 2401 } 2402 } 2403 2404 return next_state; 2405} 2406 2407/* Skip bytes in the input that correspond to part of a 2408 multi-byte match, then look in the log for a state 2409 from which to restart matching. */ 2410re_dfastate_t * 2411internal_function 2412find_recover_state (reg_errcode_t *err, re_match_context_t *mctx) 2413{ 2414 re_dfastate_t *cur_state; 2415 do 2416 { 2417 int max = mctx->state_log_top; 2418 int cur_str_idx = re_string_cur_idx (&mctx->input); 2419 2420 do 2421 { 2422 if (++cur_str_idx > max) 2423 return NULL; 2424 re_string_skip_bytes (&mctx->input, 1); 2425 } 2426 while (mctx->state_log[cur_str_idx] == NULL); 2427 2428 cur_state = merge_state_with_log (err, mctx, NULL); 2429 } 2430 while (*err == REG_NOERROR && cur_state == NULL); 2431 return cur_state; 2432} 2433 2434/* Helper functions for transit_state. */ 2435 2436/* From the node set CUR_NODES, pick up the nodes whose types are 2437 OP_OPEN_SUBEXP and which have corresponding back references in the regular 2438 expression. And register them to use them later for evaluating the 2439 correspoding back references. */ 2440 2441static reg_errcode_t 2442internal_function 2443check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes, 2444 int str_idx) 2445{ 2446 const re_dfa_t *const dfa = mctx->dfa; 2447 int node_idx; 2448 reg_errcode_t err; 2449 2450 /* TODO: This isn't efficient. 2451 Because there might be more than one nodes whose types are 2452 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all 2453 nodes. 2454 E.g. RE: (a){2} */ 2455 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx) 2456 { 2457 int node = cur_nodes->elems[node_idx]; 2458 if (dfa->nodes[node].type == OP_OPEN_SUBEXP 2459 && dfa->nodes[node].opr.idx < BITSET_WORD_BITS 2460 && (dfa->used_bkref_map 2461 & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx))) 2462 { 2463 err = match_ctx_add_subtop (mctx, node, str_idx); 2464 if (BE (err != REG_NOERROR, 0)) 2465 return err; 2466 } 2467 } 2468 return REG_NOERROR; 2469} 2470 2471#if 0 2472/* Return the next state to which the current state STATE will transit by 2473 accepting the current input byte. */ 2474 2475static re_dfastate_t * 2476transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx, 2477 re_dfastate_t *state) 2478{ 2479 const re_dfa_t *const dfa = mctx->dfa; 2480 re_node_set next_nodes; 2481 re_dfastate_t *next_state; 2482 int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input); 2483 unsigned int context; 2484 2485 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1); 2486 if (BE (*err != REG_NOERROR, 0)) 2487 return NULL; 2488 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt) 2489 { 2490 int cur_node = state->nodes.elems[node_cnt]; 2491 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx)) 2492 { 2493 *err = re_node_set_merge (&next_nodes, 2494 dfa->eclosures + dfa->nexts[cur_node]); 2495 if (BE (*err != REG_NOERROR, 0)) 2496 { 2497 re_node_set_free (&next_nodes); 2498 return NULL; 2499 } 2500 } 2501 } 2502 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags); 2503 next_state = re_acquire_state_context (err, dfa, &next_nodes, context); 2504 /* We don't need to check errors here, since the return value of 2505 this function is next_state and ERR is already set. */ 2506 2507 re_node_set_free (&next_nodes); 2508 re_string_skip_bytes (&mctx->input, 1); 2509 return next_state; 2510} 2511#endif 2512 2513#ifdef RE_ENABLE_I18N 2514static reg_errcode_t 2515internal_function 2516transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate) 2517{ 2518 const re_dfa_t *const dfa = mctx->dfa; 2519 reg_errcode_t err; 2520 int i; 2521 2522 for (i = 0; i < pstate->nodes.nelem; ++i) 2523 { 2524 re_node_set dest_nodes, *new_nodes; 2525 int cur_node_idx = pstate->nodes.elems[i]; 2526 int naccepted, dest_idx; 2527 unsigned int context; 2528 re_dfastate_t *dest_state; 2529 2530 if (!dfa->nodes[cur_node_idx].accept_mb) 2531 continue; 2532 2533 if (dfa->nodes[cur_node_idx].constraint) 2534 { 2535 context = re_string_context_at (&mctx->input, 2536 re_string_cur_idx (&mctx->input), 2537 mctx->eflags); 2538 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint, 2539 context)) 2540 continue; 2541 } 2542 2543 /* How many bytes the node can accept? */ 2544 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input, 2545 re_string_cur_idx (&mctx->input)); 2546 if (naccepted == 0) 2547 continue; 2548 2549 /* The node can accepts `naccepted' bytes. */ 2550 dest_idx = re_string_cur_idx (&mctx->input) + naccepted; 2551 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted 2552 : mctx->max_mb_elem_len); 2553 err = clean_state_log_if_needed (mctx, dest_idx); 2554 if (BE (err != REG_NOERROR, 0)) 2555 return err; 2556#ifdef DEBUG 2557 assert (dfa->nexts[cur_node_idx] != -1); 2558#endif 2559 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx]; 2560 2561 dest_state = mctx->state_log[dest_idx]; 2562 if (dest_state == NULL) 2563 dest_nodes = *new_nodes; 2564 else 2565 { 2566 err = re_node_set_init_union (&dest_nodes, 2567 dest_state->entrance_nodes, new_nodes); 2568 if (BE (err != REG_NOERROR, 0)) 2569 return err; 2570 } 2571 context = re_string_context_at (&mctx->input, dest_idx - 1, 2572 mctx->eflags); 2573 mctx->state_log[dest_idx] 2574 = re_acquire_state_context (&err, dfa, &dest_nodes, context); 2575 if (dest_state != NULL) 2576 re_node_set_free (&dest_nodes); 2577 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0)) 2578 return err; 2579 } 2580 return REG_NOERROR; 2581} 2582#endif /* RE_ENABLE_I18N */ 2583 2584static reg_errcode_t 2585internal_function 2586transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes) 2587{ 2588 const re_dfa_t *const dfa = mctx->dfa; 2589 reg_errcode_t err; 2590 int i; 2591 int cur_str_idx = re_string_cur_idx (&mctx->input); 2592 2593 for (i = 0; i < nodes->nelem; ++i) 2594 { 2595 int dest_str_idx, prev_nelem, bkc_idx; 2596 int node_idx = nodes->elems[i]; 2597 unsigned int context; 2598 const re_token_t *node = dfa->nodes + node_idx; 2599 re_node_set *new_dest_nodes; 2600 2601 /* Check whether `node' is a backreference or not. */ 2602 if (node->type != OP_BACK_REF) 2603 continue; 2604 2605 if (node->constraint) 2606 { 2607 context = re_string_context_at (&mctx->input, cur_str_idx, 2608 mctx->eflags); 2609 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context)) 2610 continue; 2611 } 2612 2613 /* `node' is a backreference. 2614 Check the substring which the substring matched. */ 2615 bkc_idx = mctx->nbkref_ents; 2616 err = get_subexp (mctx, node_idx, cur_str_idx); 2617 if (BE (err != REG_NOERROR, 0)) 2618 goto free_return; 2619 2620 /* And add the epsilon closures (which is `new_dest_nodes') of 2621 the backreference to appropriate state_log. */ 2622#ifdef DEBUG 2623 assert (dfa->nexts[node_idx] != -1); 2624#endif 2625 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx) 2626 { 2627 int subexp_len; 2628 re_dfastate_t *dest_state; 2629 struct re_backref_cache_entry *bkref_ent; 2630 bkref_ent = mctx->bkref_ents + bkc_idx; 2631 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx) 2632 continue; 2633 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from; 2634 new_dest_nodes = (subexp_len == 0 2635 ? dfa->eclosures + dfa->edests[node_idx].elems[0] 2636 : dfa->eclosures + dfa->nexts[node_idx]); 2637 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to 2638 - bkref_ent->subexp_from); 2639 context = re_string_context_at (&mctx->input, dest_str_idx - 1, 2640 mctx->eflags); 2641 dest_state = mctx->state_log[dest_str_idx]; 2642 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0 2643 : mctx->state_log[cur_str_idx]->nodes.nelem); 2644 /* Add `new_dest_node' to state_log. */ 2645 if (dest_state == NULL) 2646 { 2647 mctx->state_log[dest_str_idx] 2648 = re_acquire_state_context (&err, dfa, new_dest_nodes, 2649 context); 2650 if (BE (mctx->state_log[dest_str_idx] == NULL 2651 && err != REG_NOERROR, 0)) 2652 goto free_return; 2653 } 2654 else 2655 { 2656 re_node_set dest_nodes; 2657 err = re_node_set_init_union (&dest_nodes, 2658 dest_state->entrance_nodes, 2659 new_dest_nodes); 2660 if (BE (err != REG_NOERROR, 0)) 2661 { 2662 re_node_set_free (&dest_nodes); 2663 goto free_return; 2664 } 2665 mctx->state_log[dest_str_idx] 2666 = re_acquire_state_context (&err, dfa, &dest_nodes, context); 2667 re_node_set_free (&dest_nodes); 2668 if (BE (mctx->state_log[dest_str_idx] == NULL 2669 && err != REG_NOERROR, 0)) 2670 goto free_return; 2671 } 2672 /* We need to check recursively if the backreference can epsilon 2673 transit. */ 2674 if (subexp_len == 0 2675 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem) 2676 { 2677 err = check_subexp_matching_top (mctx, new_dest_nodes, 2678 cur_str_idx); 2679 if (BE (err != REG_NOERROR, 0)) 2680 goto free_return; 2681 err = transit_state_bkref (mctx, new_dest_nodes); 2682 if (BE (err != REG_NOERROR, 0)) 2683 goto free_return; 2684 } 2685 } 2686 } 2687 err = REG_NOERROR; 2688 free_return: 2689 return err; 2690} 2691 2692/* Enumerate all the candidates which the backreference BKREF_NODE can match 2693 at BKREF_STR_IDX, and register them by match_ctx_add_entry(). 2694 Note that we might collect inappropriate candidates here. 2695 However, the cost of checking them strictly here is too high, then we 2696 delay these checking for prune_impossible_nodes(). */ 2697 2698static reg_errcode_t 2699internal_function __attribute_warn_unused_result__ 2700get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx) 2701{ 2702 const re_dfa_t *const dfa = mctx->dfa; 2703 int subexp_num, sub_top_idx; 2704 const char *buf = (const char *) re_string_get_buffer (&mctx->input); 2705 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */ 2706 int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx); 2707 if (cache_idx != -1) 2708 { 2709 const struct re_backref_cache_entry *entry 2710 = mctx->bkref_ents + cache_idx; 2711 do 2712 if (entry->node == bkref_node) 2713 return REG_NOERROR; /* We already checked it. */ 2714 while (entry++->more); 2715 } 2716 2717 subexp_num = dfa->nodes[bkref_node].opr.idx; 2718 2719 /* For each sub expression */ 2720 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx) 2721 { 2722 reg_errcode_t err; 2723 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx]; 2724 re_sub_match_last_t *sub_last; 2725 int sub_last_idx, sl_str, bkref_str_off; 2726 2727 if (dfa->nodes[sub_top->node].opr.idx != subexp_num) 2728 continue; /* It isn't related. */ 2729 2730 sl_str = sub_top->str_idx; 2731 bkref_str_off = bkref_str_idx; 2732 /* At first, check the last node of sub expressions we already 2733 evaluated. */ 2734 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx) 2735 { 2736 int sl_str_diff; 2737 sub_last = sub_top->lasts[sub_last_idx]; 2738 sl_str_diff = sub_last->str_idx - sl_str; 2739 /* The matched string by the sub expression match with the substring 2740 at the back reference? */ 2741 if (sl_str_diff > 0) 2742 { 2743 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0)) 2744 { 2745 /* Not enough chars for a successful match. */ 2746 if (bkref_str_off + sl_str_diff > mctx->input.len) 2747 break; 2748 2749 err = clean_state_log_if_needed (mctx, 2750 bkref_str_off 2751 + sl_str_diff); 2752 if (BE (err != REG_NOERROR, 0)) 2753 return err; 2754 buf = (const char *) re_string_get_buffer (&mctx->input); 2755 } 2756 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0) 2757 /* We don't need to search this sub expression any more. */ 2758 break; 2759 } 2760 bkref_str_off += sl_str_diff; 2761 sl_str += sl_str_diff; 2762 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node, 2763 bkref_str_idx); 2764 2765 /* Reload buf, since the preceding call might have reallocated 2766 the buffer. */ 2767 buf = (const char *) re_string_get_buffer (&mctx->input); 2768 2769 if (err == REG_NOMATCH) 2770 continue; 2771 if (BE (err != REG_NOERROR, 0)) 2772 return err; 2773 } 2774 2775 if (sub_last_idx < sub_top->nlasts) 2776 continue; 2777 if (sub_last_idx > 0) 2778 ++sl_str; 2779 /* Then, search for the other last nodes of the sub expression. */ 2780 for (; sl_str <= bkref_str_idx; ++sl_str) 2781 { 2782 int cls_node, sl_str_off; 2783 const re_node_set *nodes; 2784 sl_str_off = sl_str - sub_top->str_idx; 2785 /* The matched string by the sub expression match with the substring 2786 at the back reference? */ 2787 if (sl_str_off > 0) 2788 { 2789 if (BE (bkref_str_off >= mctx->input.valid_len, 0)) 2790 { 2791 /* If we are at the end of the input, we cannot match. */ 2792 if (bkref_str_off >= mctx->input.len) 2793 break; 2794 2795 err = extend_buffers (mctx, bkref_str_off + 1); 2796 if (BE (err != REG_NOERROR, 0)) 2797 return err; 2798 2799 buf = (const char *) re_string_get_buffer (&mctx->input); 2800 } 2801 if (buf [bkref_str_off++] != buf[sl_str - 1]) 2802 break; /* We don't need to search this sub expression 2803 any more. */ 2804 } 2805 if (mctx->state_log[sl_str] == NULL) 2806 continue; 2807 /* Does this state have a ')' of the sub expression? */ 2808 nodes = &mctx->state_log[sl_str]->nodes; 2809 cls_node = find_subexp_node (dfa, nodes, subexp_num, 2810 OP_CLOSE_SUBEXP); 2811 if (cls_node == -1) 2812 continue; /* No. */ 2813 if (sub_top->path == NULL) 2814 { 2815 sub_top->path = calloc (sizeof (state_array_t), 2816 sl_str - sub_top->str_idx + 1); 2817 if (sub_top->path == NULL) 2818 return REG_ESPACE; 2819 } 2820 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node 2821 in the current context? */ 2822 err = check_arrival (mctx, sub_top->path, sub_top->node, 2823 sub_top->str_idx, cls_node, sl_str, 2824 OP_CLOSE_SUBEXP); 2825 if (err == REG_NOMATCH) 2826 continue; 2827 if (BE (err != REG_NOERROR, 0)) 2828 return err; 2829 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str); 2830 if (BE (sub_last == NULL, 0)) 2831 return REG_ESPACE; 2832 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node, 2833 bkref_str_idx); 2834 if (err == REG_NOMATCH) 2835 continue; 2836 } 2837 } 2838 return REG_NOERROR; 2839} 2840 2841/* Helper functions for get_subexp(). */ 2842 2843/* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR. 2844 If it can arrive, register the sub expression expressed with SUB_TOP 2845 and SUB_LAST. */ 2846 2847static reg_errcode_t 2848internal_function 2849get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top, 2850 re_sub_match_last_t *sub_last, int bkref_node, int bkref_str) 2851{ 2852 reg_errcode_t err; 2853 int to_idx; 2854 /* Can the subexpression arrive the back reference? */ 2855 err = check_arrival (mctx, &sub_last->path, sub_last->node, 2856 sub_last->str_idx, bkref_node, bkref_str, 2857 OP_OPEN_SUBEXP); 2858 if (err != REG_NOERROR) 2859 return err; 2860 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx, 2861 sub_last->str_idx); 2862 if (BE (err != REG_NOERROR, 0)) 2863 return err; 2864 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx; 2865 return clean_state_log_if_needed (mctx, to_idx); 2866} 2867 2868/* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX. 2869 Search '(' if FL_OPEN, or search ')' otherwise. 2870 TODO: This function isn't efficient... 2871 Because there might be more than one nodes whose types are 2872 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all 2873 nodes. 2874 E.g. RE: (a){2} */ 2875 2876static int 2877internal_function 2878find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes, 2879 int subexp_idx, int type) 2880{ 2881 int cls_idx; 2882 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx) 2883 { 2884 int cls_node = nodes->elems[cls_idx]; 2885 const re_token_t *node = dfa->nodes + cls_node; 2886 if (node->type == type 2887 && node->opr.idx == subexp_idx) 2888 return cls_node; 2889 } 2890 return -1; 2891} 2892 2893/* Check whether the node TOP_NODE at TOP_STR can arrive to the node 2894 LAST_NODE at LAST_STR. We record the path onto PATH since it will be 2895 heavily reused. 2896 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */ 2897 2898static reg_errcode_t 2899internal_function __attribute_warn_unused_result__ 2900check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node, 2901 int top_str, int last_node, int last_str, int type) 2902{ 2903 const re_dfa_t *const dfa = mctx->dfa; 2904 reg_errcode_t err = REG_NOERROR; 2905 int subexp_num, backup_cur_idx, str_idx, null_cnt; 2906 re_dfastate_t *cur_state = NULL; 2907 re_node_set *cur_nodes, next_nodes; 2908 re_dfastate_t **backup_state_log; 2909 unsigned int context; 2910 2911 subexp_num = dfa->nodes[top_node].opr.idx; 2912 /* Extend the buffer if we need. */ 2913 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0)) 2914 { 2915 re_dfastate_t **new_array; 2916 int old_alloc = path->alloc; 2917 path->alloc += last_str + mctx->max_mb_elem_len + 1; 2918 new_array = re_realloc (path->array, re_dfastate_t *, path->alloc); 2919 if (BE (new_array == NULL, 0)) 2920 { 2921 path->alloc = old_alloc; 2922 return REG_ESPACE; 2923 } 2924 path->array = new_array; 2925 memset (new_array + old_alloc, '\0', 2926 sizeof (re_dfastate_t *) * (path->alloc - old_alloc)); 2927 } 2928 2929 str_idx = path->next_idx ?: top_str; 2930 2931 /* Temporary modify MCTX. */ 2932 backup_state_log = mctx->state_log; 2933 backup_cur_idx = mctx->input.cur_idx; 2934 mctx->state_log = path->array; 2935 mctx->input.cur_idx = str_idx; 2936 2937 /* Setup initial node set. */ 2938 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags); 2939 if (str_idx == top_str) 2940 { 2941 err = re_node_set_init_1 (&next_nodes, top_node); 2942 if (BE (err != REG_NOERROR, 0)) 2943 return err; 2944 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type); 2945 if (BE (err != REG_NOERROR, 0)) 2946 { 2947 re_node_set_free (&next_nodes); 2948 return err; 2949 } 2950 } 2951 else 2952 { 2953 cur_state = mctx->state_log[str_idx]; 2954 if (cur_state && cur_state->has_backref) 2955 { 2956 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes); 2957 if (BE (err != REG_NOERROR, 0)) 2958 return err; 2959 } 2960 else 2961 re_node_set_init_empty (&next_nodes); 2962 } 2963 if (str_idx == top_str || (cur_state && cur_state->has_backref)) 2964 { 2965 if (next_nodes.nelem) 2966 { 2967 err = expand_bkref_cache (mctx, &next_nodes, str_idx, 2968 subexp_num, type); 2969 if (BE (err != REG_NOERROR, 0)) 2970 { 2971 re_node_set_free (&next_nodes); 2972 return err; 2973 } 2974 } 2975 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context); 2976 if (BE (cur_state == NULL && err != REG_NOERROR, 0)) 2977 { 2978 re_node_set_free (&next_nodes); 2979 return err; 2980 } 2981 mctx->state_log[str_idx] = cur_state; 2982 } 2983 2984 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;) 2985 { 2986 re_node_set_empty (&next_nodes); 2987 if (mctx->state_log[str_idx + 1]) 2988 { 2989 err = re_node_set_merge (&next_nodes, 2990 &mctx->state_log[str_idx + 1]->nodes); 2991 if (BE (err != REG_NOERROR, 0)) 2992 { 2993 re_node_set_free (&next_nodes); 2994 return err; 2995 } 2996 } 2997 if (cur_state) 2998 { 2999 err = check_arrival_add_next_nodes (mctx, str_idx, 3000 &cur_state->non_eps_nodes, 3001 &next_nodes); 3002 if (BE (err != REG_NOERROR, 0)) 3003 { 3004 re_node_set_free (&next_nodes); 3005 return err; 3006 } 3007 } 3008 ++str_idx; 3009 if (next_nodes.nelem) 3010 { 3011 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type); 3012 if (BE (err != REG_NOERROR, 0)) 3013 { 3014 re_node_set_free (&next_nodes); 3015 return err; 3016 } 3017 err = expand_bkref_cache (mctx, &next_nodes, str_idx, 3018 subexp_num, type); 3019 if (BE (err != REG_NOERROR, 0)) 3020 { 3021 re_node_set_free (&next_nodes); 3022 return err; 3023 } 3024 } 3025 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags); 3026 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context); 3027 if (BE (cur_state == NULL && err != REG_NOERROR, 0)) 3028 { 3029 re_node_set_free (&next_nodes); 3030 return err; 3031 } 3032 mctx->state_log[str_idx] = cur_state; 3033 null_cnt = cur_state == NULL ? null_cnt + 1 : 0; 3034 } 3035 re_node_set_free (&next_nodes); 3036 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL 3037 : &mctx->state_log[last_str]->nodes); 3038 path->next_idx = str_idx; 3039 3040 /* Fix MCTX. */ 3041 mctx->state_log = backup_state_log; 3042 mctx->input.cur_idx = backup_cur_idx; 3043 3044 /* Then check the current node set has the node LAST_NODE. */ 3045 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node)) 3046 return REG_NOERROR; 3047 3048 return REG_NOMATCH; 3049} 3050 3051/* Helper functions for check_arrival. */ 3052 3053/* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them 3054 to NEXT_NODES. 3055 TODO: This function is similar to the functions transit_state*(), 3056 however this function has many additional works. 3057 Can't we unify them? */ 3058 3059static reg_errcode_t 3060internal_function __attribute_warn_unused_result__ 3061check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx, 3062 re_node_set *cur_nodes, re_node_set *next_nodes) 3063{ 3064 const re_dfa_t *const dfa = mctx->dfa; 3065 int result; 3066 int cur_idx; 3067 reg_errcode_t err = REG_NOERROR; 3068 re_node_set union_set; 3069 re_node_set_init_empty (&union_set); 3070 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx) 3071 { 3072 int naccepted = 0; 3073 int cur_node = cur_nodes->elems[cur_idx]; 3074#ifdef DEBUG 3075 re_token_type_t type = dfa->nodes[cur_node].type; 3076 assert (!IS_EPSILON_NODE (type)); 3077#endif 3078#ifdef RE_ENABLE_I18N 3079 /* If the node may accept `multi byte'. */ 3080 if (dfa->nodes[cur_node].accept_mb) 3081 { 3082 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input, 3083 str_idx); 3084 if (naccepted > 1) 3085 { 3086 re_dfastate_t *dest_state; 3087 int next_node = dfa->nexts[cur_node]; 3088 int next_idx = str_idx + naccepted; 3089 dest_state = mctx->state_log[next_idx]; 3090 re_node_set_empty (&union_set); 3091 if (dest_state) 3092 { 3093 err = re_node_set_merge (&union_set, &dest_state->nodes); 3094 if (BE (err != REG_NOERROR, 0)) 3095 { 3096 re_node_set_free (&union_set); 3097 return err; 3098 } 3099 } 3100 result = re_node_set_insert (&union_set, next_node); 3101 if (BE (result < 0, 0)) 3102 { 3103 re_node_set_free (&union_set); 3104 return REG_ESPACE; 3105 } 3106 mctx->state_log[next_idx] = re_acquire_state (&err, dfa, 3107 &union_set); 3108 if (BE (mctx->state_log[next_idx] == NULL 3109 && err != REG_NOERROR, 0)) 3110 { 3111 re_node_set_free (&union_set); 3112 return err; 3113 } 3114 } 3115 } 3116#endif /* RE_ENABLE_I18N */ 3117 if (naccepted 3118 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx)) 3119 { 3120 result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]); 3121 if (BE (result < 0, 0)) 3122 { 3123 re_node_set_free (&union_set); 3124 return REG_ESPACE; 3125 } 3126 } 3127 } 3128 re_node_set_free (&union_set); 3129 return REG_NOERROR; 3130} 3131 3132/* For all the nodes in CUR_NODES, add the epsilon closures of them to 3133 CUR_NODES, however exclude the nodes which are: 3134 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN. 3135 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN. 3136*/ 3137 3138static reg_errcode_t 3139internal_function 3140check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes, 3141 int ex_subexp, int type) 3142{ 3143 reg_errcode_t err; 3144 int idx, outside_node; 3145 re_node_set new_nodes; 3146#ifdef DEBUG 3147 assert (cur_nodes->nelem); 3148#endif 3149 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem); 3150 if (BE (err != REG_NOERROR, 0)) 3151 return err; 3152 /* Create a new node set NEW_NODES with the nodes which are epsilon 3153 closures of the node in CUR_NODES. */ 3154 3155 for (idx = 0; idx < cur_nodes->nelem; ++idx) 3156 { 3157 int cur_node = cur_nodes->elems[idx]; 3158 const re_node_set *eclosure = dfa->eclosures + cur_node; 3159 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type); 3160 if (outside_node == -1) 3161 { 3162 /* There are no problematic nodes, just merge them. */ 3163 err = re_node_set_merge (&new_nodes, eclosure); 3164 if (BE (err != REG_NOERROR, 0)) 3165 { 3166 re_node_set_free (&new_nodes); 3167 return err; 3168 } 3169 } 3170 else 3171 { 3172 /* There are problematic nodes, re-calculate incrementally. */ 3173 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node, 3174 ex_subexp, type); 3175 if (BE (err != REG_NOERROR, 0)) 3176 { 3177 re_node_set_free (&new_nodes); 3178 return err; 3179 } 3180 } 3181 } 3182 re_node_set_free (cur_nodes); 3183 *cur_nodes = new_nodes; 3184 return REG_NOERROR; 3185} 3186 3187/* Helper function for check_arrival_expand_ecl. 3188 Check incrementally the epsilon closure of TARGET, and if it isn't 3189 problematic append it to DST_NODES. */ 3190 3191static reg_errcode_t 3192internal_function __attribute_warn_unused_result__ 3193check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes, 3194 int target, int ex_subexp, int type) 3195{ 3196 int cur_node; 3197 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);) 3198 { 3199 int err; 3200 3201 if (dfa->nodes[cur_node].type == type 3202 && dfa->nodes[cur_node].opr.idx == ex_subexp) 3203 { 3204 if (type == OP_CLOSE_SUBEXP) 3205 { 3206 err = re_node_set_insert (dst_nodes, cur_node); 3207 if (BE (err == -1, 0)) 3208 return REG_ESPACE; 3209 } 3210 break; 3211 } 3212 err = re_node_set_insert (dst_nodes, cur_node); 3213 if (BE (err == -1, 0)) 3214 return REG_ESPACE; 3215 if (dfa->edests[cur_node].nelem == 0) 3216 break; 3217 if (dfa->edests[cur_node].nelem == 2) 3218 { 3219 err = check_arrival_expand_ecl_sub (dfa, dst_nodes, 3220 dfa->edests[cur_node].elems[1], 3221 ex_subexp, type); 3222 if (BE (err != REG_NOERROR, 0)) 3223 return err; 3224 } 3225 cur_node = dfa->edests[cur_node].elems[0]; 3226 } 3227 return REG_NOERROR; 3228} 3229 3230 3231/* For all the back references in the current state, calculate the 3232 destination of the back references by the appropriate entry 3233 in MCTX->BKREF_ENTS. */ 3234 3235static reg_errcode_t 3236internal_function __attribute_warn_unused_result__ 3237expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes, 3238 int cur_str, int subexp_num, int type) 3239{ 3240 const re_dfa_t *const dfa = mctx->dfa; 3241 reg_errcode_t err; 3242 int cache_idx_start = search_cur_bkref_entry (mctx, cur_str); 3243 struct re_backref_cache_entry *ent; 3244 3245 if (cache_idx_start == -1) 3246 return REG_NOERROR; 3247 3248 restart: 3249 ent = mctx->bkref_ents + cache_idx_start; 3250 do 3251 { 3252 int to_idx, next_node; 3253 3254 /* Is this entry ENT is appropriate? */ 3255 if (!re_node_set_contains (cur_nodes, ent->node)) 3256 continue; /* No. */ 3257 3258 to_idx = cur_str + ent->subexp_to - ent->subexp_from; 3259 /* Calculate the destination of the back reference, and append it 3260 to MCTX->STATE_LOG. */ 3261 if (to_idx == cur_str) 3262 { 3263 /* The backreference did epsilon transit, we must re-check all the 3264 node in the current state. */ 3265 re_node_set new_dests; 3266 reg_errcode_t err2, err3; 3267 next_node = dfa->edests[ent->node].elems[0]; 3268 if (re_node_set_contains (cur_nodes, next_node)) 3269 continue; 3270 err = re_node_set_init_1 (&new_dests, next_node); 3271 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type); 3272 err3 = re_node_set_merge (cur_nodes, &new_dests); 3273 re_node_set_free (&new_dests); 3274 if (BE (err != REG_NOERROR || err2 != REG_NOERROR 3275 || err3 != REG_NOERROR, 0)) 3276 { 3277 err = (err != REG_NOERROR ? err 3278 : (err2 != REG_NOERROR ? err2 : err3)); 3279 return err; 3280 } 3281 /* TODO: It is still inefficient... */ 3282 goto restart; 3283 } 3284 else 3285 { 3286 re_node_set union_set; 3287 next_node = dfa->nexts[ent->node]; 3288 if (mctx->state_log[to_idx]) 3289 { 3290 int ret; 3291 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes, 3292 next_node)) 3293 continue; 3294 err = re_node_set_init_copy (&union_set, 3295 &mctx->state_log[to_idx]->nodes); 3296 ret = re_node_set_insert (&union_set, next_node); 3297 if (BE (err != REG_NOERROR || ret < 0, 0)) 3298 { 3299 re_node_set_free (&union_set); 3300 err = err != REG_NOERROR ? err : REG_ESPACE; 3301 return err; 3302 } 3303 } 3304 else 3305 { 3306 err = re_node_set_init_1 (&union_set, next_node); 3307 if (BE (err != REG_NOERROR, 0)) 3308 return err; 3309 } 3310 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set); 3311 re_node_set_free (&union_set); 3312 if (BE (mctx->state_log[to_idx] == NULL 3313 && err != REG_NOERROR, 0)) 3314 return err; 3315 } 3316 } 3317 while (ent++->more); 3318 return REG_NOERROR; 3319} 3320 3321/* Build transition table for the state. 3322 Return 1 if succeeded, otherwise return NULL. */ 3323 3324static int 3325internal_function 3326build_trtable (const re_dfa_t *dfa, re_dfastate_t *state) 3327{ 3328 reg_errcode_t err; 3329 int i, j, ch, need_word_trtable = 0; 3330 bitset_word_t elem, mask; 3331 bool dests_node_malloced = false; 3332 bool dest_states_malloced = false; 3333 int ndests; /* Number of the destination states from `state'. */ 3334 re_dfastate_t **trtable; 3335 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl; 3336 re_node_set follows, *dests_node; 3337 bitset_t *dests_ch; 3338 bitset_t acceptable; 3339 3340 struct dests_alloc 3341 { 3342 re_node_set dests_node[SBC_MAX]; 3343 bitset_t dests_ch[SBC_MAX]; 3344 } *dests_alloc; 3345 3346 /* We build DFA states which corresponds to the destination nodes 3347 from `state'. `dests_node[i]' represents the nodes which i-th 3348 destination state contains, and `dests_ch[i]' represents the 3349 characters which i-th destination state accepts. */ 3350 if (__libc_use_alloca (sizeof (struct dests_alloc))) 3351 dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc)); 3352 else 3353 { 3354 dests_alloc = re_malloc (struct dests_alloc, 1); 3355 if (BE (dests_alloc == NULL, 0)) 3356 return 0; 3357 dests_node_malloced = true; 3358 } 3359 dests_node = dests_alloc->dests_node; 3360 dests_ch = dests_alloc->dests_ch; 3361 3362 /* Initialize transiton table. */ 3363 state->word_trtable = state->trtable = NULL; 3364 3365 /* At first, group all nodes belonging to `state' into several 3366 destinations. */ 3367 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch); 3368 if (BE (ndests <= 0, 0)) 3369 { 3370 if (dests_node_malloced) 3371 free (dests_alloc); 3372 /* Return 0 in case of an error, 1 otherwise. */ 3373 if (ndests == 0) 3374 { 3375 state->trtable = (re_dfastate_t **) 3376 calloc (sizeof (re_dfastate_t *), SBC_MAX); 3377 if (BE (state->trtable == NULL, 0)) 3378 return 0; 3379 return 1; 3380 } 3381 return 0; 3382 } 3383 3384 err = re_node_set_alloc (&follows, ndests + 1); 3385 if (BE (err != REG_NOERROR, 0)) 3386 goto out_free; 3387 3388 /* Avoid arithmetic overflow in size calculation. */ 3389 if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX) 3390 / (3 * sizeof (re_dfastate_t *))) 3391 < ndests), 3392 0)) 3393 goto out_free; 3394 3395 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX 3396 + ndests * 3 * sizeof (re_dfastate_t *))) 3397 dest_states = (re_dfastate_t **) 3398 alloca (ndests * 3 * sizeof (re_dfastate_t *)); 3399 else 3400 { 3401 dest_states = (re_dfastate_t **) 3402 malloc (ndests * 3 * sizeof (re_dfastate_t *)); 3403 if (BE (dest_states == NULL, 0)) 3404 { 3405out_free: 3406 if (dest_states_malloced) 3407 free (dest_states); 3408 re_node_set_free (&follows); 3409 for (i = 0; i < ndests; ++i) 3410 re_node_set_free (dests_node + i); 3411 if (dests_node_malloced) 3412 free (dests_alloc); 3413 return 0; 3414 } 3415 dest_states_malloced = true; 3416 } 3417 dest_states_word = dest_states + ndests; 3418 dest_states_nl = dest_states_word + ndests; 3419 bitset_empty (acceptable); 3420 3421 /* Then build the states for all destinations. */ 3422 for (i = 0; i < ndests; ++i) 3423 { 3424 int next_node; 3425 re_node_set_empty (&follows); 3426 /* Merge the follows of this destination states. */ 3427 for (j = 0; j < dests_node[i].nelem; ++j) 3428 { 3429 next_node = dfa->nexts[dests_node[i].elems[j]]; 3430 if (next_node != -1) 3431 { 3432 err = re_node_set_merge (&follows, dfa->eclosures + next_node); 3433 if (BE (err != REG_NOERROR, 0)) 3434 goto out_free; 3435 } 3436 } 3437 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0); 3438 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0)) 3439 goto out_free; 3440 /* If the new state has context constraint, 3441 build appropriate states for these contexts. */ 3442 if (dest_states[i]->has_constraint) 3443 { 3444 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows, 3445 CONTEXT_WORD); 3446 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0)) 3447 goto out_free; 3448 3449 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1) 3450 need_word_trtable = 1; 3451 3452 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows, 3453 CONTEXT_NEWLINE); 3454 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0)) 3455 goto out_free; 3456 } 3457 else 3458 { 3459 dest_states_word[i] = dest_states[i]; 3460 dest_states_nl[i] = dest_states[i]; 3461 } 3462 bitset_merge (acceptable, dests_ch[i]); 3463 } 3464 3465 if (!BE (need_word_trtable, 0)) 3466 { 3467 /* We don't care about whether the following character is a word 3468 character, or we are in a single-byte character set so we can 3469 discern by looking at the character code: allocate a 3470 256-entry transition table. */ 3471 trtable = state->trtable = 3472 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX); 3473 if (BE (trtable == NULL, 0)) 3474 goto out_free; 3475 3476 /* For all characters ch...: */ 3477 for (i = 0; i < BITSET_WORDS; ++i) 3478 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1; 3479 elem; 3480 mask <<= 1, elem >>= 1, ++ch) 3481 if (BE (elem & 1, 0)) 3482 { 3483 /* There must be exactly one destination which accepts 3484 character ch. See group_nodes_into_DFAstates. */ 3485 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j) 3486 ; 3487 3488 /* j-th destination accepts the word character ch. */ 3489 if (dfa->word_char[i] & mask) 3490 trtable[ch] = dest_states_word[j]; 3491 else 3492 trtable[ch] = dest_states[j]; 3493 } 3494 } 3495 else 3496 { 3497 /* We care about whether the following character is a word 3498 character, and we are in a multi-byte character set: discern 3499 by looking at the character code: build two 256-entry 3500 transition tables, one starting at trtable[0] and one 3501 starting at trtable[SBC_MAX]. */ 3502 trtable = state->word_trtable = 3503 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX); 3504 if (BE (trtable == NULL, 0)) 3505 goto out_free; 3506 3507 /* For all characters ch...: */ 3508 for (i = 0; i < BITSET_WORDS; ++i) 3509 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1; 3510 elem; 3511 mask <<= 1, elem >>= 1, ++ch) 3512 if (BE (elem & 1, 0)) 3513 { 3514 /* There must be exactly one destination which accepts 3515 character ch. See group_nodes_into_DFAstates. */ 3516 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j) 3517 ; 3518 3519 /* j-th destination accepts the word character ch. */ 3520 trtable[ch] = dest_states[j]; 3521 trtable[ch + SBC_MAX] = dest_states_word[j]; 3522 } 3523 } 3524 3525 /* new line */ 3526 if (bitset_contain (acceptable, NEWLINE_CHAR)) 3527 { 3528 /* The current state accepts newline character. */ 3529 for (j = 0; j < ndests; ++j) 3530 if (bitset_contain (dests_ch[j], NEWLINE_CHAR)) 3531 { 3532 /* k-th destination accepts newline character. */ 3533 trtable[NEWLINE_CHAR] = dest_states_nl[j]; 3534 if (need_word_trtable) 3535 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j]; 3536 /* There must be only one destination which accepts 3537 newline. See group_nodes_into_DFAstates. */ 3538 break; 3539 } 3540 } 3541 3542 if (dest_states_malloced) 3543 free (dest_states); 3544 3545 re_node_set_free (&follows); 3546 for (i = 0; i < ndests; ++i) 3547 re_node_set_free (dests_node + i); 3548 3549 if (dests_node_malloced) 3550 free (dests_alloc); 3551 3552 return 1; 3553} 3554 3555/* Group all nodes belonging to STATE into several destinations. 3556 Then for all destinations, set the nodes belonging to the destination 3557 to DESTS_NODE[i] and set the characters accepted by the destination 3558 to DEST_CH[i]. This function return the number of destinations. */ 3559 3560static int 3561internal_function 3562group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state, 3563 re_node_set *dests_node, bitset_t *dests_ch) 3564{ 3565 reg_errcode_t err; 3566 int result; 3567 int i, j, k; 3568 int ndests; /* Number of the destinations from `state'. */ 3569 bitset_t accepts; /* Characters a node can accept. */ 3570 const re_node_set *cur_nodes = &state->nodes; 3571 bitset_empty (accepts); 3572 ndests = 0; 3573 3574 /* For all the nodes belonging to `state', */ 3575 for (i = 0; i < cur_nodes->nelem; ++i) 3576 { 3577 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]]; 3578 re_token_type_t type = node->type; 3579 unsigned int constraint = node->constraint; 3580 3581 /* Enumerate all single byte character this node can accept. */ 3582 if (type == CHARACTER) 3583 bitset_set (accepts, node->opr.c); 3584 else if (type == SIMPLE_BRACKET) 3585 { 3586 bitset_merge (accepts, node->opr.sbcset); 3587 } 3588 else if (type == OP_PERIOD) 3589 { 3590#ifdef RE_ENABLE_I18N 3591 if (dfa->mb_cur_max > 1) 3592 bitset_merge (accepts, dfa->sb_char); 3593 else 3594#endif 3595 bitset_set_all (accepts); 3596 if (!(dfa->syntax & RE_DOT_NEWLINE)) 3597 bitset_clear (accepts, '\n'); 3598 if (dfa->syntax & RE_DOT_NOT_NULL) 3599 bitset_clear (accepts, '\0'); 3600 } 3601#ifdef RE_ENABLE_I18N 3602 else if (type == OP_UTF8_PERIOD) 3603 { 3604 memset (accepts, '\xff', sizeof (bitset_t) / 2); 3605 if (!(dfa->syntax & RE_DOT_NEWLINE)) 3606 bitset_clear (accepts, '\n'); 3607 if (dfa->syntax & RE_DOT_NOT_NULL) 3608 bitset_clear (accepts, '\0'); 3609 } 3610#endif 3611 else 3612 continue; 3613 3614 /* Check the `accepts' and sift the characters which are not 3615 match it the context. */ 3616 if (constraint) 3617 { 3618 if (constraint & NEXT_NEWLINE_CONSTRAINT) 3619 { 3620 bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR); 3621 bitset_empty (accepts); 3622 if (accepts_newline) 3623 bitset_set (accepts, NEWLINE_CHAR); 3624 else 3625 continue; 3626 } 3627 if (constraint & NEXT_ENDBUF_CONSTRAINT) 3628 { 3629 bitset_empty (accepts); 3630 continue; 3631 } 3632 3633 if (constraint & NEXT_WORD_CONSTRAINT) 3634 { 3635 bitset_word_t any_set = 0; 3636 if (type == CHARACTER && !node->word_char) 3637 { 3638 bitset_empty (accepts); 3639 continue; 3640 } 3641#ifdef RE_ENABLE_I18N 3642 if (dfa->mb_cur_max > 1) 3643 for (j = 0; j < BITSET_WORDS; ++j) 3644 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j])); 3645 else 3646#endif 3647 for (j = 0; j < BITSET_WORDS; ++j) 3648 any_set |= (accepts[j] &= dfa->word_char[j]); 3649 if (!any_set) 3650 continue; 3651 } 3652 if (constraint & NEXT_NOTWORD_CONSTRAINT) 3653 { 3654 bitset_word_t any_set = 0; 3655 if (type == CHARACTER && node->word_char) 3656 { 3657 bitset_empty (accepts); 3658 continue; 3659 } 3660#ifdef RE_ENABLE_I18N 3661 if (dfa->mb_cur_max > 1) 3662 for (j = 0; j < BITSET_WORDS; ++j) 3663 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j])); 3664 else 3665#endif 3666 for (j = 0; j < BITSET_WORDS; ++j) 3667 any_set |= (accepts[j] &= ~dfa->word_char[j]); 3668 if (!any_set) 3669 continue; 3670 } 3671 } 3672 3673 /* Then divide `accepts' into DFA states, or create a new 3674 state. Above, we make sure that accepts is not empty. */ 3675 for (j = 0; j < ndests; ++j) 3676 { 3677 bitset_t intersec; /* Intersection sets, see below. */ 3678 bitset_t remains; 3679 /* Flags, see below. */ 3680 bitset_word_t has_intersec, not_subset, not_consumed; 3681 3682 /* Optimization, skip if this state doesn't accept the character. */ 3683 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c)) 3684 continue; 3685 3686 /* Enumerate the intersection set of this state and `accepts'. */ 3687 has_intersec = 0; 3688 for (k = 0; k < BITSET_WORDS; ++k) 3689 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k]; 3690 /* And skip if the intersection set is empty. */ 3691 if (!has_intersec) 3692 continue; 3693 3694 /* Then check if this state is a subset of `accepts'. */ 3695 not_subset = not_consumed = 0; 3696 for (k = 0; k < BITSET_WORDS; ++k) 3697 { 3698 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k]; 3699 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k]; 3700 } 3701 3702 /* If this state isn't a subset of `accepts', create a 3703 new group state, which has the `remains'. */ 3704 if (not_subset) 3705 { 3706 bitset_copy (dests_ch[ndests], remains); 3707 bitset_copy (dests_ch[j], intersec); 3708 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]); 3709 if (BE (err != REG_NOERROR, 0)) 3710 goto error_return; 3711 ++ndests; 3712 } 3713 3714 /* Put the position in the current group. */ 3715 result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]); 3716 if (BE (result < 0, 0)) 3717 goto error_return; 3718 3719 /* If all characters are consumed, go to next node. */ 3720 if (!not_consumed) 3721 break; 3722 } 3723 /* Some characters remain, create a new group. */ 3724 if (j == ndests) 3725 { 3726 bitset_copy (dests_ch[ndests], accepts); 3727 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]); 3728 if (BE (err != REG_NOERROR, 0)) 3729 goto error_return; 3730 ++ndests; 3731 bitset_empty (accepts); 3732 } 3733 } 3734 return ndests; 3735 error_return: 3736 for (j = 0; j < ndests; ++j) 3737 re_node_set_free (dests_node + j); 3738 return -1; 3739} 3740 3741#ifdef RE_ENABLE_I18N 3742/* Check how many bytes the node `dfa->nodes[node_idx]' accepts. 3743 Return the number of the bytes the node accepts. 3744 STR_IDX is the current index of the input string. 3745 3746 This function handles the nodes which can accept one character, or 3747 one collating element like '.', '[a-z]', opposite to the other nodes 3748 can only accept one byte. */ 3749 3750static int 3751internal_function 3752check_node_accept_bytes (const re_dfa_t *dfa, int node_idx, 3753 const re_string_t *input, int str_idx) 3754{ 3755 const re_token_t *node = dfa->nodes + node_idx; 3756 int char_len, elem_len; 3757 int i; 3758 3759 if (BE (node->type == OP_UTF8_PERIOD, 0)) 3760 { 3761 unsigned char c = re_string_byte_at (input, str_idx), d; 3762 if (BE (c < 0xc2, 1)) 3763 return 0; 3764 3765 if (str_idx + 2 > input->len) 3766 return 0; 3767 3768 d = re_string_byte_at (input, str_idx + 1); 3769 if (c < 0xe0) 3770 return (d < 0x80 || d > 0xbf) ? 0 : 2; 3771 else if (c < 0xf0) 3772 { 3773 char_len = 3; 3774 if (c == 0xe0 && d < 0xa0) 3775 return 0; 3776 } 3777 else if (c < 0xf8) 3778 { 3779 char_len = 4; 3780 if (c == 0xf0 && d < 0x90) 3781 return 0; 3782 } 3783 else if (c < 0xfc) 3784 { 3785 char_len = 5; 3786 if (c == 0xf8 && d < 0x88) 3787 return 0; 3788 } 3789 else if (c < 0xfe) 3790 { 3791 char_len = 6; 3792 if (c == 0xfc && d < 0x84) 3793 return 0; 3794 } 3795 else 3796 return 0; 3797 3798 if (str_idx + char_len > input->len) 3799 return 0; 3800 3801 for (i = 1; i < char_len; ++i) 3802 { 3803 d = re_string_byte_at (input, str_idx + i); 3804 if (d < 0x80 || d > 0xbf) 3805 return 0; 3806 } 3807 return char_len; 3808 } 3809 3810 char_len = re_string_char_size_at (input, str_idx); 3811 if (node->type == OP_PERIOD) 3812 { 3813 if (char_len <= 1) 3814 return 0; 3815 /* FIXME: I don't think this if is needed, as both '\n' 3816 and '\0' are char_len == 1. */ 3817 /* '.' accepts any one character except the following two cases. */ 3818 if ((!(dfa->syntax & RE_DOT_NEWLINE) && 3819 re_string_byte_at (input, str_idx) == '\n') || 3820 ((dfa->syntax & RE_DOT_NOT_NULL) && 3821 re_string_byte_at (input, str_idx) == '\0')) 3822 return 0; 3823 return char_len; 3824 } 3825 3826 elem_len = re_string_elem_size_at (input, str_idx); 3827 if ((elem_len <= 1 && char_len <= 1) || char_len == 0) 3828 return 0; 3829 3830 if (node->type == COMPLEX_BRACKET) 3831 { 3832 const re_charset_t *cset = node->opr.mbcset; 3833# ifdef _LIBC 3834 const unsigned char *pin 3835 = ((const unsigned char *) re_string_get_buffer (input) + str_idx); 3836 int j; 3837 uint32_t nrules; 3838# endif /* _LIBC */ 3839 int match_len = 0; 3840 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars) 3841 ? re_string_wchar_at (input, str_idx) : 0); 3842 3843 /* match with multibyte character? */ 3844 for (i = 0; i < cset->nmbchars; ++i) 3845 if (wc == cset->mbchars[i]) 3846 { 3847 match_len = char_len; 3848 goto check_node_accept_bytes_match; 3849 } 3850 /* match with character_class? */ 3851 for (i = 0; i < cset->nchar_classes; ++i) 3852 { 3853 wctype_t wt = cset->char_classes[i]; 3854 if (__iswctype (wc, wt)) 3855 { 3856 match_len = char_len; 3857 goto check_node_accept_bytes_match; 3858 } 3859 } 3860 3861# ifdef _LIBC 3862 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3863 if (nrules != 0) 3864 { 3865 unsigned int in_collseq = 0; 3866 const int32_t *table, *indirect; 3867 const unsigned char *weights, *extra; 3868 const char *collseqwc; 3869 /* This #include defines a local function! */ 3870# include <locale/weight.h> 3871 3872 /* match with collating_symbol? */ 3873 if (cset->ncoll_syms) 3874 extra = (const unsigned char *) 3875 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB); 3876 for (i = 0; i < cset->ncoll_syms; ++i) 3877 { 3878 const unsigned char *coll_sym = extra + cset->coll_syms[i]; 3879 /* Compare the length of input collating element and 3880 the length of current collating element. */ 3881 if (*coll_sym != elem_len) 3882 continue; 3883 /* Compare each bytes. */ 3884 for (j = 0; j < *coll_sym; j++) 3885 if (pin[j] != coll_sym[1 + j]) 3886 break; 3887 if (j == *coll_sym) 3888 { 3889 /* Match if every bytes is equal. */ 3890 match_len = j; 3891 goto check_node_accept_bytes_match; 3892 } 3893 } 3894 3895 if (cset->nranges) 3896 { 3897 if (elem_len <= char_len) 3898 { 3899 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC); 3900 in_collseq = __collseq_table_lookup (collseqwc, wc); 3901 } 3902 else 3903 in_collseq = find_collation_sequence_value (pin, elem_len); 3904 } 3905 /* match with range expression? */ 3906 for (i = 0; i < cset->nranges; ++i) 3907 if (cset->range_starts[i] <= in_collseq 3908 && in_collseq <= cset->range_ends[i]) 3909 { 3910 match_len = elem_len; 3911 goto check_node_accept_bytes_match; 3912 } 3913 3914 /* match with equivalence_class? */ 3915 if (cset->nequiv_classes) 3916 { 3917 const unsigned char *cp = pin; 3918 table = (const int32_t *) 3919 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); 3920 weights = (const unsigned char *) 3921 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB); 3922 extra = (const unsigned char *) 3923 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB); 3924 indirect = (const int32_t *) 3925 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB); 3926 int32_t idx = findidx (&cp, elem_len); 3927 if (idx > 0) 3928 for (i = 0; i < cset->nequiv_classes; ++i) 3929 { 3930 int32_t equiv_class_idx = cset->equiv_classes[i]; 3931 size_t weight_len = weights[idx & 0xffffff]; 3932 if (weight_len == weights[equiv_class_idx & 0xffffff] 3933 && (idx >> 24) == (equiv_class_idx >> 24)) 3934 { 3935 int cnt = 0; 3936 3937 idx &= 0xffffff; 3938 equiv_class_idx &= 0xffffff; 3939 3940 while (cnt <= weight_len 3941 && (weights[equiv_class_idx + 1 + cnt] 3942 == weights[idx + 1 + cnt])) 3943 ++cnt; 3944 if (cnt > weight_len) 3945 { 3946 match_len = elem_len; 3947 goto check_node_accept_bytes_match; 3948 } 3949 } 3950 } 3951 } 3952 } 3953 else 3954# endif /* _LIBC */ 3955 { 3956 /* match with range expression? */ 3957#if __GNUC__ >= 2 3958 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'}; 3959#else 3960 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'}; 3961 cmp_buf[2] = wc; 3962#endif 3963 for (i = 0; i < cset->nranges; ++i) 3964 { 3965 cmp_buf[0] = cset->range_starts[i]; 3966 cmp_buf[4] = cset->range_ends[i]; 3967 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0 3968 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0) 3969 { 3970 match_len = char_len; 3971 goto check_node_accept_bytes_match; 3972 } 3973 } 3974 } 3975 check_node_accept_bytes_match: 3976 if (!cset->non_match) 3977 return match_len; 3978 else 3979 { 3980 if (match_len > 0) 3981 return 0; 3982 else 3983 return (elem_len > char_len) ? elem_len : char_len; 3984 } 3985 } 3986 return 0; 3987} 3988 3989# ifdef _LIBC 3990static unsigned int 3991internal_function 3992find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len) 3993{ 3994 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3995 if (nrules == 0) 3996 { 3997 if (mbs_len == 1) 3998 { 3999 /* No valid character. Match it as a single byte character. */ 4000 const unsigned char *collseq = (const unsigned char *) 4001 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB); 4002 return collseq[mbs[0]]; 4003 } 4004 return UINT_MAX; 4005 } 4006 else 4007 { 4008 int32_t idx; 4009 const unsigned char *extra = (const unsigned char *) 4010 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB); 4011 int32_t extrasize = (const unsigned char *) 4012 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra; 4013 4014 for (idx = 0; idx < extrasize;) 4015 { 4016 int mbs_cnt, found = 0; 4017 int32_t elem_mbs_len; 4018 /* Skip the name of collating element name. */ 4019 idx = idx + extra[idx] + 1; 4020 elem_mbs_len = extra[idx++]; 4021 if (mbs_len == elem_mbs_len) 4022 { 4023 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt) 4024 if (extra[idx + mbs_cnt] != mbs[mbs_cnt]) 4025 break; 4026 if (mbs_cnt == elem_mbs_len) 4027 /* Found the entry. */ 4028 found = 1; 4029 } 4030 /* Skip the byte sequence of the collating element. */ 4031 idx += elem_mbs_len; 4032 /* Adjust for the alignment. */ 4033 idx = (idx + 3) & ~3; 4034 /* Skip the collation sequence value. */ 4035 idx += sizeof (uint32_t); 4036 /* Skip the wide char sequence of the collating element. */ 4037 idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1); 4038 /* If we found the entry, return the sequence value. */ 4039 if (found) 4040 return *(uint32_t *) (extra + idx); 4041 /* Skip the collation sequence value. */ 4042 idx += sizeof (uint32_t); 4043 } 4044 return UINT_MAX; 4045 } 4046} 4047# endif /* _LIBC */ 4048#endif /* RE_ENABLE_I18N */ 4049 4050/* Check whether the node accepts the byte which is IDX-th 4051 byte of the INPUT. */ 4052 4053static int 4054internal_function 4055check_node_accept (const re_match_context_t *mctx, const re_token_t *node, 4056 int idx) 4057{ 4058 unsigned char ch; 4059 ch = re_string_byte_at (&mctx->input, idx); 4060 switch (node->type) 4061 { 4062 case CHARACTER: 4063 if (node->opr.c != ch) 4064 return 0; 4065 break; 4066 4067 case SIMPLE_BRACKET: 4068 if (!bitset_contain (node->opr.sbcset, ch)) 4069 return 0; 4070 break; 4071 4072#ifdef RE_ENABLE_I18N 4073 case OP_UTF8_PERIOD: 4074 if (ch >= 0x80) 4075 return 0; 4076 /* FALLTHROUGH */ 4077#endif 4078 case OP_PERIOD: 4079 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE)) 4080 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL))) 4081 return 0; 4082 break; 4083 4084 default: 4085 return 0; 4086 } 4087 4088 if (node->constraint) 4089 { 4090 /* The node has constraints. Check whether the current context 4091 satisfies the constraints. */ 4092 unsigned int context = re_string_context_at (&mctx->input, idx, 4093 mctx->eflags); 4094 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context)) 4095 return 0; 4096 } 4097 4098 return 1; 4099} 4100 4101/* Extend the buffers, if the buffers have run out. */ 4102 4103static reg_errcode_t 4104internal_function __attribute_warn_unused_result__ 4105extend_buffers (re_match_context_t *mctx, int min_len) 4106{ 4107 reg_errcode_t ret; 4108 re_string_t *pstr = &mctx->input; 4109 4110 /* Avoid overflow. */ 4111 if (BE (INT_MAX / 2 / sizeof (re_dfastate_t *) <= pstr->bufs_len, 0)) 4112 return REG_ESPACE; 4113 4114 /* Double the lengthes of the buffers, but allocate at least MIN_LEN. */ 4115 ret = re_string_realloc_buffers (pstr, 4116 MAX (min_len, 4117 MIN (pstr->len, pstr->bufs_len * 2))); 4118 if (BE (ret != REG_NOERROR, 0)) 4119 return ret; 4120 4121 if (mctx->state_log != NULL) 4122 { 4123 /* And double the length of state_log. */ 4124 /* XXX We have no indication of the size of this buffer. If this 4125 allocation fail we have no indication that the state_log array 4126 does not have the right size. */ 4127 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *, 4128 pstr->bufs_len + 1); 4129 if (BE (new_array == NULL, 0)) 4130 return REG_ESPACE; 4131 mctx->state_log = new_array; 4132 } 4133 4134 /* Then reconstruct the buffers. */ 4135 if (pstr->icase) 4136 { 4137#ifdef RE_ENABLE_I18N 4138 if (pstr->mb_cur_max > 1) 4139 { 4140 ret = build_wcs_upper_buffer (pstr); 4141 if (BE (ret != REG_NOERROR, 0)) 4142 return ret; 4143 } 4144 else 4145#endif /* RE_ENABLE_I18N */ 4146 build_upper_buffer (pstr); 4147 } 4148 else 4149 { 4150#ifdef RE_ENABLE_I18N 4151 if (pstr->mb_cur_max > 1) 4152 build_wcs_buffer (pstr); 4153 else 4154#endif /* RE_ENABLE_I18N */ 4155 { 4156 if (pstr->trans != NULL) 4157 re_string_translate_buffer (pstr); 4158 } 4159 } 4160 return REG_NOERROR; 4161} 4162 4163 4164/* Functions for matching context. */ 4165 4166/* Initialize MCTX. */ 4167 4168static reg_errcode_t 4169internal_function __attribute_warn_unused_result__ 4170match_ctx_init (re_match_context_t *mctx, int eflags, int n) 4171{ 4172 mctx->eflags = eflags; 4173 mctx->match_last = -1; 4174 if (n > 0) 4175 { 4176 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n); 4177 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n); 4178 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0)) 4179 return REG_ESPACE; 4180 } 4181 /* Already zero-ed by the caller. 4182 else 4183 mctx->bkref_ents = NULL; 4184 mctx->nbkref_ents = 0; 4185 mctx->nsub_tops = 0; */ 4186 mctx->abkref_ents = n; 4187 mctx->max_mb_elem_len = 1; 4188 mctx->asub_tops = n; 4189 return REG_NOERROR; 4190} 4191 4192/* Clean the entries which depend on the current input in MCTX. 4193 This function must be invoked when the matcher changes the start index 4194 of the input, or changes the input string. */ 4195 4196static void 4197internal_function 4198match_ctx_clean (re_match_context_t *mctx) 4199{ 4200 int st_idx; 4201 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx) 4202 { 4203 int sl_idx; 4204 re_sub_match_top_t *top = mctx->sub_tops[st_idx]; 4205 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx) 4206 { 4207 re_sub_match_last_t *last = top->lasts[sl_idx]; 4208 re_free (last->path.array); 4209 re_free (last); 4210 } 4211 re_free (top->lasts); 4212 if (top->path) 4213 { 4214 re_free (top->path->array); 4215 re_free (top->path); 4216 } 4217 free (top); 4218 } 4219 4220 mctx->nsub_tops = 0; 4221 mctx->nbkref_ents = 0; 4222} 4223 4224/* Free all the memory associated with MCTX. */ 4225 4226static void 4227internal_function 4228match_ctx_free (re_match_context_t *mctx) 4229{ 4230 /* First, free all the memory associated with MCTX->SUB_TOPS. */ 4231 match_ctx_clean (mctx); 4232 re_free (mctx->sub_tops); 4233 re_free (mctx->bkref_ents); 4234} 4235 4236/* Add a new backreference entry to MCTX. 4237 Note that we assume that caller never call this function with duplicate 4238 entry, and call with STR_IDX which isn't smaller than any existing entry. 4239*/ 4240 4241static reg_errcode_t 4242internal_function __attribute_warn_unused_result__ 4243match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from, 4244 int to) 4245{ 4246 if (mctx->nbkref_ents >= mctx->abkref_ents) 4247 { 4248 struct re_backref_cache_entry* new_entry; 4249 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry, 4250 mctx->abkref_ents * 2); 4251 if (BE (new_entry == NULL, 0)) 4252 { 4253 re_free (mctx->bkref_ents); 4254 return REG_ESPACE; 4255 } 4256 mctx->bkref_ents = new_entry; 4257 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0', 4258 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents); 4259 mctx->abkref_ents *= 2; 4260 } 4261 if (mctx->nbkref_ents > 0 4262 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx) 4263 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1; 4264 4265 mctx->bkref_ents[mctx->nbkref_ents].node = node; 4266 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx; 4267 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from; 4268 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to; 4269 4270 /* This is a cache that saves negative results of check_dst_limits_calc_pos. 4271 If bit N is clear, means that this entry won't epsilon-transition to 4272 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If 4273 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one 4274 such node. 4275 4276 A backreference does not epsilon-transition unless it is empty, so set 4277 to all zeros if FROM != TO. */ 4278 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map 4279 = (from == to ? ~0 : 0); 4280 4281 mctx->bkref_ents[mctx->nbkref_ents++].more = 0; 4282 if (mctx->max_mb_elem_len < to - from) 4283 mctx->max_mb_elem_len = to - from; 4284 return REG_NOERROR; 4285} 4286 4287/* Search for the first entry which has the same str_idx, or -1 if none is 4288 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */ 4289 4290static int 4291internal_function 4292search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx) 4293{ 4294 int left, right, mid, last; 4295 last = right = mctx->nbkref_ents; 4296 for (left = 0; left < right;) 4297 { 4298 mid = (left + right) / 2; 4299 if (mctx->bkref_ents[mid].str_idx < str_idx) 4300 left = mid + 1; 4301 else 4302 right = mid; 4303 } 4304 if (left < last && mctx->bkref_ents[left].str_idx == str_idx) 4305 return left; 4306 else 4307 return -1; 4308} 4309 4310/* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches 4311 at STR_IDX. */ 4312 4313static reg_errcode_t 4314internal_function __attribute_warn_unused_result__ 4315match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx) 4316{ 4317#ifdef DEBUG 4318 assert (mctx->sub_tops != NULL); 4319 assert (mctx->asub_tops > 0); 4320#endif 4321 if (BE (mctx->nsub_tops == mctx->asub_tops, 0)) 4322 { 4323 int new_asub_tops = mctx->asub_tops * 2; 4324 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops, 4325 re_sub_match_top_t *, 4326 new_asub_tops); 4327 if (BE (new_array == NULL, 0)) 4328 return REG_ESPACE; 4329 mctx->sub_tops = new_array; 4330 mctx->asub_tops = new_asub_tops; 4331 } 4332 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t)); 4333 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0)) 4334 return REG_ESPACE; 4335 mctx->sub_tops[mctx->nsub_tops]->node = node; 4336 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx; 4337 return REG_NOERROR; 4338} 4339 4340/* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches 4341 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */ 4342 4343static re_sub_match_last_t * 4344internal_function 4345match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx) 4346{ 4347 re_sub_match_last_t *new_entry; 4348 if (BE (subtop->nlasts == subtop->alasts, 0)) 4349 { 4350 int new_alasts = 2 * subtop->alasts + 1; 4351 re_sub_match_last_t **new_array = re_realloc (subtop->lasts, 4352 re_sub_match_last_t *, 4353 new_alasts); 4354 if (BE (new_array == NULL, 0)) 4355 return NULL; 4356 subtop->lasts = new_array; 4357 subtop->alasts = new_alasts; 4358 } 4359 new_entry = calloc (1, sizeof (re_sub_match_last_t)); 4360 if (BE (new_entry != NULL, 1)) 4361 { 4362 subtop->lasts[subtop->nlasts] = new_entry; 4363 new_entry->node = node; 4364 new_entry->str_idx = str_idx; 4365 ++subtop->nlasts; 4366 } 4367 return new_entry; 4368} 4369 4370static void 4371internal_function 4372sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts, 4373 re_dfastate_t **limited_sts, int last_node, int last_str_idx) 4374{ 4375 sctx->sifted_states = sifted_sts; 4376 sctx->limited_states = limited_sts; 4377 sctx->last_node = last_node; 4378 sctx->last_str_idx = last_str_idx; 4379 re_node_set_init_empty (&sctx->limits); 4380} 4381