regexec.c revision 146040
1146040Stjr/* Extended regular expression matching and search library. 2146040Stjr Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc. 3146040Stjr This file is part of the GNU C Library. 4146040Stjr Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. 5146040Stjr 6146040Stjr The GNU C Library is free software; you can redistribute it and/or 7146040Stjr modify it under the terms of the GNU Lesser General Public 8146040Stjr License as published by the Free Software Foundation; either 9146040Stjr version 2.1 of the License, or (at your option) any later version. 10146040Stjr 11146040Stjr The GNU C Library is distributed in the hope that it will be useful, 12146040Stjr but WITHOUT ANY WARRANTY; without even the implied warranty of 13146040Stjr MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14146040Stjr Lesser General Public License for more details. 15146040Stjr 16146040Stjr You should have received a copy of the GNU Lesser General Public 17146040Stjr License along with the GNU C Library; if not, write to the Free 18146040Stjr Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 19146040Stjr 02111-1307 USA. */ 20146040Stjr 21146040Stjrstatic reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags, 22146040Stjr int n) internal_function; 23146040Stjrstatic void match_ctx_clean (re_match_context_t *mctx) internal_function; 24146040Stjrstatic void match_ctx_free (re_match_context_t *cache) internal_function; 25146040Stjrstatic reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node, 26146040Stjr int str_idx, int from, int to) 27146040Stjr internal_function; 28146040Stjrstatic int search_cur_bkref_entry (re_match_context_t *mctx, int str_idx) 29146040Stjr internal_function; 30146040Stjrstatic reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node, 31146040Stjr int str_idx) internal_function; 32146040Stjrstatic re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop, 33146040Stjr int node, int str_idx) 34146040Stjr internal_function; 35146040Stjrstatic void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts, 36146040Stjr re_dfastate_t **limited_sts, int last_node, 37146040Stjr int last_str_idx) 38146040Stjr internal_function; 39146040Stjrstatic reg_errcode_t re_search_internal (const regex_t *preg, 40146040Stjr const char *string, int length, 41146040Stjr int start, int range, int stop, 42146040Stjr size_t nmatch, regmatch_t pmatch[], 43146040Stjr int eflags) internal_function; 44146040Stjrstatic int re_search_2_stub (struct re_pattern_buffer *bufp, 45146040Stjr const char *string1, int length1, 46146040Stjr const char *string2, int length2, 47146040Stjr int start, int range, struct re_registers *regs, 48146040Stjr int stop, int ret_len) internal_function; 49146040Stjrstatic int re_search_stub (struct re_pattern_buffer *bufp, 50146040Stjr const char *string, int length, int start, 51146040Stjr int range, int stop, struct re_registers *regs, 52146040Stjr int ret_len) internal_function; 53146040Stjrstatic unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, 54146040Stjr int nregs, int regs_allocated) internal_function; 55146040Stjrstatic inline re_dfastate_t *acquire_init_state_context 56146040Stjr (reg_errcode_t *err, const re_match_context_t *mctx, int idx) 57146040Stjr __attribute ((always_inline)) internal_function; 58146040Stjrstatic reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx) 59146040Stjr internal_function; 60146040Stjrstatic int check_matching (re_match_context_t *mctx, int fl_longest_match, 61146040Stjr int *p_match_first) 62146040Stjr internal_function; 63146040Stjrstatic int check_halt_node_context (const re_dfa_t *dfa, int node, 64146040Stjr unsigned int context) internal_function; 65146040Stjrstatic int check_halt_state_context (const re_match_context_t *mctx, 66146040Stjr const re_dfastate_t *state, int idx) 67146040Stjr internal_function; 68146040Stjrstatic void update_regs (re_dfa_t *dfa, regmatch_t *pmatch, 69146040Stjr regmatch_t *prev_idx_match, int cur_node, 70146040Stjr int cur_idx, int nmatch) internal_function; 71146040Stjrstatic int proceed_next_node (const re_match_context_t *mctx, 72146040Stjr int nregs, regmatch_t *regs, 73146040Stjr int *pidx, int node, re_node_set *eps_via_nodes, 74146040Stjr struct re_fail_stack_t *fs) internal_function; 75146040Stjrstatic reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs, 76146040Stjr int str_idx, int dest_node, int nregs, 77146040Stjr regmatch_t *regs, 78146040Stjr re_node_set *eps_via_nodes) internal_function; 79146040Stjrstatic int pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs, 80146040Stjr regmatch_t *regs, re_node_set *eps_via_nodes) internal_function; 81146040Stjrstatic reg_errcode_t set_regs (const regex_t *preg, 82146040Stjr const re_match_context_t *mctx, 83146040Stjr size_t nmatch, regmatch_t *pmatch, 84146040Stjr int fl_backtrack) internal_function; 85146040Stjrstatic reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs) internal_function; 86146040Stjr 87146040Stjr#ifdef RE_ENABLE_I18N 88146040Stjrstatic int sift_states_iter_mb (const re_match_context_t *mctx, 89146040Stjr re_sift_context_t *sctx, 90146040Stjr int node_idx, int str_idx, int max_str_idx) internal_function; 91146040Stjr#endif /* RE_ENABLE_I18N */ 92146040Stjrstatic reg_errcode_t sift_states_backward (re_match_context_t *mctx, 93146040Stjr re_sift_context_t *sctx) internal_function; 94146040Stjrstatic reg_errcode_t build_sifted_states (re_match_context_t *mctx, 95146040Stjr re_sift_context_t *sctx, int str_idx, 96146040Stjr re_node_set *cur_dest) internal_function; 97146040Stjrstatic reg_errcode_t update_cur_sifted_state (re_match_context_t *mctx, 98146040Stjr re_sift_context_t *sctx, 99146040Stjr int str_idx, 100146040Stjr re_node_set *dest_nodes) internal_function; 101146040Stjrstatic reg_errcode_t add_epsilon_src_nodes (re_dfa_t *dfa, 102146040Stjr re_node_set *dest_nodes, 103146040Stjr const re_node_set *candidates) internal_function; 104146040Stjrstatic reg_errcode_t sub_epsilon_src_nodes (re_dfa_t *dfa, int node, 105146040Stjr re_node_set *dest_nodes, 106146040Stjr const re_node_set *and_nodes) internal_function; 107146040Stjrstatic int check_dst_limits (re_match_context_t *mctx, re_node_set *limits, 108146040Stjr int dst_node, int dst_idx, int src_node, 109146040Stjr int src_idx) internal_function; 110146040Stjrstatic int check_dst_limits_calc_pos_1 (re_match_context_t *mctx, 111146040Stjr int boundaries, int subexp_idx, 112146040Stjr int from_node, int bkref_idx) internal_function; 113146040Stjrstatic int check_dst_limits_calc_pos (re_match_context_t *mctx, 114146040Stjr int limit, int subexp_idx, 115146040Stjr int node, int str_idx, 116146040Stjr int bkref_idx) internal_function; 117146040Stjrstatic reg_errcode_t check_subexp_limits (re_dfa_t *dfa, 118146040Stjr re_node_set *dest_nodes, 119146040Stjr const re_node_set *candidates, 120146040Stjr re_node_set *limits, 121146040Stjr struct re_backref_cache_entry *bkref_ents, 122146040Stjr int str_idx) internal_function; 123146040Stjrstatic reg_errcode_t sift_states_bkref (re_match_context_t *mctx, 124146040Stjr re_sift_context_t *sctx, 125146040Stjr int str_idx, const re_node_set *candidates) internal_function; 126146040Stjrstatic reg_errcode_t clean_state_log_if_needed (re_match_context_t *mctx, 127146040Stjr int next_state_log_idx) internal_function; 128146040Stjrstatic reg_errcode_t merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst, 129146040Stjr re_dfastate_t **src, int num) internal_function; 130146040Stjrstatic re_dfastate_t *find_recover_state (reg_errcode_t *err, 131146040Stjr re_match_context_t *mctx) internal_function; 132146040Stjrstatic re_dfastate_t *transit_state (reg_errcode_t *err, 133146040Stjr re_match_context_t *mctx, 134146040Stjr re_dfastate_t *state) internal_function; 135146040Stjrstatic re_dfastate_t *merge_state_with_log (reg_errcode_t *err, 136146040Stjr re_match_context_t *mctx, 137146040Stjr re_dfastate_t *next_state) internal_function; 138146040Stjrstatic reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx, 139146040Stjr re_node_set *cur_nodes, 140146040Stjr int str_idx) internal_function; 141146040Stjr#if 0 142146040Stjrstatic re_dfastate_t *transit_state_sb (reg_errcode_t *err, 143146040Stjr re_match_context_t *mctx, 144146040Stjr re_dfastate_t *pstate) internal_function; 145146040Stjr#endif 146146040Stjr#ifdef RE_ENABLE_I18N 147146040Stjrstatic reg_errcode_t transit_state_mb (re_match_context_t *mctx, 148146040Stjr re_dfastate_t *pstate) internal_function; 149146040Stjr#endif /* RE_ENABLE_I18N */ 150146040Stjrstatic reg_errcode_t transit_state_bkref (re_match_context_t *mctx, 151146040Stjr const re_node_set *nodes) internal_function; 152146040Stjrstatic reg_errcode_t get_subexp (re_match_context_t *mctx, 153146040Stjr int bkref_node, int bkref_str_idx) internal_function; 154146040Stjrstatic reg_errcode_t get_subexp_sub (re_match_context_t *mctx, 155146040Stjr const re_sub_match_top_t *sub_top, 156146040Stjr re_sub_match_last_t *sub_last, 157146040Stjr int bkref_node, int bkref_str) internal_function; 158146040Stjrstatic int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes, 159146040Stjr int subexp_idx, int type) internal_function; 160146040Stjrstatic reg_errcode_t check_arrival (re_match_context_t *mctx, 161146040Stjr state_array_t *path, int top_node, 162146040Stjr int top_str, int last_node, int last_str, 163146040Stjr int type) internal_function; 164146040Stjrstatic reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx, 165146040Stjr int str_idx, 166146040Stjr re_node_set *cur_nodes, 167146040Stjr re_node_set *next_nodes) internal_function; 168146040Stjrstatic reg_errcode_t check_arrival_expand_ecl (re_dfa_t *dfa, 169146040Stjr re_node_set *cur_nodes, 170146040Stjr int ex_subexp, int type) internal_function; 171146040Stjrstatic reg_errcode_t check_arrival_expand_ecl_sub (re_dfa_t *dfa, 172146040Stjr re_node_set *dst_nodes, 173146040Stjr int target, int ex_subexp, 174146040Stjr int type) internal_function; 175146040Stjrstatic reg_errcode_t expand_bkref_cache (re_match_context_t *mctx, 176146040Stjr re_node_set *cur_nodes, int cur_str, 177146040Stjr int subexp_num, int type) internal_function; 178146040Stjrstatic int build_trtable (re_dfa_t *dfa, 179146040Stjr re_dfastate_t *state) internal_function; 180146040Stjr#ifdef RE_ENABLE_I18N 181146040Stjrstatic int check_node_accept_bytes (re_dfa_t *dfa, int node_idx, 182146040Stjr const re_string_t *input, int idx) internal_function; 183146040Stjr# ifdef _LIBC 184146040Stjrstatic unsigned int find_collation_sequence_value (const unsigned char *mbs, 185146040Stjr size_t name_len) internal_function; 186146040Stjr# endif /* _LIBC */ 187146040Stjr#endif /* RE_ENABLE_I18N */ 188146040Stjrstatic int group_nodes_into_DFAstates (re_dfa_t *dfa, 189146040Stjr const re_dfastate_t *state, 190146040Stjr re_node_set *states_node, 191146040Stjr bitset *states_ch) internal_function; 192146040Stjrstatic int check_node_accept (const re_match_context_t *mctx, 193146040Stjr const re_token_t *node, int idx) internal_function; 194146040Stjrstatic reg_errcode_t extend_buffers (re_match_context_t *mctx) internal_function; 195146040Stjr 196146040Stjr/* Entry point for POSIX code. */ 197146040Stjr 198146040Stjr/* regexec searches for a given pattern, specified by PREG, in the 199146040Stjr string STRING. 200146040Stjr 201146040Stjr If NMATCH is zero or REG_NOSUB was set in the cflags argument to 202146040Stjr `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at 203146040Stjr least NMATCH elements, and we set them to the offsets of the 204146040Stjr corresponding matched substrings. 205146040Stjr 206146040Stjr EFLAGS specifies `execution flags' which affect matching: if 207146040Stjr REG_NOTBOL is set, then ^ does not match at the beginning of the 208146040Stjr string; if REG_NOTEOL is set, then $ does not match at the end. 209146040Stjr 210146040Stjr We return 0 if we find a match and REG_NOMATCH if not. */ 211146040Stjr 212146040Stjrint 213146040Stjrregexec (preg, string, nmatch, pmatch, eflags) 214146040Stjr const regex_t *__restrict preg; 215146040Stjr const char *__restrict string; 216146040Stjr size_t nmatch; 217146040Stjr regmatch_t pmatch[]; 218146040Stjr int eflags; 219146040Stjr{ 220146040Stjr reg_errcode_t err; 221146040Stjr int start, length; 222146040Stjr 223146040Stjr if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND)) 224146040Stjr return REG_BADPAT; 225146040Stjr 226146040Stjr if (eflags & REG_STARTEND) 227146040Stjr { 228146040Stjr start = pmatch[0].rm_so; 229146040Stjr length = pmatch[0].rm_eo; 230146040Stjr } 231146040Stjr else 232146040Stjr { 233146040Stjr start = 0; 234146040Stjr length = strlen (string); 235146040Stjr } 236146040Stjr if (preg->no_sub) 237146040Stjr err = re_search_internal (preg, string, length, start, length - start, 238146040Stjr length, 0, NULL, eflags); 239146040Stjr else 240146040Stjr err = re_search_internal (preg, string, length, start, length - start, 241146040Stjr length, nmatch, pmatch, eflags); 242146040Stjr return err != REG_NOERROR; 243146040Stjr} 244146040Stjr 245146040Stjr#ifdef _LIBC 246146040Stjr# include <shlib-compat.h> 247146040Stjrversioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4); 248146040Stjr 249146040Stjr# if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4) 250146040Stjr__typeof__ (__regexec) __compat_regexec; 251146040Stjr 252146040Stjrint 253146040Stjrattribute_compat_text_section 254146040Stjr__compat_regexec (const regex_t *__restrict preg, 255146040Stjr const char *__restrict string, size_t nmatch, 256146040Stjr regmatch_t pmatch[], int eflags) 257146040Stjr{ 258146040Stjr return regexec (preg, string, nmatch, pmatch, 259146040Stjr eflags & (REG_NOTBOL | REG_NOTEOL)); 260146040Stjr} 261146040Stjrcompat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0); 262146040Stjr# endif 263146040Stjr#endif 264146040Stjr 265146040Stjr/* Entry points for GNU code. */ 266146040Stjr 267146040Stjr/* re_match, re_search, re_match_2, re_search_2 268146040Stjr 269146040Stjr The former two functions operate on STRING with length LENGTH, 270146040Stjr while the later two operate on concatenation of STRING1 and STRING2 271146040Stjr with lengths LENGTH1 and LENGTH2, respectively. 272146040Stjr 273146040Stjr re_match() matches the compiled pattern in BUFP against the string, 274146040Stjr starting at index START. 275146040Stjr 276146040Stjr re_search() first tries matching at index START, then it tries to match 277146040Stjr starting from index START + 1, and so on. The last start position tried 278146040Stjr is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same 279146040Stjr way as re_match().) 280146040Stjr 281146040Stjr The parameter STOP of re_{match,search}_2 specifies that no match exceeding 282146040Stjr the first STOP characters of the concatenation of the strings should be 283146040Stjr concerned. 284146040Stjr 285146040Stjr If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match 286146040Stjr and all groups is stroed in REGS. (For the "_2" variants, the offsets are 287146040Stjr computed relative to the concatenation, not relative to the individual 288146040Stjr strings.) 289146040Stjr 290146040Stjr On success, re_match* functions return the length of the match, re_search* 291146040Stjr return the position of the start of the match. Return value -1 means no 292146040Stjr match was found and -2 indicates an internal error. */ 293146040Stjr 294146040Stjrint 295146040Stjrre_match (bufp, string, length, start, regs) 296146040Stjr struct re_pattern_buffer *bufp; 297146040Stjr const char *string; 298146040Stjr int length, start; 299146040Stjr struct re_registers *regs; 300146040Stjr{ 301146040Stjr return re_search_stub (bufp, string, length, start, 0, length, regs, 1); 302146040Stjr} 303146040Stjr#ifdef _LIBC 304146040Stjrweak_alias (__re_match, re_match) 305146040Stjr#endif 306146040Stjr 307146040Stjrint 308146040Stjrre_search (bufp, string, length, start, range, regs) 309146040Stjr struct re_pattern_buffer *bufp; 310146040Stjr const char *string; 311146040Stjr int length, start, range; 312146040Stjr struct re_registers *regs; 313146040Stjr{ 314146040Stjr return re_search_stub (bufp, string, length, start, range, length, regs, 0); 315146040Stjr} 316146040Stjr#ifdef _LIBC 317146040Stjrweak_alias (__re_search, re_search) 318146040Stjr#endif 319146040Stjr 320146040Stjrint 321146040Stjrre_match_2 (bufp, string1, length1, string2, length2, start, regs, stop) 322146040Stjr struct re_pattern_buffer *bufp; 323146040Stjr const char *string1, *string2; 324146040Stjr int length1, length2, start, stop; 325146040Stjr struct re_registers *regs; 326146040Stjr{ 327146040Stjr return re_search_2_stub (bufp, string1, length1, string2, length2, 328146040Stjr start, 0, regs, stop, 1); 329146040Stjr} 330146040Stjr#ifdef _LIBC 331146040Stjrweak_alias (__re_match_2, re_match_2) 332146040Stjr#endif 333146040Stjr 334146040Stjrint 335146040Stjrre_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop) 336146040Stjr struct re_pattern_buffer *bufp; 337146040Stjr const char *string1, *string2; 338146040Stjr int length1, length2, start, range, stop; 339146040Stjr struct re_registers *regs; 340146040Stjr{ 341146040Stjr return re_search_2_stub (bufp, string1, length1, string2, length2, 342146040Stjr start, range, regs, stop, 0); 343146040Stjr} 344146040Stjr#ifdef _LIBC 345146040Stjrweak_alias (__re_search_2, re_search_2) 346146040Stjr#endif 347146040Stjr 348146040Stjrstatic int 349146040Stjrre_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs, 350146040Stjr stop, ret_len) 351146040Stjr struct re_pattern_buffer *bufp; 352146040Stjr const char *string1, *string2; 353146040Stjr int length1, length2, start, range, stop, ret_len; 354146040Stjr struct re_registers *regs; 355146040Stjr{ 356146040Stjr const char *str; 357146040Stjr int rval; 358146040Stjr int len = length1 + length2; 359146040Stjr int free_str = 0; 360146040Stjr 361146040Stjr if (BE (length1 < 0 || length2 < 0 || stop < 0, 0)) 362146040Stjr return -2; 363146040Stjr 364146040Stjr /* Concatenate the strings. */ 365146040Stjr if (length2 > 0) 366146040Stjr if (length1 > 0) 367146040Stjr { 368146040Stjr char *s = re_malloc (char, len); 369146040Stjr 370146040Stjr if (BE (s == NULL, 0)) 371146040Stjr return -2; 372146040Stjr memcpy (s, string1, length1); 373146040Stjr memcpy (s + length1, string2, length2); 374146040Stjr str = s; 375146040Stjr free_str = 1; 376146040Stjr } 377146040Stjr else 378146040Stjr str = string2; 379146040Stjr else 380146040Stjr str = string1; 381146040Stjr 382146040Stjr rval = re_search_stub (bufp, str, len, start, range, stop, regs, 383146040Stjr ret_len); 384146040Stjr if (free_str) 385146040Stjr re_free ((char *) str); 386146040Stjr return rval; 387146040Stjr} 388146040Stjr 389146040Stjr/* The parameters have the same meaning as those of re_search. 390146040Stjr Additional parameters: 391146040Stjr If RET_LEN is nonzero the length of the match is returned (re_match style); 392146040Stjr otherwise the position of the match is returned. */ 393146040Stjr 394146040Stjrstatic int 395146040Stjrre_search_stub (bufp, string, length, start, range, stop, regs, ret_len) 396146040Stjr struct re_pattern_buffer *bufp; 397146040Stjr const char *string; 398146040Stjr int length, start, range, stop, ret_len; 399146040Stjr struct re_registers *regs; 400146040Stjr{ 401146040Stjr reg_errcode_t result; 402146040Stjr regmatch_t *pmatch; 403146040Stjr int nregs, rval; 404146040Stjr int eflags = 0; 405146040Stjr 406146040Stjr /* Check for out-of-range. */ 407146040Stjr if (BE (start < 0 || start > length, 0)) 408146040Stjr return -1; 409146040Stjr if (BE (start + range > length, 0)) 410146040Stjr range = length - start; 411146040Stjr else if (BE (start + range < 0, 0)) 412146040Stjr range = -start; 413146040Stjr 414146040Stjr eflags |= (bufp->not_bol) ? REG_NOTBOL : 0; 415146040Stjr eflags |= (bufp->not_eol) ? REG_NOTEOL : 0; 416146040Stjr 417146040Stjr /* Compile fastmap if we haven't yet. */ 418146040Stjr if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate) 419146040Stjr re_compile_fastmap (bufp); 420146040Stjr 421146040Stjr if (BE (bufp->no_sub, 0)) 422146040Stjr regs = NULL; 423146040Stjr 424146040Stjr /* We need at least 1 register. */ 425146040Stjr if (regs == NULL) 426146040Stjr nregs = 1; 427146040Stjr else if (BE (bufp->regs_allocated == REGS_FIXED && 428146040Stjr regs->num_regs < bufp->re_nsub + 1, 0)) 429146040Stjr { 430146040Stjr nregs = regs->num_regs; 431146040Stjr if (BE (nregs < 1, 0)) 432146040Stjr { 433146040Stjr /* Nothing can be copied to regs. */ 434146040Stjr regs = NULL; 435146040Stjr nregs = 1; 436146040Stjr } 437146040Stjr } 438146040Stjr else 439146040Stjr nregs = bufp->re_nsub + 1; 440146040Stjr pmatch = re_malloc (regmatch_t, nregs); 441146040Stjr if (BE (pmatch == NULL, 0)) 442146040Stjr return -2; 443146040Stjr 444146040Stjr result = re_search_internal (bufp, string, length, start, range, stop, 445146040Stjr nregs, pmatch, eflags); 446146040Stjr 447146040Stjr rval = 0; 448146040Stjr 449146040Stjr /* I hope we needn't fill ther regs with -1's when no match was found. */ 450146040Stjr if (result != REG_NOERROR) 451146040Stjr rval = -1; 452146040Stjr else if (regs != NULL) 453146040Stjr { 454146040Stjr /* If caller wants register contents data back, copy them. */ 455146040Stjr bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs, 456146040Stjr bufp->regs_allocated); 457146040Stjr if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0)) 458146040Stjr rval = -2; 459146040Stjr } 460146040Stjr 461146040Stjr if (BE (rval == 0, 1)) 462146040Stjr { 463146040Stjr if (ret_len) 464146040Stjr { 465146040Stjr assert (pmatch[0].rm_so == start); 466146040Stjr rval = pmatch[0].rm_eo - start; 467146040Stjr } 468146040Stjr else 469146040Stjr rval = pmatch[0].rm_so; 470146040Stjr } 471146040Stjr re_free (pmatch); 472146040Stjr return rval; 473146040Stjr} 474146040Stjr 475146040Stjrstatic unsigned 476146040Stjrre_copy_regs (regs, pmatch, nregs, regs_allocated) 477146040Stjr struct re_registers *regs; 478146040Stjr regmatch_t *pmatch; 479146040Stjr int nregs, regs_allocated; 480146040Stjr{ 481146040Stjr int rval = REGS_REALLOCATE; 482146040Stjr int i; 483146040Stjr int need_regs = nregs + 1; 484146040Stjr /* We need one extra element beyond `num_regs' for the `-1' marker GNU code 485146040Stjr uses. */ 486146040Stjr 487146040Stjr /* Have the register data arrays been allocated? */ 488146040Stjr if (regs_allocated == REGS_UNALLOCATED) 489146040Stjr { /* No. So allocate them with malloc. */ 490146040Stjr regs->start = re_malloc (regoff_t, need_regs); 491146040Stjr regs->end = re_malloc (regoff_t, need_regs); 492146040Stjr if (BE (regs->start == NULL, 0) || BE (regs->end == NULL, 0)) 493146040Stjr return REGS_UNALLOCATED; 494146040Stjr regs->num_regs = need_regs; 495146040Stjr } 496146040Stjr else if (regs_allocated == REGS_REALLOCATE) 497146040Stjr { /* Yes. If we need more elements than were already 498146040Stjr allocated, reallocate them. If we need fewer, just 499146040Stjr leave it alone. */ 500146040Stjr if (BE (need_regs > regs->num_regs, 0)) 501146040Stjr { 502146040Stjr regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs); 503146040Stjr regoff_t *new_end = re_realloc (regs->end, regoff_t, need_regs); 504146040Stjr if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0)) 505146040Stjr return REGS_UNALLOCATED; 506146040Stjr regs->start = new_start; 507146040Stjr regs->end = new_end; 508146040Stjr regs->num_regs = need_regs; 509146040Stjr } 510146040Stjr } 511146040Stjr else 512146040Stjr { 513146040Stjr assert (regs_allocated == REGS_FIXED); 514146040Stjr /* This function may not be called with REGS_FIXED and nregs too big. */ 515146040Stjr assert (regs->num_regs >= nregs); 516146040Stjr rval = REGS_FIXED; 517146040Stjr } 518146040Stjr 519146040Stjr /* Copy the regs. */ 520146040Stjr for (i = 0; i < nregs; ++i) 521146040Stjr { 522146040Stjr regs->start[i] = pmatch[i].rm_so; 523146040Stjr regs->end[i] = pmatch[i].rm_eo; 524146040Stjr } 525146040Stjr for ( ; i < regs->num_regs; ++i) 526146040Stjr regs->start[i] = regs->end[i] = -1; 527146040Stjr 528146040Stjr return rval; 529146040Stjr} 530146040Stjr 531146040Stjr/* Set REGS to hold NUM_REGS registers, storing them in STARTS and 532146040Stjr ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use 533146040Stjr this memory for recording register information. STARTS and ENDS 534146040Stjr must be allocated using the malloc library routine, and must each 535146040Stjr be at least NUM_REGS * sizeof (regoff_t) bytes long. 536146040Stjr 537146040Stjr If NUM_REGS == 0, then subsequent matches should allocate their own 538146040Stjr register data. 539146040Stjr 540146040Stjr Unless this function is called, the first search or match using 541146040Stjr PATTERN_BUFFER will allocate its own register data, without 542146040Stjr freeing the old data. */ 543146040Stjr 544146040Stjrvoid 545146040Stjrre_set_registers (bufp, regs, num_regs, starts, ends) 546146040Stjr struct re_pattern_buffer *bufp; 547146040Stjr struct re_registers *regs; 548146040Stjr unsigned num_regs; 549146040Stjr regoff_t *starts, *ends; 550146040Stjr{ 551146040Stjr if (num_regs) 552146040Stjr { 553146040Stjr bufp->regs_allocated = REGS_REALLOCATE; 554146040Stjr regs->num_regs = num_regs; 555146040Stjr regs->start = starts; 556146040Stjr regs->end = ends; 557146040Stjr } 558146040Stjr else 559146040Stjr { 560146040Stjr bufp->regs_allocated = REGS_UNALLOCATED; 561146040Stjr regs->num_regs = 0; 562146040Stjr regs->start = regs->end = (regoff_t *) 0; 563146040Stjr } 564146040Stjr} 565146040Stjr#ifdef _LIBC 566146040Stjrweak_alias (__re_set_registers, re_set_registers) 567146040Stjr#endif 568146040Stjr 569146040Stjr/* Entry points compatible with 4.2 BSD regex library. We don't define 570146040Stjr them unless specifically requested. */ 571146040Stjr 572146040Stjr#if defined _REGEX_RE_COMP || defined _LIBC 573146040Stjrint 574146040Stjr# ifdef _LIBC 575146040Stjrweak_function 576146040Stjr# endif 577146040Stjrre_exec (s) 578146040Stjr const char *s; 579146040Stjr{ 580146040Stjr return 0 == regexec (&re_comp_buf, s, 0, NULL, 0); 581146040Stjr} 582146040Stjr#endif /* _REGEX_RE_COMP */ 583146040Stjr 584146040Stjr/* Internal entry point. */ 585146040Stjr 586146040Stjr/* Searches for a compiled pattern PREG in the string STRING, whose 587146040Stjr length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same 588146040Stjr mingings with regexec. START, and RANGE have the same meanings 589146040Stjr with re_search. 590146040Stjr Return REG_NOERROR if we find a match, and REG_NOMATCH if not, 591146040Stjr otherwise return the error code. 592146040Stjr Note: We assume front end functions already check ranges. 593146040Stjr (START + RANGE >= 0 && START + RANGE <= LENGTH) */ 594146040Stjr 595146040Stjrstatic reg_errcode_t 596146040Stjrre_search_internal (preg, string, length, start, range, stop, nmatch, pmatch, 597146040Stjr eflags) 598146040Stjr const regex_t *preg; 599146040Stjr const char *string; 600146040Stjr int length, start, range, stop, eflags; 601146040Stjr size_t nmatch; 602146040Stjr regmatch_t pmatch[]; 603146040Stjr{ 604146040Stjr reg_errcode_t err; 605146040Stjr re_dfa_t *dfa = (re_dfa_t *)preg->buffer; 606146040Stjr int left_lim, right_lim, incr; 607146040Stjr int fl_longest_match, match_first, match_kind, match_last = -1; 608146040Stjr int extra_nmatch; 609146040Stjr int sb, ch; 610146040Stjr#if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L) 611146040Stjr re_match_context_t mctx = { .dfa = dfa }; 612146040Stjr#else 613146040Stjr re_match_context_t mctx; 614146040Stjr#endif 615146040Stjr char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate 616146040Stjr && range && !preg->can_be_null) ? preg->fastmap : NULL; 617146040Stjr unsigned RE_TRANSLATE_TYPE t = (unsigned RE_TRANSLATE_TYPE) preg->translate; 618146040Stjr 619146040Stjr#if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)) 620146040Stjr memset (&mctx, '\0', sizeof (re_match_context_t)); 621146040Stjr mctx.dfa = dfa; 622146040Stjr#endif 623146040Stjr 624146040Stjr extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0; 625146040Stjr nmatch -= extra_nmatch; 626146040Stjr 627146040Stjr /* Check if the DFA haven't been compiled. */ 628146040Stjr if (BE (preg->used == 0 || dfa->init_state == NULL 629146040Stjr || dfa->init_state_word == NULL || dfa->init_state_nl == NULL 630146040Stjr || dfa->init_state_begbuf == NULL, 0)) 631146040Stjr return REG_NOMATCH; 632146040Stjr 633146040Stjr#ifdef DEBUG 634146040Stjr /* We assume front-end functions already check them. */ 635146040Stjr assert (start + range >= 0 && start + range <= length); 636146040Stjr#endif 637146040Stjr 638146040Stjr /* If initial states with non-begbuf contexts have no elements, 639146040Stjr the regex must be anchored. If preg->newline_anchor is set, 640146040Stjr we'll never use init_state_nl, so do not check it. */ 641146040Stjr if (dfa->init_state->nodes.nelem == 0 642146040Stjr && dfa->init_state_word->nodes.nelem == 0 643146040Stjr && (dfa->init_state_nl->nodes.nelem == 0 644146040Stjr || !preg->newline_anchor)) 645146040Stjr { 646146040Stjr if (start != 0 && start + range != 0) 647146040Stjr return REG_NOMATCH; 648146040Stjr start = range = 0; 649146040Stjr } 650146040Stjr 651146040Stjr /* We must check the longest matching, if nmatch > 0. */ 652146040Stjr fl_longest_match = (nmatch != 0 || dfa->nbackref); 653146040Stjr 654146040Stjr err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1, 655146040Stjr preg->translate, preg->syntax & RE_ICASE, dfa); 656146040Stjr if (BE (err != REG_NOERROR, 0)) 657146040Stjr goto free_return; 658146040Stjr mctx.input.stop = stop; 659146040Stjr mctx.input.raw_stop = stop; 660146040Stjr mctx.input.newline_anchor = preg->newline_anchor; 661146040Stjr 662146040Stjr err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2); 663146040Stjr if (BE (err != REG_NOERROR, 0)) 664146040Stjr goto free_return; 665146040Stjr 666146040Stjr /* We will log all the DFA states through which the dfa pass, 667146040Stjr if nmatch > 1, or this dfa has "multibyte node", which is a 668146040Stjr back-reference or a node which can accept multibyte character or 669146040Stjr multi character collating element. */ 670146040Stjr if (nmatch > 1 || dfa->has_mb_node) 671146040Stjr { 672146040Stjr mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1); 673146040Stjr if (BE (mctx.state_log == NULL, 0)) 674146040Stjr { 675146040Stjr err = REG_ESPACE; 676146040Stjr goto free_return; 677146040Stjr } 678146040Stjr } 679146040Stjr else 680146040Stjr mctx.state_log = NULL; 681146040Stjr 682146040Stjr match_first = start; 683146040Stjr mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF 684146040Stjr : CONTEXT_NEWLINE | CONTEXT_BEGBUF; 685146040Stjr 686146040Stjr /* Check incrementally whether of not the input string match. */ 687146040Stjr incr = (range < 0) ? -1 : 1; 688146040Stjr left_lim = (range < 0) ? start + range : start; 689146040Stjr right_lim = (range < 0) ? start : start + range; 690146040Stjr sb = dfa->mb_cur_max == 1; 691146040Stjr match_kind = 692146040Stjr (fastmap 693146040Stjr ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0) 694146040Stjr | (range >= 0 ? 2 : 0) 695146040Stjr | (t != NULL ? 1 : 0)) 696146040Stjr : 8); 697146040Stjr 698146040Stjr for (;; match_first += incr) 699146040Stjr { 700146040Stjr err = REG_NOMATCH; 701146040Stjr if (match_first < left_lim || right_lim < match_first) 702146040Stjr goto free_return; 703146040Stjr 704146040Stjr /* Advance as rapidly as possible through the string, until we 705146040Stjr find a plausible place to start matching. This may be done 706146040Stjr with varying efficiency, so there are various possibilities: 707146040Stjr only the most common of them are specialized, in order to 708146040Stjr save on code size. We use a switch statement for speed. */ 709146040Stjr switch (match_kind) 710146040Stjr { 711146040Stjr case 8: 712146040Stjr /* No fastmap. */ 713146040Stjr break; 714146040Stjr 715146040Stjr case 7: 716146040Stjr /* Fastmap with single-byte translation, match forward. */ 717146040Stjr while (BE (match_first < right_lim, 1) 718146040Stjr && !fastmap[t[(unsigned char) string[match_first]]]) 719146040Stjr ++match_first; 720146040Stjr goto forward_match_found_start_or_reached_end; 721146040Stjr 722146040Stjr case 6: 723146040Stjr /* Fastmap without translation, match forward. */ 724146040Stjr while (BE (match_first < right_lim, 1) 725146040Stjr && !fastmap[(unsigned char) string[match_first]]) 726146040Stjr ++match_first; 727146040Stjr 728146040Stjr forward_match_found_start_or_reached_end: 729146040Stjr if (BE (match_first == right_lim, 0)) 730146040Stjr { 731146040Stjr ch = match_first >= length 732146040Stjr ? 0 : (unsigned char) string[match_first]; 733146040Stjr if (!fastmap[t ? t[ch] : ch]) 734146040Stjr goto free_return; 735146040Stjr } 736146040Stjr break; 737146040Stjr 738146040Stjr case 4: 739146040Stjr case 5: 740146040Stjr /* Fastmap without multi-byte translation, match backwards. */ 741146040Stjr while (match_first >= left_lim) 742146040Stjr { 743146040Stjr ch = match_first >= length 744146040Stjr ? 0 : (unsigned char) string[match_first]; 745146040Stjr if (fastmap[t ? t[ch] : ch]) 746146040Stjr break; 747146040Stjr --match_first; 748146040Stjr } 749146040Stjr if (match_first < left_lim) 750146040Stjr goto free_return; 751146040Stjr break; 752146040Stjr 753146040Stjr default: 754146040Stjr /* In this case, we can't determine easily the current byte, 755146040Stjr since it might be a component byte of a multibyte 756146040Stjr character. Then we use the constructed buffer instead. */ 757146040Stjr for (;;) 758146040Stjr { 759146040Stjr /* If MATCH_FIRST is out of the valid range, reconstruct the 760146040Stjr buffers. */ 761146040Stjr unsigned int offset = match_first - mctx.input.raw_mbs_idx; 762146040Stjr if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0)) 763146040Stjr { 764146040Stjr err = re_string_reconstruct (&mctx.input, match_first, 765146040Stjr eflags); 766146040Stjr if (BE (err != REG_NOERROR, 0)) 767146040Stjr goto free_return; 768146040Stjr 769146040Stjr offset = match_first - mctx.input.raw_mbs_idx; 770146040Stjr } 771146040Stjr /* If MATCH_FIRST is out of the buffer, leave it as '\0'. 772146040Stjr Note that MATCH_FIRST must not be smaller than 0. */ 773146040Stjr ch = (match_first >= length 774146040Stjr ? 0 : re_string_byte_at (&mctx.input, offset)); 775146040Stjr if (fastmap[ch]) 776146040Stjr break; 777146040Stjr match_first += incr; 778146040Stjr if (match_first < left_lim || match_first > right_lim) 779146040Stjr { 780146040Stjr err = REG_NOMATCH; 781146040Stjr goto free_return; 782146040Stjr } 783146040Stjr } 784146040Stjr break; 785146040Stjr } 786146040Stjr 787146040Stjr /* Reconstruct the buffers so that the matcher can assume that 788146040Stjr the matching starts from the beginning of the buffer. */ 789146040Stjr err = re_string_reconstruct (&mctx.input, match_first, eflags); 790146040Stjr if (BE (err != REG_NOERROR, 0)) 791146040Stjr goto free_return; 792146040Stjr 793146040Stjr#ifdef RE_ENABLE_I18N 794146040Stjr /* Don't consider this char as a possible match start if it part, 795146040Stjr yet isn't the head, of a multibyte character. */ 796146040Stjr if (!sb && !re_string_first_byte (&mctx.input, 0)) 797146040Stjr continue; 798146040Stjr#endif 799146040Stjr 800146040Stjr /* It seems to be appropriate one, then use the matcher. */ 801146040Stjr /* We assume that the matching starts from 0. */ 802146040Stjr mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0; 803146040Stjr match_last = check_matching (&mctx, fl_longest_match, 804146040Stjr range >= 0 ? &match_first : NULL); 805146040Stjr if (match_last != -1) 806146040Stjr { 807146040Stjr if (BE (match_last == -2, 0)) 808146040Stjr { 809146040Stjr err = REG_ESPACE; 810146040Stjr goto free_return; 811146040Stjr } 812146040Stjr else 813146040Stjr { 814146040Stjr mctx.match_last = match_last; 815146040Stjr if ((!preg->no_sub && nmatch > 1) || dfa->nbackref) 816146040Stjr { 817146040Stjr re_dfastate_t *pstate = mctx.state_log[match_last]; 818146040Stjr mctx.last_node = check_halt_state_context (&mctx, pstate, 819146040Stjr match_last); 820146040Stjr } 821146040Stjr if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match) 822146040Stjr || dfa->nbackref) 823146040Stjr { 824146040Stjr err = prune_impossible_nodes (&mctx); 825146040Stjr if (err == REG_NOERROR) 826146040Stjr break; 827146040Stjr if (BE (err != REG_NOMATCH, 0)) 828146040Stjr goto free_return; 829146040Stjr match_last = -1; 830146040Stjr } 831146040Stjr else 832146040Stjr break; /* We found a match. */ 833146040Stjr } 834146040Stjr } 835146040Stjr 836146040Stjr match_ctx_clean (&mctx); 837146040Stjr } 838146040Stjr 839146040Stjr#ifdef DEBUG 840146040Stjr assert (match_last != -1); 841146040Stjr assert (err == REG_NOERROR); 842146040Stjr#endif 843146040Stjr 844146040Stjr /* Set pmatch[] if we need. */ 845146040Stjr if (nmatch > 0) 846146040Stjr { 847146040Stjr int reg_idx; 848146040Stjr 849146040Stjr /* Initialize registers. */ 850146040Stjr for (reg_idx = 1; reg_idx < nmatch; ++reg_idx) 851146040Stjr pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1; 852146040Stjr 853146040Stjr /* Set the points where matching start/end. */ 854146040Stjr pmatch[0].rm_so = 0; 855146040Stjr pmatch[0].rm_eo = mctx.match_last; 856146040Stjr 857146040Stjr if (!preg->no_sub && nmatch > 1) 858146040Stjr { 859146040Stjr err = set_regs (preg, &mctx, nmatch, pmatch, 860146040Stjr dfa->has_plural_match && dfa->nbackref > 0); 861146040Stjr if (BE (err != REG_NOERROR, 0)) 862146040Stjr goto free_return; 863146040Stjr } 864146040Stjr 865146040Stjr /* At last, add the offset to the each registers, since we slided 866146040Stjr the buffers so that we could assume that the matching starts 867146040Stjr from 0. */ 868146040Stjr for (reg_idx = 0; reg_idx < nmatch; ++reg_idx) 869146040Stjr if (pmatch[reg_idx].rm_so != -1) 870146040Stjr { 871146040Stjr#ifdef RE_ENABLE_I18N 872146040Stjr if (BE (mctx.input.offsets_needed != 0, 0)) 873146040Stjr { 874146040Stjr if (pmatch[reg_idx].rm_so == mctx.input.valid_len) 875146040Stjr pmatch[reg_idx].rm_so += mctx.input.valid_raw_len - mctx.input.valid_len; 876146040Stjr else 877146040Stjr pmatch[reg_idx].rm_so = mctx.input.offsets[pmatch[reg_idx].rm_so]; 878146040Stjr if (pmatch[reg_idx].rm_eo == mctx.input.valid_len) 879146040Stjr pmatch[reg_idx].rm_eo += mctx.input.valid_raw_len - mctx.input.valid_len; 880146040Stjr else 881146040Stjr pmatch[reg_idx].rm_eo = mctx.input.offsets[pmatch[reg_idx].rm_eo]; 882146040Stjr } 883146040Stjr#else 884146040Stjr assert (mctx.input.offsets_needed == 0); 885146040Stjr#endif 886146040Stjr pmatch[reg_idx].rm_so += match_first; 887146040Stjr pmatch[reg_idx].rm_eo += match_first; 888146040Stjr } 889146040Stjr for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx) 890146040Stjr { 891146040Stjr pmatch[nmatch + reg_idx].rm_so = -1; 892146040Stjr pmatch[nmatch + reg_idx].rm_eo = -1; 893146040Stjr } 894146040Stjr 895146040Stjr if (dfa->subexp_map) 896146040Stjr for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++) 897146040Stjr if (dfa->subexp_map[reg_idx] != reg_idx) 898146040Stjr { 899146040Stjr pmatch[reg_idx + 1].rm_so 900146040Stjr = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so; 901146040Stjr pmatch[reg_idx + 1].rm_eo 902146040Stjr = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo; 903146040Stjr } 904146040Stjr } 905146040Stjr 906146040Stjr free_return: 907146040Stjr re_free (mctx.state_log); 908146040Stjr if (dfa->nbackref) 909146040Stjr match_ctx_free (&mctx); 910146040Stjr re_string_destruct (&mctx.input); 911146040Stjr return err; 912146040Stjr} 913146040Stjr 914146040Stjrstatic reg_errcode_t 915146040Stjrprune_impossible_nodes (mctx) 916146040Stjr re_match_context_t *mctx; 917146040Stjr{ 918146040Stjr re_dfa_t *const dfa = mctx->dfa; 919146040Stjr int halt_node, match_last; 920146040Stjr reg_errcode_t ret; 921146040Stjr re_dfastate_t **sifted_states; 922146040Stjr re_dfastate_t **lim_states = NULL; 923146040Stjr re_sift_context_t sctx; 924146040Stjr#ifdef DEBUG 925146040Stjr assert (mctx->state_log != NULL); 926146040Stjr#endif 927146040Stjr match_last = mctx->match_last; 928146040Stjr halt_node = mctx->last_node; 929146040Stjr sifted_states = re_malloc (re_dfastate_t *, match_last + 1); 930146040Stjr if (BE (sifted_states == NULL, 0)) 931146040Stjr { 932146040Stjr ret = REG_ESPACE; 933146040Stjr goto free_return; 934146040Stjr } 935146040Stjr if (dfa->nbackref) 936146040Stjr { 937146040Stjr lim_states = re_malloc (re_dfastate_t *, match_last + 1); 938146040Stjr if (BE (lim_states == NULL, 0)) 939146040Stjr { 940146040Stjr ret = REG_ESPACE; 941146040Stjr goto free_return; 942146040Stjr } 943146040Stjr while (1) 944146040Stjr { 945146040Stjr memset (lim_states, '\0', 946146040Stjr sizeof (re_dfastate_t *) * (match_last + 1)); 947146040Stjr sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, 948146040Stjr match_last); 949146040Stjr ret = sift_states_backward (mctx, &sctx); 950146040Stjr re_node_set_free (&sctx.limits); 951146040Stjr if (BE (ret != REG_NOERROR, 0)) 952146040Stjr goto free_return; 953146040Stjr if (sifted_states[0] != NULL || lim_states[0] != NULL) 954146040Stjr break; 955146040Stjr do 956146040Stjr { 957146040Stjr --match_last; 958146040Stjr if (match_last < 0) 959146040Stjr { 960146040Stjr ret = REG_NOMATCH; 961146040Stjr goto free_return; 962146040Stjr } 963146040Stjr } while (mctx->state_log[match_last] == NULL 964146040Stjr || !mctx->state_log[match_last]->halt); 965146040Stjr halt_node = check_halt_state_context (mctx, 966146040Stjr mctx->state_log[match_last], 967146040Stjr match_last); 968146040Stjr } 969146040Stjr ret = merge_state_array (dfa, sifted_states, lim_states, 970146040Stjr match_last + 1); 971146040Stjr re_free (lim_states); 972146040Stjr lim_states = NULL; 973146040Stjr if (BE (ret != REG_NOERROR, 0)) 974146040Stjr goto free_return; 975146040Stjr } 976146040Stjr else 977146040Stjr { 978146040Stjr sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last); 979146040Stjr ret = sift_states_backward (mctx, &sctx); 980146040Stjr re_node_set_free (&sctx.limits); 981146040Stjr if (BE (ret != REG_NOERROR, 0)) 982146040Stjr goto free_return; 983146040Stjr } 984146040Stjr re_free (mctx->state_log); 985146040Stjr mctx->state_log = sifted_states; 986146040Stjr sifted_states = NULL; 987146040Stjr mctx->last_node = halt_node; 988146040Stjr mctx->match_last = match_last; 989146040Stjr ret = REG_NOERROR; 990146040Stjr free_return: 991146040Stjr re_free (sifted_states); 992146040Stjr re_free (lim_states); 993146040Stjr return ret; 994146040Stjr} 995146040Stjr 996146040Stjr/* Acquire an initial state and return it. 997146040Stjr We must select appropriate initial state depending on the context, 998146040Stjr since initial states may have constraints like "\<", "^", etc.. */ 999146040Stjr 1000146040Stjrstatic inline re_dfastate_t * 1001146040Stjracquire_init_state_context (err, mctx, idx) 1002146040Stjr reg_errcode_t *err; 1003146040Stjr const re_match_context_t *mctx; 1004146040Stjr int idx; 1005146040Stjr{ 1006146040Stjr re_dfa_t *const dfa = mctx->dfa; 1007146040Stjr if (dfa->init_state->has_constraint) 1008146040Stjr { 1009146040Stjr unsigned int context; 1010146040Stjr context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags); 1011146040Stjr if (IS_WORD_CONTEXT (context)) 1012146040Stjr return dfa->init_state_word; 1013146040Stjr else if (IS_ORDINARY_CONTEXT (context)) 1014146040Stjr return dfa->init_state; 1015146040Stjr else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context)) 1016146040Stjr return dfa->init_state_begbuf; 1017146040Stjr else if (IS_NEWLINE_CONTEXT (context)) 1018146040Stjr return dfa->init_state_nl; 1019146040Stjr else if (IS_BEGBUF_CONTEXT (context)) 1020146040Stjr { 1021146040Stjr /* It is relatively rare case, then calculate on demand. */ 1022146040Stjr return re_acquire_state_context (err, dfa, 1023146040Stjr dfa->init_state->entrance_nodes, 1024146040Stjr context); 1025146040Stjr } 1026146040Stjr else 1027146040Stjr /* Must not happen? */ 1028146040Stjr return dfa->init_state; 1029146040Stjr } 1030146040Stjr else 1031146040Stjr return dfa->init_state; 1032146040Stjr} 1033146040Stjr 1034146040Stjr/* Check whether the regular expression match input string INPUT or not, 1035146040Stjr and return the index where the matching end, return -1 if not match, 1036146040Stjr or return -2 in case of an error. 1037146040Stjr FL_LONGEST_MATCH means we want the POSIX longest matching. 1038146040Stjr If P_MATCH_FIRST is not NULL, and the match fails, it is set to the 1039146040Stjr next place where we may want to try matching. 1040146040Stjr Note that the matcher assume that the maching starts from the current 1041146040Stjr index of the buffer. */ 1042146040Stjr 1043146040Stjrstatic int 1044146040Stjrcheck_matching (mctx, fl_longest_match, p_match_first) 1045146040Stjr re_match_context_t *mctx; 1046146040Stjr int fl_longest_match; 1047146040Stjr int *p_match_first; 1048146040Stjr{ 1049146040Stjr re_dfa_t *const dfa = mctx->dfa; 1050146040Stjr reg_errcode_t err; 1051146040Stjr int match = 0; 1052146040Stjr int match_last = -1; 1053146040Stjr int cur_str_idx = re_string_cur_idx (&mctx->input); 1054146040Stjr re_dfastate_t *cur_state; 1055146040Stjr int at_init_state = p_match_first != NULL; 1056146040Stjr int next_start_idx = cur_str_idx; 1057146040Stjr 1058146040Stjr err = REG_NOERROR; 1059146040Stjr cur_state = acquire_init_state_context (&err, mctx, cur_str_idx); 1060146040Stjr /* An initial state must not be NULL (invalid). */ 1061146040Stjr if (BE (cur_state == NULL, 0)) 1062146040Stjr { 1063146040Stjr assert (err == REG_ESPACE); 1064146040Stjr return -2; 1065146040Stjr } 1066146040Stjr 1067146040Stjr if (mctx->state_log != NULL) 1068146040Stjr { 1069146040Stjr mctx->state_log[cur_str_idx] = cur_state; 1070146040Stjr 1071146040Stjr /* Check OP_OPEN_SUBEXP in the initial state in case that we use them 1072146040Stjr later. E.g. Processing back references. */ 1073146040Stjr if (BE (dfa->nbackref, 0)) 1074146040Stjr { 1075146040Stjr at_init_state = 0; 1076146040Stjr err = check_subexp_matching_top (mctx, &cur_state->nodes, 0); 1077146040Stjr if (BE (err != REG_NOERROR, 0)) 1078146040Stjr return err; 1079146040Stjr 1080146040Stjr if (cur_state->has_backref) 1081146040Stjr { 1082146040Stjr err = transit_state_bkref (mctx, &cur_state->nodes); 1083146040Stjr if (BE (err != REG_NOERROR, 0)) 1084146040Stjr return err; 1085146040Stjr } 1086146040Stjr } 1087146040Stjr } 1088146040Stjr 1089146040Stjr /* If the RE accepts NULL string. */ 1090146040Stjr if (BE (cur_state->halt, 0)) 1091146040Stjr { 1092146040Stjr if (!cur_state->has_constraint 1093146040Stjr || check_halt_state_context (mctx, cur_state, cur_str_idx)) 1094146040Stjr { 1095146040Stjr if (!fl_longest_match) 1096146040Stjr return cur_str_idx; 1097146040Stjr else 1098146040Stjr { 1099146040Stjr match_last = cur_str_idx; 1100146040Stjr match = 1; 1101146040Stjr } 1102146040Stjr } 1103146040Stjr } 1104146040Stjr 1105146040Stjr while (!re_string_eoi (&mctx->input)) 1106146040Stjr { 1107146040Stjr re_dfastate_t *old_state = cur_state; 1108146040Stjr int next_char_idx = re_string_cur_idx (&mctx->input) + 1; 1109146040Stjr 1110146040Stjr if (BE (next_char_idx >= mctx->input.bufs_len, 0) 1111146040Stjr || (BE (next_char_idx >= mctx->input.valid_len, 0) 1112146040Stjr && mctx->input.valid_len < mctx->input.len)) 1113146040Stjr { 1114146040Stjr err = extend_buffers (mctx); 1115146040Stjr if (BE (err != REG_NOERROR, 0)) 1116146040Stjr { 1117146040Stjr assert (err == REG_ESPACE); 1118146040Stjr return -2; 1119146040Stjr } 1120146040Stjr } 1121146040Stjr 1122146040Stjr cur_state = transit_state (&err, mctx, cur_state); 1123146040Stjr if (mctx->state_log != NULL) 1124146040Stjr cur_state = merge_state_with_log (&err, mctx, cur_state); 1125146040Stjr 1126146040Stjr if (cur_state == NULL) 1127146040Stjr { 1128146040Stjr /* Reached the invalid state or an error. Try to recover a valid 1129146040Stjr state using the state log, if available and if we have not 1130146040Stjr already found a valid (even if not the longest) match. */ 1131146040Stjr if (BE (err != REG_NOERROR, 0)) 1132146040Stjr return -2; 1133146040Stjr 1134146040Stjr if (mctx->state_log == NULL 1135146040Stjr || (match && !fl_longest_match) 1136146040Stjr || (cur_state = find_recover_state (&err, mctx)) == NULL) 1137146040Stjr break; 1138146040Stjr } 1139146040Stjr 1140146040Stjr if (BE (at_init_state, 0)) 1141146040Stjr { 1142146040Stjr if (old_state == cur_state) 1143146040Stjr next_start_idx = next_char_idx; 1144146040Stjr else 1145146040Stjr at_init_state = 0; 1146146040Stjr } 1147146040Stjr 1148146040Stjr if (cur_state->halt) 1149146040Stjr { 1150146040Stjr /* Reached a halt state. 1151146040Stjr Check the halt state can satisfy the current context. */ 1152146040Stjr if (!cur_state->has_constraint 1153146040Stjr || check_halt_state_context (mctx, cur_state, 1154146040Stjr re_string_cur_idx (&mctx->input))) 1155146040Stjr { 1156146040Stjr /* We found an appropriate halt state. */ 1157146040Stjr match_last = re_string_cur_idx (&mctx->input); 1158146040Stjr match = 1; 1159146040Stjr 1160146040Stjr /* We found a match, do not modify match_first below. */ 1161146040Stjr p_match_first = NULL; 1162146040Stjr if (!fl_longest_match) 1163146040Stjr break; 1164146040Stjr } 1165146040Stjr } 1166146040Stjr } 1167146040Stjr 1168146040Stjr if (p_match_first) 1169146040Stjr *p_match_first += next_start_idx; 1170146040Stjr 1171146040Stjr return match_last; 1172146040Stjr} 1173146040Stjr 1174146040Stjr/* Check NODE match the current context. */ 1175146040Stjr 1176146040Stjrstatic int check_halt_node_context (dfa, node, context) 1177146040Stjr const re_dfa_t *dfa; 1178146040Stjr int node; 1179146040Stjr unsigned int context; 1180146040Stjr{ 1181146040Stjr re_token_type_t type = dfa->nodes[node].type; 1182146040Stjr unsigned int constraint = dfa->nodes[node].constraint; 1183146040Stjr if (type != END_OF_RE) 1184146040Stjr return 0; 1185146040Stjr if (!constraint) 1186146040Stjr return 1; 1187146040Stjr if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context)) 1188146040Stjr return 0; 1189146040Stjr return 1; 1190146040Stjr} 1191146040Stjr 1192146040Stjr/* Check the halt state STATE match the current context. 1193146040Stjr Return 0 if not match, if the node, STATE has, is a halt node and 1194146040Stjr match the context, return the node. */ 1195146040Stjr 1196146040Stjrstatic int 1197146040Stjrcheck_halt_state_context (mctx, state, idx) 1198146040Stjr const re_match_context_t *mctx; 1199146040Stjr const re_dfastate_t *state; 1200146040Stjr int idx; 1201146040Stjr{ 1202146040Stjr int i; 1203146040Stjr unsigned int context; 1204146040Stjr#ifdef DEBUG 1205146040Stjr assert (state->halt); 1206146040Stjr#endif 1207146040Stjr context = re_string_context_at (&mctx->input, idx, mctx->eflags); 1208146040Stjr for (i = 0; i < state->nodes.nelem; ++i) 1209146040Stjr if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context)) 1210146040Stjr return state->nodes.elems[i]; 1211146040Stjr return 0; 1212146040Stjr} 1213146040Stjr 1214146040Stjr/* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA 1215146040Stjr corresponding to the DFA). 1216146040Stjr Return the destination node, and update EPS_VIA_NODES, return -1 in case 1217146040Stjr of errors. */ 1218146040Stjr 1219146040Stjrstatic int 1220146040Stjrproceed_next_node (mctx, nregs, regs, pidx, node, eps_via_nodes, fs) 1221146040Stjr const re_match_context_t *mctx; 1222146040Stjr regmatch_t *regs; 1223146040Stjr int nregs, *pidx, node; 1224146040Stjr re_node_set *eps_via_nodes; 1225146040Stjr struct re_fail_stack_t *fs; 1226146040Stjr{ 1227146040Stjr re_dfa_t *const dfa = mctx->dfa; 1228146040Stjr int i, err, dest_node; 1229146040Stjr dest_node = -1; 1230146040Stjr if (IS_EPSILON_NODE (dfa->nodes[node].type)) 1231146040Stjr { 1232146040Stjr re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes; 1233146040Stjr re_node_set *edests = &dfa->edests[node]; 1234146040Stjr int dest_node; 1235146040Stjr err = re_node_set_insert (eps_via_nodes, node); 1236146040Stjr if (BE (err < 0, 0)) 1237146040Stjr return -2; 1238146040Stjr /* Pick up a valid destination, or return -1 if none is found. */ 1239146040Stjr for (dest_node = -1, i = 0; i < edests->nelem; ++i) 1240146040Stjr { 1241146040Stjr int candidate = edests->elems[i]; 1242146040Stjr if (!re_node_set_contains (cur_nodes, candidate)) 1243146040Stjr continue; 1244146040Stjr if (dest_node == -1) 1245146040Stjr dest_node = candidate; 1246146040Stjr 1247146040Stjr else 1248146040Stjr { 1249146040Stjr /* In order to avoid infinite loop like "(a*)*", return the second 1250146040Stjr epsilon-transition if the first was already considered. */ 1251146040Stjr if (re_node_set_contains (eps_via_nodes, dest_node)) 1252146040Stjr return candidate; 1253146040Stjr 1254146040Stjr /* Otherwise, push the second epsilon-transition on the fail stack. */ 1255146040Stjr else if (fs != NULL 1256146040Stjr && push_fail_stack (fs, *pidx, candidate, nregs, regs, 1257146040Stjr eps_via_nodes)) 1258146040Stjr return -2; 1259146040Stjr 1260146040Stjr /* We know we are going to exit. */ 1261146040Stjr break; 1262146040Stjr } 1263146040Stjr } 1264146040Stjr return dest_node; 1265146040Stjr } 1266146040Stjr else 1267146040Stjr { 1268146040Stjr int naccepted = 0; 1269146040Stjr re_token_type_t type = dfa->nodes[node].type; 1270146040Stjr 1271146040Stjr#ifdef RE_ENABLE_I18N 1272146040Stjr if (dfa->nodes[node].accept_mb) 1273146040Stjr naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx); 1274146040Stjr else 1275146040Stjr#endif /* RE_ENABLE_I18N */ 1276146040Stjr if (type == OP_BACK_REF) 1277146040Stjr { 1278146040Stjr int subexp_idx = dfa->nodes[node].opr.idx + 1; 1279146040Stjr naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so; 1280146040Stjr if (fs != NULL) 1281146040Stjr { 1282146040Stjr if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1) 1283146040Stjr return -1; 1284146040Stjr else if (naccepted) 1285146040Stjr { 1286146040Stjr char *buf = (char *) re_string_get_buffer (&mctx->input); 1287146040Stjr if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx, 1288146040Stjr naccepted) != 0) 1289146040Stjr return -1; 1290146040Stjr } 1291146040Stjr } 1292146040Stjr 1293146040Stjr if (naccepted == 0) 1294146040Stjr { 1295146040Stjr err = re_node_set_insert (eps_via_nodes, node); 1296146040Stjr if (BE (err < 0, 0)) 1297146040Stjr return -2; 1298146040Stjr dest_node = dfa->edests[node].elems[0]; 1299146040Stjr if (re_node_set_contains (&mctx->state_log[*pidx]->nodes, 1300146040Stjr dest_node)) 1301146040Stjr return dest_node; 1302146040Stjr } 1303146040Stjr } 1304146040Stjr 1305146040Stjr if (naccepted != 0 1306146040Stjr || check_node_accept (mctx, dfa->nodes + node, *pidx)) 1307146040Stjr { 1308146040Stjr dest_node = dfa->nexts[node]; 1309146040Stjr *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted; 1310146040Stjr if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL 1311146040Stjr || !re_node_set_contains (&mctx->state_log[*pidx]->nodes, 1312146040Stjr dest_node))) 1313146040Stjr return -1; 1314146040Stjr re_node_set_empty (eps_via_nodes); 1315146040Stjr return dest_node; 1316146040Stjr } 1317146040Stjr } 1318146040Stjr return -1; 1319146040Stjr} 1320146040Stjr 1321146040Stjrstatic reg_errcode_t 1322146040Stjrpush_fail_stack (fs, str_idx, dest_node, nregs, regs, eps_via_nodes) 1323146040Stjr struct re_fail_stack_t *fs; 1324146040Stjr int str_idx, dest_node, nregs; 1325146040Stjr regmatch_t *regs; 1326146040Stjr re_node_set *eps_via_nodes; 1327146040Stjr{ 1328146040Stjr reg_errcode_t err; 1329146040Stjr int num = fs->num++; 1330146040Stjr if (fs->num == fs->alloc) 1331146040Stjr { 1332146040Stjr struct re_fail_stack_ent_t *new_array; 1333146040Stjr new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t) 1334146040Stjr * fs->alloc * 2)); 1335146040Stjr if (new_array == NULL) 1336146040Stjr return REG_ESPACE; 1337146040Stjr fs->alloc *= 2; 1338146040Stjr fs->stack = new_array; 1339146040Stjr } 1340146040Stjr fs->stack[num].idx = str_idx; 1341146040Stjr fs->stack[num].node = dest_node; 1342146040Stjr fs->stack[num].regs = re_malloc (regmatch_t, nregs); 1343146040Stjr if (fs->stack[num].regs == NULL) 1344146040Stjr return REG_ESPACE; 1345146040Stjr memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs); 1346146040Stjr err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes); 1347146040Stjr return err; 1348146040Stjr} 1349146040Stjr 1350146040Stjrstatic int 1351146040Stjrpop_fail_stack (fs, pidx, nregs, regs, eps_via_nodes) 1352146040Stjr struct re_fail_stack_t *fs; 1353146040Stjr int *pidx, nregs; 1354146040Stjr regmatch_t *regs; 1355146040Stjr re_node_set *eps_via_nodes; 1356146040Stjr{ 1357146040Stjr int num = --fs->num; 1358146040Stjr assert (num >= 0); 1359146040Stjr *pidx = fs->stack[num].idx; 1360146040Stjr memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs); 1361146040Stjr re_node_set_free (eps_via_nodes); 1362146040Stjr re_free (fs->stack[num].regs); 1363146040Stjr *eps_via_nodes = fs->stack[num].eps_via_nodes; 1364146040Stjr return fs->stack[num].node; 1365146040Stjr} 1366146040Stjr 1367146040Stjr/* Set the positions where the subexpressions are starts/ends to registers 1368146040Stjr PMATCH. 1369146040Stjr Note: We assume that pmatch[0] is already set, and 1370146040Stjr pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */ 1371146040Stjr 1372146040Stjrstatic reg_errcode_t 1373146040Stjrset_regs (preg, mctx, nmatch, pmatch, fl_backtrack) 1374146040Stjr const regex_t *preg; 1375146040Stjr const re_match_context_t *mctx; 1376146040Stjr size_t nmatch; 1377146040Stjr regmatch_t *pmatch; 1378146040Stjr int fl_backtrack; 1379146040Stjr{ 1380146040Stjr re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 1381146040Stjr int idx, cur_node; 1382146040Stjr re_node_set eps_via_nodes; 1383146040Stjr struct re_fail_stack_t *fs; 1384146040Stjr struct re_fail_stack_t fs_body = { 0, 2, NULL }; 1385146040Stjr regmatch_t *prev_idx_match; 1386146040Stjr 1387146040Stjr#ifdef DEBUG 1388146040Stjr assert (nmatch > 1); 1389146040Stjr assert (mctx->state_log != NULL); 1390146040Stjr#endif 1391146040Stjr if (fl_backtrack) 1392146040Stjr { 1393146040Stjr fs = &fs_body; 1394146040Stjr fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc); 1395146040Stjr if (fs->stack == NULL) 1396146040Stjr return REG_ESPACE; 1397146040Stjr } 1398146040Stjr else 1399146040Stjr fs = NULL; 1400146040Stjr 1401146040Stjr cur_node = dfa->init_node; 1402146040Stjr re_node_set_init_empty (&eps_via_nodes); 1403146040Stjr 1404146040Stjr prev_idx_match = (regmatch_t *) alloca (sizeof (regmatch_t) * nmatch); 1405146040Stjr memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch); 1406146040Stjr 1407146040Stjr for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;) 1408146040Stjr { 1409146040Stjr update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch); 1410146040Stjr 1411146040Stjr if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node) 1412146040Stjr { 1413146040Stjr int reg_idx; 1414146040Stjr if (fs) 1415146040Stjr { 1416146040Stjr for (reg_idx = 0; reg_idx < nmatch; ++reg_idx) 1417146040Stjr if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1) 1418146040Stjr break; 1419146040Stjr if (reg_idx == nmatch) 1420146040Stjr { 1421146040Stjr re_node_set_free (&eps_via_nodes); 1422146040Stjr return free_fail_stack_return (fs); 1423146040Stjr } 1424146040Stjr cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch, 1425146040Stjr &eps_via_nodes); 1426146040Stjr } 1427146040Stjr else 1428146040Stjr { 1429146040Stjr re_node_set_free (&eps_via_nodes); 1430146040Stjr return REG_NOERROR; 1431146040Stjr } 1432146040Stjr } 1433146040Stjr 1434146040Stjr /* Proceed to next node. */ 1435146040Stjr cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node, 1436146040Stjr &eps_via_nodes, fs); 1437146040Stjr 1438146040Stjr if (BE (cur_node < 0, 0)) 1439146040Stjr { 1440146040Stjr if (BE (cur_node == -2, 0)) 1441146040Stjr { 1442146040Stjr re_node_set_free (&eps_via_nodes); 1443146040Stjr free_fail_stack_return (fs); 1444146040Stjr return REG_ESPACE; 1445146040Stjr } 1446146040Stjr if (fs) 1447146040Stjr cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch, 1448146040Stjr &eps_via_nodes); 1449146040Stjr else 1450146040Stjr { 1451146040Stjr re_node_set_free (&eps_via_nodes); 1452146040Stjr return REG_NOMATCH; 1453146040Stjr } 1454146040Stjr } 1455146040Stjr } 1456146040Stjr re_node_set_free (&eps_via_nodes); 1457146040Stjr return free_fail_stack_return (fs); 1458146040Stjr} 1459146040Stjr 1460146040Stjrstatic reg_errcode_t 1461146040Stjrfree_fail_stack_return (fs) 1462146040Stjr struct re_fail_stack_t *fs; 1463146040Stjr{ 1464146040Stjr if (fs) 1465146040Stjr { 1466146040Stjr int fs_idx; 1467146040Stjr for (fs_idx = 0; fs_idx < fs->num; ++fs_idx) 1468146040Stjr { 1469146040Stjr re_node_set_free (&fs->stack[fs_idx].eps_via_nodes); 1470146040Stjr re_free (fs->stack[fs_idx].regs); 1471146040Stjr } 1472146040Stjr re_free (fs->stack); 1473146040Stjr } 1474146040Stjr return REG_NOERROR; 1475146040Stjr} 1476146040Stjr 1477146040Stjrstatic void 1478146040Stjrupdate_regs (dfa, pmatch, prev_idx_match, cur_node, cur_idx, nmatch) 1479146040Stjr re_dfa_t *dfa; 1480146040Stjr regmatch_t *pmatch, *prev_idx_match; 1481146040Stjr int cur_node, cur_idx, nmatch; 1482146040Stjr{ 1483146040Stjr int type = dfa->nodes[cur_node].type; 1484146040Stjr if (type == OP_OPEN_SUBEXP) 1485146040Stjr { 1486146040Stjr int reg_num = dfa->nodes[cur_node].opr.idx + 1; 1487146040Stjr 1488146040Stjr /* We are at the first node of this sub expression. */ 1489146040Stjr if (reg_num < nmatch) 1490146040Stjr { 1491146040Stjr pmatch[reg_num].rm_so = cur_idx; 1492146040Stjr pmatch[reg_num].rm_eo = -1; 1493146040Stjr } 1494146040Stjr } 1495146040Stjr else if (type == OP_CLOSE_SUBEXP) 1496146040Stjr { 1497146040Stjr int reg_num = dfa->nodes[cur_node].opr.idx + 1; 1498146040Stjr if (reg_num < nmatch) 1499146040Stjr { 1500146040Stjr /* We are at the last node of this sub expression. */ 1501146040Stjr if (pmatch[reg_num].rm_so < cur_idx) 1502146040Stjr { 1503146040Stjr pmatch[reg_num].rm_eo = cur_idx; 1504146040Stjr /* This is a non-empty match or we are not inside an optional 1505146040Stjr subexpression. Accept this right away. */ 1506146040Stjr memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch); 1507146040Stjr } 1508146040Stjr else 1509146040Stjr { 1510146040Stjr if (dfa->nodes[cur_node].opt_subexp 1511146040Stjr && prev_idx_match[reg_num].rm_so != -1) 1512146040Stjr /* We transited through an empty match for an optional 1513146040Stjr subexpression, like (a?)*, and this is not the subexp's 1514146040Stjr first match. Copy back the old content of the registers 1515146040Stjr so that matches of an inner subexpression are undone as 1516146040Stjr well, like in ((a?))*. */ 1517146040Stjr memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch); 1518146040Stjr else 1519146040Stjr /* We completed a subexpression, but it may be part of 1520146040Stjr an optional one, so do not update PREV_IDX_MATCH. */ 1521146040Stjr pmatch[reg_num].rm_eo = cur_idx; 1522146040Stjr } 1523146040Stjr } 1524146040Stjr } 1525146040Stjr} 1526146040Stjr 1527146040Stjr/* This function checks the STATE_LOG from the SCTX->last_str_idx to 0 1528146040Stjr and sift the nodes in each states according to the following rules. 1529146040Stjr Updated state_log will be wrote to STATE_LOG. 1530146040Stjr 1531146040Stjr Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if... 1532146040Stjr 1. When STR_IDX == MATCH_LAST(the last index in the state_log): 1533146040Stjr If `a' isn't the LAST_NODE and `a' can't epsilon transit to 1534146040Stjr the LAST_NODE, we throw away the node `a'. 1535146040Stjr 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts 1536146040Stjr string `s' and transit to `b': 1537146040Stjr i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw 1538146040Stjr away the node `a'. 1539146040Stjr ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is 1540146040Stjr thrown away, we throw away the node `a'. 1541146040Stjr 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b': 1542146040Stjr i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the 1543146040Stjr node `a'. 1544146040Stjr ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away, 1545146040Stjr we throw away the node `a'. */ 1546146040Stjr 1547146040Stjr#define STATE_NODE_CONTAINS(state,node) \ 1548146040Stjr ((state) != NULL && re_node_set_contains (&(state)->nodes, node)) 1549146040Stjr 1550146040Stjrstatic reg_errcode_t 1551146040Stjrsift_states_backward (mctx, sctx) 1552146040Stjr re_match_context_t *mctx; 1553146040Stjr re_sift_context_t *sctx; 1554146040Stjr{ 1555146040Stjr reg_errcode_t err; 1556146040Stjr int null_cnt = 0; 1557146040Stjr int str_idx = sctx->last_str_idx; 1558146040Stjr re_node_set cur_dest; 1559146040Stjr 1560146040Stjr#ifdef DEBUG 1561146040Stjr assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL); 1562146040Stjr#endif 1563146040Stjr 1564146040Stjr /* Build sifted state_log[str_idx]. It has the nodes which can epsilon 1565146040Stjr transit to the last_node and the last_node itself. */ 1566146040Stjr err = re_node_set_init_1 (&cur_dest, sctx->last_node); 1567146040Stjr if (BE (err != REG_NOERROR, 0)) 1568146040Stjr return err; 1569146040Stjr err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest); 1570146040Stjr if (BE (err != REG_NOERROR, 0)) 1571146040Stjr goto free_return; 1572146040Stjr 1573146040Stjr /* Then check each states in the state_log. */ 1574146040Stjr while (str_idx > 0) 1575146040Stjr { 1576146040Stjr /* Update counters. */ 1577146040Stjr null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0; 1578146040Stjr if (null_cnt > mctx->max_mb_elem_len) 1579146040Stjr { 1580146040Stjr memset (sctx->sifted_states, '\0', 1581146040Stjr sizeof (re_dfastate_t *) * str_idx); 1582146040Stjr re_node_set_free (&cur_dest); 1583146040Stjr return REG_NOERROR; 1584146040Stjr } 1585146040Stjr re_node_set_empty (&cur_dest); 1586146040Stjr --str_idx; 1587146040Stjr 1588146040Stjr if (mctx->state_log[str_idx]) 1589146040Stjr { 1590146040Stjr err = build_sifted_states (mctx, sctx, str_idx, &cur_dest); 1591146040Stjr if (BE (err != REG_NOERROR, 0)) 1592146040Stjr goto free_return; 1593146040Stjr } 1594146040Stjr 1595146040Stjr /* Add all the nodes which satisfy the following conditions: 1596146040Stjr - It can epsilon transit to a node in CUR_DEST. 1597146040Stjr - It is in CUR_SRC. 1598146040Stjr And update state_log. */ 1599146040Stjr err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest); 1600146040Stjr if (BE (err != REG_NOERROR, 0)) 1601146040Stjr goto free_return; 1602146040Stjr } 1603146040Stjr err = REG_NOERROR; 1604146040Stjr free_return: 1605146040Stjr re_node_set_free (&cur_dest); 1606146040Stjr return err; 1607146040Stjr} 1608146040Stjr 1609146040Stjrstatic reg_errcode_t 1610146040Stjrbuild_sifted_states (mctx, sctx, str_idx, cur_dest) 1611146040Stjr re_match_context_t *mctx; 1612146040Stjr re_sift_context_t *sctx; 1613146040Stjr int str_idx; 1614146040Stjr re_node_set *cur_dest; 1615146040Stjr{ 1616146040Stjr re_dfa_t *const dfa = mctx->dfa; 1617146040Stjr re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes; 1618146040Stjr int i; 1619146040Stjr 1620146040Stjr /* Then build the next sifted state. 1621146040Stjr We build the next sifted state on `cur_dest', and update 1622146040Stjr `sifted_states[str_idx]' with `cur_dest'. 1623146040Stjr Note: 1624146040Stjr `cur_dest' is the sifted state from `state_log[str_idx + 1]'. 1625146040Stjr `cur_src' points the node_set of the old `state_log[str_idx]' 1626146040Stjr (with the epsilon nodes pre-filtered out). */ 1627146040Stjr for (i = 0; i < cur_src->nelem; i++) 1628146040Stjr { 1629146040Stjr int prev_node = cur_src->elems[i]; 1630146040Stjr int naccepted = 0; 1631146040Stjr int ret; 1632146040Stjr 1633146040Stjr#ifdef DEBUG 1634146040Stjr re_token_type_t type = dfa->nodes[prev_node].type; 1635146040Stjr assert (!IS_EPSILON_NODE (type)); 1636146040Stjr#endif 1637146040Stjr#ifdef RE_ENABLE_I18N 1638146040Stjr /* If the node may accept `multi byte'. */ 1639146040Stjr if (dfa->nodes[prev_node].accept_mb) 1640146040Stjr naccepted = sift_states_iter_mb (mctx, sctx, prev_node, 1641146040Stjr str_idx, sctx->last_str_idx); 1642146040Stjr#endif /* RE_ENABLE_I18N */ 1643146040Stjr 1644146040Stjr /* We don't check backreferences here. 1645146040Stjr See update_cur_sifted_state(). */ 1646146040Stjr if (!naccepted 1647146040Stjr && check_node_accept (mctx, dfa->nodes + prev_node, str_idx) 1648146040Stjr && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1], 1649146040Stjr dfa->nexts[prev_node])) 1650146040Stjr naccepted = 1; 1651146040Stjr 1652146040Stjr if (naccepted == 0) 1653146040Stjr continue; 1654146040Stjr 1655146040Stjr if (sctx->limits.nelem) 1656146040Stjr { 1657146040Stjr int to_idx = str_idx + naccepted; 1658146040Stjr if (check_dst_limits (mctx, &sctx->limits, 1659146040Stjr dfa->nexts[prev_node], to_idx, 1660146040Stjr prev_node, str_idx)) 1661146040Stjr continue; 1662146040Stjr } 1663146040Stjr ret = re_node_set_insert (cur_dest, prev_node); 1664146040Stjr if (BE (ret == -1, 0)) 1665146040Stjr return REG_ESPACE; 1666146040Stjr } 1667146040Stjr 1668146040Stjr return REG_NOERROR; 1669146040Stjr} 1670146040Stjr 1671146040Stjr/* Helper functions. */ 1672146040Stjr 1673146040Stjrstatic reg_errcode_t 1674146040Stjrclean_state_log_if_needed (mctx, next_state_log_idx) 1675146040Stjr re_match_context_t *mctx; 1676146040Stjr int next_state_log_idx; 1677146040Stjr{ 1678146040Stjr int top = mctx->state_log_top; 1679146040Stjr 1680146040Stjr if (next_state_log_idx >= mctx->input.bufs_len 1681146040Stjr || (next_state_log_idx >= mctx->input.valid_len 1682146040Stjr && mctx->input.valid_len < mctx->input.len)) 1683146040Stjr { 1684146040Stjr reg_errcode_t err; 1685146040Stjr err = extend_buffers (mctx); 1686146040Stjr if (BE (err != REG_NOERROR, 0)) 1687146040Stjr return err; 1688146040Stjr } 1689146040Stjr 1690146040Stjr if (top < next_state_log_idx) 1691146040Stjr { 1692146040Stjr memset (mctx->state_log + top + 1, '\0', 1693146040Stjr sizeof (re_dfastate_t *) * (next_state_log_idx - top)); 1694146040Stjr mctx->state_log_top = next_state_log_idx; 1695146040Stjr } 1696146040Stjr return REG_NOERROR; 1697146040Stjr} 1698146040Stjr 1699146040Stjrstatic reg_errcode_t 1700146040Stjrmerge_state_array (dfa, dst, src, num) 1701146040Stjr re_dfa_t *dfa; 1702146040Stjr re_dfastate_t **dst; 1703146040Stjr re_dfastate_t **src; 1704146040Stjr int num; 1705146040Stjr{ 1706146040Stjr int st_idx; 1707146040Stjr reg_errcode_t err; 1708146040Stjr for (st_idx = 0; st_idx < num; ++st_idx) 1709146040Stjr { 1710146040Stjr if (dst[st_idx] == NULL) 1711146040Stjr dst[st_idx] = src[st_idx]; 1712146040Stjr else if (src[st_idx] != NULL) 1713146040Stjr { 1714146040Stjr re_node_set merged_set; 1715146040Stjr err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes, 1716146040Stjr &src[st_idx]->nodes); 1717146040Stjr if (BE (err != REG_NOERROR, 0)) 1718146040Stjr return err; 1719146040Stjr dst[st_idx] = re_acquire_state (&err, dfa, &merged_set); 1720146040Stjr re_node_set_free (&merged_set); 1721146040Stjr if (BE (err != REG_NOERROR, 0)) 1722146040Stjr return err; 1723146040Stjr } 1724146040Stjr } 1725146040Stjr return REG_NOERROR; 1726146040Stjr} 1727146040Stjr 1728146040Stjrstatic reg_errcode_t 1729146040Stjrupdate_cur_sifted_state (mctx, sctx, str_idx, dest_nodes) 1730146040Stjr re_match_context_t *mctx; 1731146040Stjr re_sift_context_t *sctx; 1732146040Stjr int str_idx; 1733146040Stjr re_node_set *dest_nodes; 1734146040Stjr{ 1735146040Stjr re_dfa_t *const dfa = mctx->dfa; 1736146040Stjr reg_errcode_t err; 1737146040Stjr const re_node_set *candidates; 1738146040Stjr candidates = ((mctx->state_log[str_idx] == NULL) ? NULL 1739146040Stjr : &mctx->state_log[str_idx]->nodes); 1740146040Stjr 1741146040Stjr if (dest_nodes->nelem == 0) 1742146040Stjr sctx->sifted_states[str_idx] = NULL; 1743146040Stjr else 1744146040Stjr { 1745146040Stjr if (candidates) 1746146040Stjr { 1747146040Stjr /* At first, add the nodes which can epsilon transit to a node in 1748146040Stjr DEST_NODE. */ 1749146040Stjr err = add_epsilon_src_nodes (dfa, dest_nodes, candidates); 1750146040Stjr if (BE (err != REG_NOERROR, 0)) 1751146040Stjr return err; 1752146040Stjr 1753146040Stjr /* Then, check the limitations in the current sift_context. */ 1754146040Stjr if (sctx->limits.nelem) 1755146040Stjr { 1756146040Stjr err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits, 1757146040Stjr mctx->bkref_ents, str_idx); 1758146040Stjr if (BE (err != REG_NOERROR, 0)) 1759146040Stjr return err; 1760146040Stjr } 1761146040Stjr } 1762146040Stjr 1763146040Stjr sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes); 1764146040Stjr if (BE (err != REG_NOERROR, 0)) 1765146040Stjr return err; 1766146040Stjr } 1767146040Stjr 1768146040Stjr if (candidates && mctx->state_log[str_idx]->has_backref) 1769146040Stjr { 1770146040Stjr err = sift_states_bkref (mctx, sctx, str_idx, candidates); 1771146040Stjr if (BE (err != REG_NOERROR, 0)) 1772146040Stjr return err; 1773146040Stjr } 1774146040Stjr return REG_NOERROR; 1775146040Stjr} 1776146040Stjr 1777146040Stjrstatic reg_errcode_t 1778146040Stjradd_epsilon_src_nodes (dfa, dest_nodes, candidates) 1779146040Stjr re_dfa_t *dfa; 1780146040Stjr re_node_set *dest_nodes; 1781146040Stjr const re_node_set *candidates; 1782146040Stjr{ 1783146040Stjr reg_errcode_t err = REG_NOERROR; 1784146040Stjr int i; 1785146040Stjr 1786146040Stjr re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes); 1787146040Stjr if (BE (err != REG_NOERROR, 0)) 1788146040Stjr return err; 1789146040Stjr 1790146040Stjr if (!state->inveclosure.alloc) 1791146040Stjr { 1792146040Stjr err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem); 1793146040Stjr if (BE (err != REG_NOERROR, 0)) 1794146040Stjr return REG_ESPACE; 1795146040Stjr for (i = 0; i < dest_nodes->nelem; i++) 1796146040Stjr re_node_set_merge (&state->inveclosure, 1797146040Stjr dfa->inveclosures + dest_nodes->elems[i]); 1798146040Stjr } 1799146040Stjr return re_node_set_add_intersect (dest_nodes, candidates, 1800146040Stjr &state->inveclosure); 1801146040Stjr} 1802146040Stjr 1803146040Stjrstatic reg_errcode_t 1804146040Stjrsub_epsilon_src_nodes (dfa, node, dest_nodes, candidates) 1805146040Stjr re_dfa_t *dfa; 1806146040Stjr int node; 1807146040Stjr re_node_set *dest_nodes; 1808146040Stjr const re_node_set *candidates; 1809146040Stjr{ 1810146040Stjr int ecl_idx; 1811146040Stjr reg_errcode_t err; 1812146040Stjr re_node_set *inv_eclosure = dfa->inveclosures + node; 1813146040Stjr re_node_set except_nodes; 1814146040Stjr re_node_set_init_empty (&except_nodes); 1815146040Stjr for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx) 1816146040Stjr { 1817146040Stjr int cur_node = inv_eclosure->elems[ecl_idx]; 1818146040Stjr if (cur_node == node) 1819146040Stjr continue; 1820146040Stjr if (IS_EPSILON_NODE (dfa->nodes[cur_node].type)) 1821146040Stjr { 1822146040Stjr int edst1 = dfa->edests[cur_node].elems[0]; 1823146040Stjr int edst2 = ((dfa->edests[cur_node].nelem > 1) 1824146040Stjr ? dfa->edests[cur_node].elems[1] : -1); 1825146040Stjr if ((!re_node_set_contains (inv_eclosure, edst1) 1826146040Stjr && re_node_set_contains (dest_nodes, edst1)) 1827146040Stjr || (edst2 > 0 1828146040Stjr && !re_node_set_contains (inv_eclosure, edst2) 1829146040Stjr && re_node_set_contains (dest_nodes, edst2))) 1830146040Stjr { 1831146040Stjr err = re_node_set_add_intersect (&except_nodes, candidates, 1832146040Stjr dfa->inveclosures + cur_node); 1833146040Stjr if (BE (err != REG_NOERROR, 0)) 1834146040Stjr { 1835146040Stjr re_node_set_free (&except_nodes); 1836146040Stjr return err; 1837146040Stjr } 1838146040Stjr } 1839146040Stjr } 1840146040Stjr } 1841146040Stjr for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx) 1842146040Stjr { 1843146040Stjr int cur_node = inv_eclosure->elems[ecl_idx]; 1844146040Stjr if (!re_node_set_contains (&except_nodes, cur_node)) 1845146040Stjr { 1846146040Stjr int idx = re_node_set_contains (dest_nodes, cur_node) - 1; 1847146040Stjr re_node_set_remove_at (dest_nodes, idx); 1848146040Stjr } 1849146040Stjr } 1850146040Stjr re_node_set_free (&except_nodes); 1851146040Stjr return REG_NOERROR; 1852146040Stjr} 1853146040Stjr 1854146040Stjrstatic int 1855146040Stjrcheck_dst_limits (mctx, limits, dst_node, dst_idx, src_node, src_idx) 1856146040Stjr re_match_context_t *mctx; 1857146040Stjr re_node_set *limits; 1858146040Stjr int dst_node, dst_idx, src_node, src_idx; 1859146040Stjr{ 1860146040Stjr re_dfa_t *const dfa = mctx->dfa; 1861146040Stjr int lim_idx, src_pos, dst_pos; 1862146040Stjr 1863146040Stjr int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx); 1864146040Stjr int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx); 1865146040Stjr for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx) 1866146040Stjr { 1867146040Stjr int subexp_idx; 1868146040Stjr struct re_backref_cache_entry *ent; 1869146040Stjr ent = mctx->bkref_ents + limits->elems[lim_idx]; 1870146040Stjr subexp_idx = dfa->nodes[ent->node].opr.idx; 1871146040Stjr 1872146040Stjr dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx], 1873146040Stjr subexp_idx, dst_node, dst_idx, 1874146040Stjr dst_bkref_idx); 1875146040Stjr src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx], 1876146040Stjr subexp_idx, src_node, src_idx, 1877146040Stjr src_bkref_idx); 1878146040Stjr 1879146040Stjr /* In case of: 1880146040Stjr <src> <dst> ( <subexp> ) 1881146040Stjr ( <subexp> ) <src> <dst> 1882146040Stjr ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */ 1883146040Stjr if (src_pos == dst_pos) 1884146040Stjr continue; /* This is unrelated limitation. */ 1885146040Stjr else 1886146040Stjr return 1; 1887146040Stjr } 1888146040Stjr return 0; 1889146040Stjr} 1890146040Stjr 1891146040Stjrstatic int 1892146040Stjrcheck_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx, from_node, bkref_idx) 1893146040Stjr re_match_context_t *mctx; 1894146040Stjr int boundaries, subexp_idx, from_node, bkref_idx; 1895146040Stjr{ 1896146040Stjr re_dfa_t *const dfa = mctx->dfa; 1897146040Stjr re_node_set *eclosures = dfa->eclosures + from_node; 1898146040Stjr int node_idx; 1899146040Stjr 1900146040Stjr /* Else, we are on the boundary: examine the nodes on the epsilon 1901146040Stjr closure. */ 1902146040Stjr for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx) 1903146040Stjr { 1904146040Stjr int node = eclosures->elems[node_idx]; 1905146040Stjr switch (dfa->nodes[node].type) 1906146040Stjr { 1907146040Stjr case OP_BACK_REF: 1908146040Stjr if (bkref_idx != -1) 1909146040Stjr { 1910146040Stjr struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx; 1911146040Stjr do 1912146040Stjr { 1913146040Stjr int dst, cpos; 1914146040Stjr 1915146040Stjr if (ent->node != node) 1916146040Stjr continue; 1917146040Stjr 1918146040Stjr if (subexp_idx <= 8 * sizeof (ent->eps_reachable_subexps_map) 1919146040Stjr && !(ent->eps_reachable_subexps_map & (1 << subexp_idx))) 1920146040Stjr continue; 1921146040Stjr 1922146040Stjr /* Recurse trying to reach the OP_OPEN_SUBEXP and 1923146040Stjr OP_CLOSE_SUBEXP cases below. But, if the 1924146040Stjr destination node is the same node as the source 1925146040Stjr node, don't recurse because it would cause an 1926146040Stjr infinite loop: a regex that exhibits this behavior 1927146040Stjr is ()\1*\1* */ 1928146040Stjr dst = dfa->edests[node].elems[0]; 1929146040Stjr if (dst == from_node) 1930146040Stjr { 1931146040Stjr if (boundaries & 1) 1932146040Stjr return -1; 1933146040Stjr else /* if (boundaries & 2) */ 1934146040Stjr return 0; 1935146040Stjr } 1936146040Stjr 1937146040Stjr cpos = 1938146040Stjr check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx, 1939146040Stjr dst, bkref_idx); 1940146040Stjr if (cpos == -1 /* && (boundaries & 1) */) 1941146040Stjr return -1; 1942146040Stjr if (cpos == 0 && (boundaries & 2)) 1943146040Stjr return 0; 1944146040Stjr 1945146040Stjr ent->eps_reachable_subexps_map &= ~(1 << subexp_idx); 1946146040Stjr } 1947146040Stjr while (ent++->more); 1948146040Stjr } 1949146040Stjr break; 1950146040Stjr 1951146040Stjr case OP_OPEN_SUBEXP: 1952146040Stjr if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx) 1953146040Stjr return -1; 1954146040Stjr break; 1955146040Stjr 1956146040Stjr case OP_CLOSE_SUBEXP: 1957146040Stjr if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx) 1958146040Stjr return 0; 1959146040Stjr break; 1960146040Stjr 1961146040Stjr default: 1962146040Stjr break; 1963146040Stjr } 1964146040Stjr } 1965146040Stjr 1966146040Stjr return (boundaries & 2) ? 1 : 0; 1967146040Stjr} 1968146040Stjr 1969146040Stjrstatic int 1970146040Stjrcheck_dst_limits_calc_pos (mctx, limit, subexp_idx, from_node, str_idx, bkref_idx) 1971146040Stjr re_match_context_t *mctx; 1972146040Stjr int limit, subexp_idx, from_node, str_idx, bkref_idx; 1973146040Stjr{ 1974146040Stjr struct re_backref_cache_entry *lim = mctx->bkref_ents + limit; 1975146040Stjr int boundaries; 1976146040Stjr 1977146040Stjr /* If we are outside the range of the subexpression, return -1 or 1. */ 1978146040Stjr if (str_idx < lim->subexp_from) 1979146040Stjr return -1; 1980146040Stjr 1981146040Stjr if (lim->subexp_to < str_idx) 1982146040Stjr return 1; 1983146040Stjr 1984146040Stjr /* If we are within the subexpression, return 0. */ 1985146040Stjr boundaries = (str_idx == lim->subexp_from); 1986146040Stjr boundaries |= (str_idx == lim->subexp_to) << 1; 1987146040Stjr if (boundaries == 0) 1988146040Stjr return 0; 1989146040Stjr 1990146040Stjr /* Else, examine epsilon closure. */ 1991146040Stjr return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx, 1992146040Stjr from_node, bkref_idx); 1993146040Stjr} 1994146040Stjr 1995146040Stjr/* Check the limitations of sub expressions LIMITS, and remove the nodes 1996146040Stjr which are against limitations from DEST_NODES. */ 1997146040Stjr 1998146040Stjrstatic reg_errcode_t 1999146040Stjrcheck_subexp_limits (dfa, dest_nodes, candidates, limits, bkref_ents, str_idx) 2000146040Stjr re_dfa_t *dfa; 2001146040Stjr re_node_set *dest_nodes; 2002146040Stjr const re_node_set *candidates; 2003146040Stjr re_node_set *limits; 2004146040Stjr struct re_backref_cache_entry *bkref_ents; 2005146040Stjr int str_idx; 2006146040Stjr{ 2007146040Stjr reg_errcode_t err; 2008146040Stjr int node_idx, lim_idx; 2009146040Stjr 2010146040Stjr for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx) 2011146040Stjr { 2012146040Stjr int subexp_idx; 2013146040Stjr struct re_backref_cache_entry *ent; 2014146040Stjr ent = bkref_ents + limits->elems[lim_idx]; 2015146040Stjr 2016146040Stjr if (str_idx <= ent->subexp_from || ent->str_idx < str_idx) 2017146040Stjr continue; /* This is unrelated limitation. */ 2018146040Stjr 2019146040Stjr subexp_idx = dfa->nodes[ent->node].opr.idx; 2020146040Stjr if (ent->subexp_to == str_idx) 2021146040Stjr { 2022146040Stjr int ops_node = -1; 2023146040Stjr int cls_node = -1; 2024146040Stjr for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx) 2025146040Stjr { 2026146040Stjr int node = dest_nodes->elems[node_idx]; 2027146040Stjr re_token_type_t type = dfa->nodes[node].type; 2028146040Stjr if (type == OP_OPEN_SUBEXP 2029146040Stjr && subexp_idx == dfa->nodes[node].opr.idx) 2030146040Stjr ops_node = node; 2031146040Stjr else if (type == OP_CLOSE_SUBEXP 2032146040Stjr && subexp_idx == dfa->nodes[node].opr.idx) 2033146040Stjr cls_node = node; 2034146040Stjr } 2035146040Stjr 2036146040Stjr /* Check the limitation of the open subexpression. */ 2037146040Stjr /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */ 2038146040Stjr if (ops_node >= 0) 2039146040Stjr { 2040146040Stjr err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes, 2041146040Stjr candidates); 2042146040Stjr if (BE (err != REG_NOERROR, 0)) 2043146040Stjr return err; 2044146040Stjr } 2045146040Stjr 2046146040Stjr /* Check the limitation of the close subexpression. */ 2047146040Stjr if (cls_node >= 0) 2048146040Stjr for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx) 2049146040Stjr { 2050146040Stjr int node = dest_nodes->elems[node_idx]; 2051146040Stjr if (!re_node_set_contains (dfa->inveclosures + node, 2052146040Stjr cls_node) 2053146040Stjr && !re_node_set_contains (dfa->eclosures + node, 2054146040Stjr cls_node)) 2055146040Stjr { 2056146040Stjr /* It is against this limitation. 2057146040Stjr Remove it form the current sifted state. */ 2058146040Stjr err = sub_epsilon_src_nodes (dfa, node, dest_nodes, 2059146040Stjr candidates); 2060146040Stjr if (BE (err != REG_NOERROR, 0)) 2061146040Stjr return err; 2062146040Stjr --node_idx; 2063146040Stjr } 2064146040Stjr } 2065146040Stjr } 2066146040Stjr else /* (ent->subexp_to != str_idx) */ 2067146040Stjr { 2068146040Stjr for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx) 2069146040Stjr { 2070146040Stjr int node = dest_nodes->elems[node_idx]; 2071146040Stjr re_token_type_t type = dfa->nodes[node].type; 2072146040Stjr if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP) 2073146040Stjr { 2074146040Stjr if (subexp_idx != dfa->nodes[node].opr.idx) 2075146040Stjr continue; 2076146040Stjr /* It is against this limitation. 2077146040Stjr Remove it form the current sifted state. */ 2078146040Stjr err = sub_epsilon_src_nodes (dfa, node, dest_nodes, 2079146040Stjr candidates); 2080146040Stjr if (BE (err != REG_NOERROR, 0)) 2081146040Stjr return err; 2082146040Stjr } 2083146040Stjr } 2084146040Stjr } 2085146040Stjr } 2086146040Stjr return REG_NOERROR; 2087146040Stjr} 2088146040Stjr 2089146040Stjrstatic reg_errcode_t 2090146040Stjrsift_states_bkref (mctx, sctx, str_idx, candidates) 2091146040Stjr re_match_context_t *mctx; 2092146040Stjr re_sift_context_t *sctx; 2093146040Stjr int str_idx; 2094146040Stjr const re_node_set *candidates; 2095146040Stjr{ 2096146040Stjr re_dfa_t *const dfa = mctx->dfa; 2097146040Stjr reg_errcode_t err; 2098146040Stjr int node_idx, node; 2099146040Stjr re_sift_context_t local_sctx; 2100146040Stjr int first_idx = search_cur_bkref_entry (mctx, str_idx); 2101146040Stjr 2102146040Stjr if (first_idx == -1) 2103146040Stjr return REG_NOERROR; 2104146040Stjr 2105146040Stjr local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */ 2106146040Stjr 2107146040Stjr for (node_idx = 0; node_idx < candidates->nelem; ++node_idx) 2108146040Stjr { 2109146040Stjr int enabled_idx; 2110146040Stjr re_token_type_t type; 2111146040Stjr struct re_backref_cache_entry *entry; 2112146040Stjr node = candidates->elems[node_idx]; 2113146040Stjr type = dfa->nodes[node].type; 2114146040Stjr /* Avoid infinite loop for the REs like "()\1+". */ 2115146040Stjr if (node == sctx->last_node && str_idx == sctx->last_str_idx) 2116146040Stjr continue; 2117146040Stjr if (type != OP_BACK_REF) 2118146040Stjr continue; 2119146040Stjr 2120146040Stjr entry = mctx->bkref_ents + first_idx; 2121146040Stjr enabled_idx = first_idx; 2122146040Stjr do 2123146040Stjr { 2124146040Stjr int subexp_len, to_idx, dst_node; 2125146040Stjr re_dfastate_t *cur_state; 2126146040Stjr 2127146040Stjr if (entry->node != node) 2128146040Stjr continue; 2129146040Stjr subexp_len = entry->subexp_to - entry->subexp_from; 2130146040Stjr to_idx = str_idx + subexp_len; 2131146040Stjr dst_node = (subexp_len ? dfa->nexts[node] 2132146040Stjr : dfa->edests[node].elems[0]); 2133146040Stjr 2134146040Stjr if (to_idx > sctx->last_str_idx 2135146040Stjr || sctx->sifted_states[to_idx] == NULL 2136146040Stjr || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node) 2137146040Stjr || check_dst_limits (mctx, &sctx->limits, node, 2138146040Stjr str_idx, dst_node, to_idx)) 2139146040Stjr continue; 2140146040Stjr 2141146040Stjr if (local_sctx.sifted_states == NULL) 2142146040Stjr { 2143146040Stjr local_sctx = *sctx; 2144146040Stjr err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits); 2145146040Stjr if (BE (err != REG_NOERROR, 0)) 2146146040Stjr goto free_return; 2147146040Stjr } 2148146040Stjr local_sctx.last_node = node; 2149146040Stjr local_sctx.last_str_idx = str_idx; 2150146040Stjr err = re_node_set_insert (&local_sctx.limits, enabled_idx); 2151146040Stjr if (BE (err < 0, 0)) 2152146040Stjr { 2153146040Stjr err = REG_ESPACE; 2154146040Stjr goto free_return; 2155146040Stjr } 2156146040Stjr cur_state = local_sctx.sifted_states[str_idx]; 2157146040Stjr err = sift_states_backward (mctx, &local_sctx); 2158146040Stjr if (BE (err != REG_NOERROR, 0)) 2159146040Stjr goto free_return; 2160146040Stjr if (sctx->limited_states != NULL) 2161146040Stjr { 2162146040Stjr err = merge_state_array (dfa, sctx->limited_states, 2163146040Stjr local_sctx.sifted_states, 2164146040Stjr str_idx + 1); 2165146040Stjr if (BE (err != REG_NOERROR, 0)) 2166146040Stjr goto free_return; 2167146040Stjr } 2168146040Stjr local_sctx.sifted_states[str_idx] = cur_state; 2169146040Stjr re_node_set_remove (&local_sctx.limits, enabled_idx); 2170146040Stjr 2171146040Stjr /* mctx->bkref_ents may have changed, reload the pointer. */ 2172146040Stjr entry = mctx->bkref_ents + enabled_idx; 2173146040Stjr } 2174146040Stjr while (enabled_idx++, entry++->more); 2175146040Stjr } 2176146040Stjr err = REG_NOERROR; 2177146040Stjr free_return: 2178146040Stjr if (local_sctx.sifted_states != NULL) 2179146040Stjr { 2180146040Stjr re_node_set_free (&local_sctx.limits); 2181146040Stjr } 2182146040Stjr 2183146040Stjr return err; 2184146040Stjr} 2185146040Stjr 2186146040Stjr 2187146040Stjr#ifdef RE_ENABLE_I18N 2188146040Stjrstatic int 2189146040Stjrsift_states_iter_mb (mctx, sctx, node_idx, str_idx, max_str_idx) 2190146040Stjr const re_match_context_t *mctx; 2191146040Stjr re_sift_context_t *sctx; 2192146040Stjr int node_idx, str_idx, max_str_idx; 2193146040Stjr{ 2194146040Stjr re_dfa_t *const dfa = mctx->dfa; 2195146040Stjr int naccepted; 2196146040Stjr /* Check the node can accept `multi byte'. */ 2197146040Stjr naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx); 2198146040Stjr if (naccepted > 0 && str_idx + naccepted <= max_str_idx && 2199146040Stjr !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted], 2200146040Stjr dfa->nexts[node_idx])) 2201146040Stjr /* The node can't accept the `multi byte', or the 2202146040Stjr destination was already thrown away, then the node 2203146040Stjr could't accept the current input `multi byte'. */ 2204146040Stjr naccepted = 0; 2205146040Stjr /* Otherwise, it is sure that the node could accept 2206146040Stjr `naccepted' bytes input. */ 2207146040Stjr return naccepted; 2208146040Stjr} 2209146040Stjr#endif /* RE_ENABLE_I18N */ 2210146040Stjr 2211146040Stjr 2212146040Stjr/* Functions for state transition. */ 2213146040Stjr 2214146040Stjr/* Return the next state to which the current state STATE will transit by 2215146040Stjr accepting the current input byte, and update STATE_LOG if necessary. 2216146040Stjr If STATE can accept a multibyte char/collating element/back reference 2217146040Stjr update the destination of STATE_LOG. */ 2218146040Stjr 2219146040Stjrstatic re_dfastate_t * 2220146040Stjrtransit_state (err, mctx, state) 2221146040Stjr reg_errcode_t *err; 2222146040Stjr re_match_context_t *mctx; 2223146040Stjr re_dfastate_t *state; 2224146040Stjr{ 2225146040Stjr re_dfastate_t **trtable; 2226146040Stjr unsigned char ch; 2227146040Stjr 2228146040Stjr#ifdef RE_ENABLE_I18N 2229146040Stjr /* If the current state can accept multibyte. */ 2230146040Stjr if (BE (state->accept_mb, 0)) 2231146040Stjr { 2232146040Stjr *err = transit_state_mb (mctx, state); 2233146040Stjr if (BE (*err != REG_NOERROR, 0)) 2234146040Stjr return NULL; 2235146040Stjr } 2236146040Stjr#endif /* RE_ENABLE_I18N */ 2237146040Stjr 2238146040Stjr /* Then decide the next state with the single byte. */ 2239146040Stjr#if 0 2240146040Stjr if (0) 2241146040Stjr /* don't use transition table */ 2242146040Stjr return transit_state_sb (err, mctx, state); 2243146040Stjr#endif 2244146040Stjr 2245146040Stjr /* Use transition table */ 2246146040Stjr ch = re_string_fetch_byte (&mctx->input); 2247146040Stjr for (;;) 2248146040Stjr { 2249146040Stjr trtable = state->trtable; 2250146040Stjr if (BE (trtable != NULL, 1)) 2251146040Stjr return trtable[ch]; 2252146040Stjr 2253146040Stjr trtable = state->word_trtable; 2254146040Stjr if (BE (trtable != NULL, 1)) 2255146040Stjr { 2256146040Stjr unsigned int context; 2257146040Stjr context 2258146040Stjr = re_string_context_at (&mctx->input, 2259146040Stjr re_string_cur_idx (&mctx->input) - 1, 2260146040Stjr mctx->eflags); 2261146040Stjr if (IS_WORD_CONTEXT (context)) 2262146040Stjr return trtable[ch + SBC_MAX]; 2263146040Stjr else 2264146040Stjr return trtable[ch]; 2265146040Stjr } 2266146040Stjr 2267146040Stjr if (!build_trtable (mctx->dfa, state)) 2268146040Stjr { 2269146040Stjr *err = REG_ESPACE; 2270146040Stjr return NULL; 2271146040Stjr } 2272146040Stjr 2273146040Stjr /* Retry, we now have a transition table. */ 2274146040Stjr } 2275146040Stjr} 2276146040Stjr 2277146040Stjr/* Update the state_log if we need */ 2278146040Stjrre_dfastate_t * 2279146040Stjrmerge_state_with_log (err, mctx, next_state) 2280146040Stjr reg_errcode_t *err; 2281146040Stjr re_match_context_t *mctx; 2282146040Stjr re_dfastate_t *next_state; 2283146040Stjr{ 2284146040Stjr re_dfa_t *const dfa = mctx->dfa; 2285146040Stjr int cur_idx = re_string_cur_idx (&mctx->input); 2286146040Stjr 2287146040Stjr if (cur_idx > mctx->state_log_top) 2288146040Stjr { 2289146040Stjr mctx->state_log[cur_idx] = next_state; 2290146040Stjr mctx->state_log_top = cur_idx; 2291146040Stjr } 2292146040Stjr else if (mctx->state_log[cur_idx] == 0) 2293146040Stjr { 2294146040Stjr mctx->state_log[cur_idx] = next_state; 2295146040Stjr } 2296146040Stjr else 2297146040Stjr { 2298146040Stjr re_dfastate_t *pstate; 2299146040Stjr unsigned int context; 2300146040Stjr re_node_set next_nodes, *log_nodes, *table_nodes = NULL; 2301146040Stjr /* If (state_log[cur_idx] != 0), it implies that cur_idx is 2302146040Stjr the destination of a multibyte char/collating element/ 2303146040Stjr back reference. Then the next state is the union set of 2304146040Stjr these destinations and the results of the transition table. */ 2305146040Stjr pstate = mctx->state_log[cur_idx]; 2306146040Stjr log_nodes = pstate->entrance_nodes; 2307146040Stjr if (next_state != NULL) 2308146040Stjr { 2309146040Stjr table_nodes = next_state->entrance_nodes; 2310146040Stjr *err = re_node_set_init_union (&next_nodes, table_nodes, 2311146040Stjr log_nodes); 2312146040Stjr if (BE (*err != REG_NOERROR, 0)) 2313146040Stjr return NULL; 2314146040Stjr } 2315146040Stjr else 2316146040Stjr next_nodes = *log_nodes; 2317146040Stjr /* Note: We already add the nodes of the initial state, 2318146040Stjr then we don't need to add them here. */ 2319146040Stjr 2320146040Stjr context = re_string_context_at (&mctx->input, 2321146040Stjr re_string_cur_idx (&mctx->input) - 1, 2322146040Stjr mctx->eflags); 2323146040Stjr next_state = mctx->state_log[cur_idx] 2324146040Stjr = re_acquire_state_context (err, dfa, &next_nodes, context); 2325146040Stjr /* We don't need to check errors here, since the return value of 2326146040Stjr this function is next_state and ERR is already set. */ 2327146040Stjr 2328146040Stjr if (table_nodes != NULL) 2329146040Stjr re_node_set_free (&next_nodes); 2330146040Stjr } 2331146040Stjr 2332146040Stjr if (BE (dfa->nbackref, 0) && next_state != NULL) 2333146040Stjr { 2334146040Stjr /* Check OP_OPEN_SUBEXP in the current state in case that we use them 2335146040Stjr later. We must check them here, since the back references in the 2336146040Stjr next state might use them. */ 2337146040Stjr *err = check_subexp_matching_top (mctx, &next_state->nodes, 2338146040Stjr cur_idx); 2339146040Stjr if (BE (*err != REG_NOERROR, 0)) 2340146040Stjr return NULL; 2341146040Stjr 2342146040Stjr /* If the next state has back references. */ 2343146040Stjr if (next_state->has_backref) 2344146040Stjr { 2345146040Stjr *err = transit_state_bkref (mctx, &next_state->nodes); 2346146040Stjr if (BE (*err != REG_NOERROR, 0)) 2347146040Stjr return NULL; 2348146040Stjr next_state = mctx->state_log[cur_idx]; 2349146040Stjr } 2350146040Stjr } 2351146040Stjr 2352146040Stjr return next_state; 2353146040Stjr} 2354146040Stjr 2355146040Stjr/* Skip bytes in the input that correspond to part of a 2356146040Stjr multi-byte match, then look in the log for a state 2357146040Stjr from which to restart matching. */ 2358146040Stjrre_dfastate_t * 2359146040Stjrfind_recover_state (err, mctx) 2360146040Stjr reg_errcode_t *err; 2361146040Stjr re_match_context_t *mctx; 2362146040Stjr{ 2363146040Stjr re_dfastate_t *cur_state = NULL; 2364146040Stjr do 2365146040Stjr { 2366146040Stjr int max = mctx->state_log_top; 2367146040Stjr int cur_str_idx = re_string_cur_idx (&mctx->input); 2368146040Stjr 2369146040Stjr do 2370146040Stjr { 2371146040Stjr if (++cur_str_idx > max) 2372146040Stjr return NULL; 2373146040Stjr re_string_skip_bytes (&mctx->input, 1); 2374146040Stjr } 2375146040Stjr while (mctx->state_log[cur_str_idx] == NULL); 2376146040Stjr 2377146040Stjr cur_state = merge_state_with_log (err, mctx, NULL); 2378146040Stjr } 2379146040Stjr while (err == REG_NOERROR && cur_state == NULL); 2380146040Stjr return cur_state; 2381146040Stjr} 2382146040Stjr 2383146040Stjr/* Helper functions for transit_state. */ 2384146040Stjr 2385146040Stjr/* From the node set CUR_NODES, pick up the nodes whose types are 2386146040Stjr OP_OPEN_SUBEXP and which have corresponding back references in the regular 2387146040Stjr expression. And register them to use them later for evaluating the 2388146040Stjr correspoding back references. */ 2389146040Stjr 2390146040Stjrstatic reg_errcode_t 2391146040Stjrcheck_subexp_matching_top (mctx, cur_nodes, str_idx) 2392146040Stjr re_match_context_t *mctx; 2393146040Stjr re_node_set *cur_nodes; 2394146040Stjr int str_idx; 2395146040Stjr{ 2396146040Stjr re_dfa_t *const dfa = mctx->dfa; 2397146040Stjr int node_idx; 2398146040Stjr reg_errcode_t err; 2399146040Stjr 2400146040Stjr /* TODO: This isn't efficient. 2401146040Stjr Because there might be more than one nodes whose types are 2402146040Stjr OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all 2403146040Stjr nodes. 2404146040Stjr E.g. RE: (a){2} */ 2405146040Stjr for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx) 2406146040Stjr { 2407146040Stjr int node = cur_nodes->elems[node_idx]; 2408146040Stjr if (dfa->nodes[node].type == OP_OPEN_SUBEXP 2409146040Stjr && dfa->nodes[node].opr.idx < (8 * sizeof (dfa->used_bkref_map)) 2410146040Stjr && dfa->used_bkref_map & (1 << dfa->nodes[node].opr.idx)) 2411146040Stjr { 2412146040Stjr err = match_ctx_add_subtop (mctx, node, str_idx); 2413146040Stjr if (BE (err != REG_NOERROR, 0)) 2414146040Stjr return err; 2415146040Stjr } 2416146040Stjr } 2417146040Stjr return REG_NOERROR; 2418146040Stjr} 2419146040Stjr 2420146040Stjr#if 0 2421146040Stjr/* Return the next state to which the current state STATE will transit by 2422146040Stjr accepting the current input byte. */ 2423146040Stjr 2424146040Stjrstatic re_dfastate_t * 2425146040Stjrtransit_state_sb (err, mctx, state) 2426146040Stjr reg_errcode_t *err; 2427146040Stjr re_match_context_t *mctx; 2428146040Stjr re_dfastate_t *state; 2429146040Stjr{ 2430146040Stjr re_dfa_t *const dfa = mctx->dfa; 2431146040Stjr re_node_set next_nodes; 2432146040Stjr re_dfastate_t *next_state; 2433146040Stjr int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input); 2434146040Stjr unsigned int context; 2435146040Stjr 2436146040Stjr *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1); 2437146040Stjr if (BE (*err != REG_NOERROR, 0)) 2438146040Stjr return NULL; 2439146040Stjr for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt) 2440146040Stjr { 2441146040Stjr int cur_node = state->nodes.elems[node_cnt]; 2442146040Stjr if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx)) 2443146040Stjr { 2444146040Stjr *err = re_node_set_merge (&next_nodes, 2445146040Stjr dfa->eclosures + dfa->nexts[cur_node]); 2446146040Stjr if (BE (*err != REG_NOERROR, 0)) 2447146040Stjr { 2448146040Stjr re_node_set_free (&next_nodes); 2449146040Stjr return NULL; 2450146040Stjr } 2451146040Stjr } 2452146040Stjr } 2453146040Stjr context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags); 2454146040Stjr next_state = re_acquire_state_context (err, dfa, &next_nodes, context); 2455146040Stjr /* We don't need to check errors here, since the return value of 2456146040Stjr this function is next_state and ERR is already set. */ 2457146040Stjr 2458146040Stjr re_node_set_free (&next_nodes); 2459146040Stjr re_string_skip_bytes (&mctx->input, 1); 2460146040Stjr return next_state; 2461146040Stjr} 2462146040Stjr#endif 2463146040Stjr 2464146040Stjr#ifdef RE_ENABLE_I18N 2465146040Stjrstatic reg_errcode_t 2466146040Stjrtransit_state_mb (mctx, pstate) 2467146040Stjr re_match_context_t *mctx; 2468146040Stjr re_dfastate_t *pstate; 2469146040Stjr{ 2470146040Stjr re_dfa_t *const dfa = mctx->dfa; 2471146040Stjr reg_errcode_t err; 2472146040Stjr int i; 2473146040Stjr 2474146040Stjr for (i = 0; i < pstate->nodes.nelem; ++i) 2475146040Stjr { 2476146040Stjr re_node_set dest_nodes, *new_nodes; 2477146040Stjr int cur_node_idx = pstate->nodes.elems[i]; 2478146040Stjr int naccepted, dest_idx; 2479146040Stjr unsigned int context; 2480146040Stjr re_dfastate_t *dest_state; 2481146040Stjr 2482146040Stjr if (!dfa->nodes[cur_node_idx].accept_mb) 2483146040Stjr continue; 2484146040Stjr 2485146040Stjr if (dfa->nodes[cur_node_idx].constraint) 2486146040Stjr { 2487146040Stjr context = re_string_context_at (&mctx->input, 2488146040Stjr re_string_cur_idx (&mctx->input), 2489146040Stjr mctx->eflags); 2490146040Stjr if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint, 2491146040Stjr context)) 2492146040Stjr continue; 2493146040Stjr } 2494146040Stjr 2495146040Stjr /* How many bytes the node can accept? */ 2496146040Stjr naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input, 2497146040Stjr re_string_cur_idx (&mctx->input)); 2498146040Stjr if (naccepted == 0) 2499146040Stjr continue; 2500146040Stjr 2501146040Stjr /* The node can accepts `naccepted' bytes. */ 2502146040Stjr dest_idx = re_string_cur_idx (&mctx->input) + naccepted; 2503146040Stjr mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted 2504146040Stjr : mctx->max_mb_elem_len); 2505146040Stjr err = clean_state_log_if_needed (mctx, dest_idx); 2506146040Stjr if (BE (err != REG_NOERROR, 0)) 2507146040Stjr return err; 2508146040Stjr#ifdef DEBUG 2509146040Stjr assert (dfa->nexts[cur_node_idx] != -1); 2510146040Stjr#endif 2511146040Stjr new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx]; 2512146040Stjr 2513146040Stjr dest_state = mctx->state_log[dest_idx]; 2514146040Stjr if (dest_state == NULL) 2515146040Stjr dest_nodes = *new_nodes; 2516146040Stjr else 2517146040Stjr { 2518146040Stjr err = re_node_set_init_union (&dest_nodes, 2519146040Stjr dest_state->entrance_nodes, new_nodes); 2520146040Stjr if (BE (err != REG_NOERROR, 0)) 2521146040Stjr return err; 2522146040Stjr } 2523146040Stjr context = re_string_context_at (&mctx->input, dest_idx - 1, mctx->eflags); 2524146040Stjr mctx->state_log[dest_idx] 2525146040Stjr = re_acquire_state_context (&err, dfa, &dest_nodes, context); 2526146040Stjr if (dest_state != NULL) 2527146040Stjr re_node_set_free (&dest_nodes); 2528146040Stjr if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0)) 2529146040Stjr return err; 2530146040Stjr } 2531146040Stjr return REG_NOERROR; 2532146040Stjr} 2533146040Stjr#endif /* RE_ENABLE_I18N */ 2534146040Stjr 2535146040Stjrstatic reg_errcode_t 2536146040Stjrtransit_state_bkref (mctx, nodes) 2537146040Stjr re_match_context_t *mctx; 2538146040Stjr const re_node_set *nodes; 2539146040Stjr{ 2540146040Stjr re_dfa_t *const dfa = mctx->dfa; 2541146040Stjr reg_errcode_t err; 2542146040Stjr int i; 2543146040Stjr int cur_str_idx = re_string_cur_idx (&mctx->input); 2544146040Stjr 2545146040Stjr for (i = 0; i < nodes->nelem; ++i) 2546146040Stjr { 2547146040Stjr int dest_str_idx, prev_nelem, bkc_idx; 2548146040Stjr int node_idx = nodes->elems[i]; 2549146040Stjr unsigned int context; 2550146040Stjr const re_token_t *node = dfa->nodes + node_idx; 2551146040Stjr re_node_set *new_dest_nodes; 2552146040Stjr 2553146040Stjr /* Check whether `node' is a backreference or not. */ 2554146040Stjr if (node->type != OP_BACK_REF) 2555146040Stjr continue; 2556146040Stjr 2557146040Stjr if (node->constraint) 2558146040Stjr { 2559146040Stjr context = re_string_context_at (&mctx->input, cur_str_idx, 2560146040Stjr mctx->eflags); 2561146040Stjr if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context)) 2562146040Stjr continue; 2563146040Stjr } 2564146040Stjr 2565146040Stjr /* `node' is a backreference. 2566146040Stjr Check the substring which the substring matched. */ 2567146040Stjr bkc_idx = mctx->nbkref_ents; 2568146040Stjr err = get_subexp (mctx, node_idx, cur_str_idx); 2569146040Stjr if (BE (err != REG_NOERROR, 0)) 2570146040Stjr goto free_return; 2571146040Stjr 2572146040Stjr /* And add the epsilon closures (which is `new_dest_nodes') of 2573146040Stjr the backreference to appropriate state_log. */ 2574146040Stjr#ifdef DEBUG 2575146040Stjr assert (dfa->nexts[node_idx] != -1); 2576146040Stjr#endif 2577146040Stjr for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx) 2578146040Stjr { 2579146040Stjr int subexp_len; 2580146040Stjr re_dfastate_t *dest_state; 2581146040Stjr struct re_backref_cache_entry *bkref_ent; 2582146040Stjr bkref_ent = mctx->bkref_ents + bkc_idx; 2583146040Stjr if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx) 2584146040Stjr continue; 2585146040Stjr subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from; 2586146040Stjr new_dest_nodes = (subexp_len == 0 2587146040Stjr ? dfa->eclosures + dfa->edests[node_idx].elems[0] 2588146040Stjr : dfa->eclosures + dfa->nexts[node_idx]); 2589146040Stjr dest_str_idx = (cur_str_idx + bkref_ent->subexp_to 2590146040Stjr - bkref_ent->subexp_from); 2591146040Stjr context = re_string_context_at (&mctx->input, dest_str_idx - 1, 2592146040Stjr mctx->eflags); 2593146040Stjr dest_state = mctx->state_log[dest_str_idx]; 2594146040Stjr prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0 2595146040Stjr : mctx->state_log[cur_str_idx]->nodes.nelem); 2596146040Stjr /* Add `new_dest_node' to state_log. */ 2597146040Stjr if (dest_state == NULL) 2598146040Stjr { 2599146040Stjr mctx->state_log[dest_str_idx] 2600146040Stjr = re_acquire_state_context (&err, dfa, new_dest_nodes, 2601146040Stjr context); 2602146040Stjr if (BE (mctx->state_log[dest_str_idx] == NULL 2603146040Stjr && err != REG_NOERROR, 0)) 2604146040Stjr goto free_return; 2605146040Stjr } 2606146040Stjr else 2607146040Stjr { 2608146040Stjr re_node_set dest_nodes; 2609146040Stjr err = re_node_set_init_union (&dest_nodes, 2610146040Stjr dest_state->entrance_nodes, 2611146040Stjr new_dest_nodes); 2612146040Stjr if (BE (err != REG_NOERROR, 0)) 2613146040Stjr { 2614146040Stjr re_node_set_free (&dest_nodes); 2615146040Stjr goto free_return; 2616146040Stjr } 2617146040Stjr mctx->state_log[dest_str_idx] 2618146040Stjr = re_acquire_state_context (&err, dfa, &dest_nodes, context); 2619146040Stjr re_node_set_free (&dest_nodes); 2620146040Stjr if (BE (mctx->state_log[dest_str_idx] == NULL 2621146040Stjr && err != REG_NOERROR, 0)) 2622146040Stjr goto free_return; 2623146040Stjr } 2624146040Stjr /* We need to check recursively if the backreference can epsilon 2625146040Stjr transit. */ 2626146040Stjr if (subexp_len == 0 2627146040Stjr && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem) 2628146040Stjr { 2629146040Stjr err = check_subexp_matching_top (mctx, new_dest_nodes, 2630146040Stjr cur_str_idx); 2631146040Stjr if (BE (err != REG_NOERROR, 0)) 2632146040Stjr goto free_return; 2633146040Stjr err = transit_state_bkref (mctx, new_dest_nodes); 2634146040Stjr if (BE (err != REG_NOERROR, 0)) 2635146040Stjr goto free_return; 2636146040Stjr } 2637146040Stjr } 2638146040Stjr } 2639146040Stjr err = REG_NOERROR; 2640146040Stjr free_return: 2641146040Stjr return err; 2642146040Stjr} 2643146040Stjr 2644146040Stjr/* Enumerate all the candidates which the backreference BKREF_NODE can match 2645146040Stjr at BKREF_STR_IDX, and register them by match_ctx_add_entry(). 2646146040Stjr Note that we might collect inappropriate candidates here. 2647146040Stjr However, the cost of checking them strictly here is too high, then we 2648146040Stjr delay these checking for prune_impossible_nodes(). */ 2649146040Stjr 2650146040Stjrstatic reg_errcode_t 2651146040Stjrget_subexp (mctx, bkref_node, bkref_str_idx) 2652146040Stjr re_match_context_t *mctx; 2653146040Stjr int bkref_node, bkref_str_idx; 2654146040Stjr{ 2655146040Stjr re_dfa_t *const dfa = mctx->dfa; 2656146040Stjr int subexp_num, sub_top_idx; 2657146040Stjr const char *buf = (const char *) re_string_get_buffer (&mctx->input); 2658146040Stjr /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */ 2659146040Stjr int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx); 2660146040Stjr if (cache_idx != -1) 2661146040Stjr { 2662146040Stjr const struct re_backref_cache_entry *entry = mctx->bkref_ents + cache_idx; 2663146040Stjr do 2664146040Stjr if (entry->node == bkref_node) 2665146040Stjr return REG_NOERROR; /* We already checked it. */ 2666146040Stjr while (entry++->more); 2667146040Stjr } 2668146040Stjr 2669146040Stjr subexp_num = dfa->nodes[bkref_node].opr.idx; 2670146040Stjr 2671146040Stjr /* For each sub expression */ 2672146040Stjr for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx) 2673146040Stjr { 2674146040Stjr reg_errcode_t err; 2675146040Stjr re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx]; 2676146040Stjr re_sub_match_last_t *sub_last; 2677146040Stjr int sub_last_idx, sl_str, bkref_str_off; 2678146040Stjr 2679146040Stjr if (dfa->nodes[sub_top->node].opr.idx != subexp_num) 2680146040Stjr continue; /* It isn't related. */ 2681146040Stjr 2682146040Stjr sl_str = sub_top->str_idx; 2683146040Stjr bkref_str_off = bkref_str_idx; 2684146040Stjr /* At first, check the last node of sub expressions we already 2685146040Stjr evaluated. */ 2686146040Stjr for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx) 2687146040Stjr { 2688146040Stjr int sl_str_diff; 2689146040Stjr sub_last = sub_top->lasts[sub_last_idx]; 2690146040Stjr sl_str_diff = sub_last->str_idx - sl_str; 2691146040Stjr /* The matched string by the sub expression match with the substring 2692146040Stjr at the back reference? */ 2693146040Stjr if (sl_str_diff > 0) 2694146040Stjr { 2695146040Stjr if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0)) 2696146040Stjr { 2697146040Stjr /* Not enough chars for a successful match. */ 2698146040Stjr if (bkref_str_off + sl_str_diff > mctx->input.len) 2699146040Stjr break; 2700146040Stjr 2701146040Stjr err = clean_state_log_if_needed (mctx, 2702146040Stjr bkref_str_off 2703146040Stjr + sl_str_diff); 2704146040Stjr if (BE (err != REG_NOERROR, 0)) 2705146040Stjr return err; 2706146040Stjr buf = (const char *) re_string_get_buffer (&mctx->input); 2707146040Stjr } 2708146040Stjr if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0) 2709146040Stjr break; /* We don't need to search this sub expression any more. */ 2710146040Stjr } 2711146040Stjr bkref_str_off += sl_str_diff; 2712146040Stjr sl_str += sl_str_diff; 2713146040Stjr err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node, 2714146040Stjr bkref_str_idx); 2715146040Stjr 2716146040Stjr /* Reload buf, since the preceding call might have reallocated 2717146040Stjr the buffer. */ 2718146040Stjr buf = (const char *) re_string_get_buffer (&mctx->input); 2719146040Stjr 2720146040Stjr if (err == REG_NOMATCH) 2721146040Stjr continue; 2722146040Stjr if (BE (err != REG_NOERROR, 0)) 2723146040Stjr return err; 2724146040Stjr } 2725146040Stjr 2726146040Stjr if (sub_last_idx < sub_top->nlasts) 2727146040Stjr continue; 2728146040Stjr if (sub_last_idx > 0) 2729146040Stjr ++sl_str; 2730146040Stjr /* Then, search for the other last nodes of the sub expression. */ 2731146040Stjr for (; sl_str <= bkref_str_idx; ++sl_str) 2732146040Stjr { 2733146040Stjr int cls_node, sl_str_off; 2734146040Stjr const re_node_set *nodes; 2735146040Stjr sl_str_off = sl_str - sub_top->str_idx; 2736146040Stjr /* The matched string by the sub expression match with the substring 2737146040Stjr at the back reference? */ 2738146040Stjr if (sl_str_off > 0) 2739146040Stjr { 2740146040Stjr if (BE (bkref_str_off >= mctx->input.valid_len, 0)) 2741146040Stjr { 2742146040Stjr /* If we are at the end of the input, we cannot match. */ 2743146040Stjr if (bkref_str_off >= mctx->input.len) 2744146040Stjr break; 2745146040Stjr 2746146040Stjr err = extend_buffers (mctx); 2747146040Stjr if (BE (err != REG_NOERROR, 0)) 2748146040Stjr return err; 2749146040Stjr 2750146040Stjr buf = (const char *) re_string_get_buffer (&mctx->input); 2751146040Stjr } 2752146040Stjr if (buf [bkref_str_off++] != buf[sl_str - 1]) 2753146040Stjr break; /* We don't need to search this sub expression 2754146040Stjr any more. */ 2755146040Stjr } 2756146040Stjr if (mctx->state_log[sl_str] == NULL) 2757146040Stjr continue; 2758146040Stjr /* Does this state have a ')' of the sub expression? */ 2759146040Stjr nodes = &mctx->state_log[sl_str]->nodes; 2760146040Stjr cls_node = find_subexp_node (dfa, nodes, subexp_num, OP_CLOSE_SUBEXP); 2761146040Stjr if (cls_node == -1) 2762146040Stjr continue; /* No. */ 2763146040Stjr if (sub_top->path == NULL) 2764146040Stjr { 2765146040Stjr sub_top->path = calloc (sizeof (state_array_t), 2766146040Stjr sl_str - sub_top->str_idx + 1); 2767146040Stjr if (sub_top->path == NULL) 2768146040Stjr return REG_ESPACE; 2769146040Stjr } 2770146040Stjr /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node 2771146040Stjr in the current context? */ 2772146040Stjr err = check_arrival (mctx, sub_top->path, sub_top->node, 2773146040Stjr sub_top->str_idx, cls_node, sl_str, OP_CLOSE_SUBEXP); 2774146040Stjr if (err == REG_NOMATCH) 2775146040Stjr continue; 2776146040Stjr if (BE (err != REG_NOERROR, 0)) 2777146040Stjr return err; 2778146040Stjr sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str); 2779146040Stjr if (BE (sub_last == NULL, 0)) 2780146040Stjr return REG_ESPACE; 2781146040Stjr err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node, 2782146040Stjr bkref_str_idx); 2783146040Stjr if (err == REG_NOMATCH) 2784146040Stjr continue; 2785146040Stjr } 2786146040Stjr } 2787146040Stjr return REG_NOERROR; 2788146040Stjr} 2789146040Stjr 2790146040Stjr/* Helper functions for get_subexp(). */ 2791146040Stjr 2792146040Stjr/* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR. 2793146040Stjr If it can arrive, register the sub expression expressed with SUB_TOP 2794146040Stjr and SUB_LAST. */ 2795146040Stjr 2796146040Stjrstatic reg_errcode_t 2797146040Stjrget_subexp_sub (mctx, sub_top, sub_last, bkref_node, bkref_str) 2798146040Stjr re_match_context_t *mctx; 2799146040Stjr const re_sub_match_top_t *sub_top; 2800146040Stjr re_sub_match_last_t *sub_last; 2801146040Stjr int bkref_node, bkref_str; 2802146040Stjr{ 2803146040Stjr reg_errcode_t err; 2804146040Stjr int to_idx; 2805146040Stjr /* Can the subexpression arrive the back reference? */ 2806146040Stjr err = check_arrival (mctx, &sub_last->path, sub_last->node, 2807146040Stjr sub_last->str_idx, bkref_node, bkref_str, OP_OPEN_SUBEXP); 2808146040Stjr if (err != REG_NOERROR) 2809146040Stjr return err; 2810146040Stjr err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx, 2811146040Stjr sub_last->str_idx); 2812146040Stjr if (BE (err != REG_NOERROR, 0)) 2813146040Stjr return err; 2814146040Stjr to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx; 2815146040Stjr return clean_state_log_if_needed (mctx, to_idx); 2816146040Stjr} 2817146040Stjr 2818146040Stjr/* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX. 2819146040Stjr Search '(' if FL_OPEN, or search ')' otherwise. 2820146040Stjr TODO: This function isn't efficient... 2821146040Stjr Because there might be more than one nodes whose types are 2822146040Stjr OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all 2823146040Stjr nodes. 2824146040Stjr E.g. RE: (a){2} */ 2825146040Stjr 2826146040Stjrstatic int 2827146040Stjrfind_subexp_node (dfa, nodes, subexp_idx, type) 2828146040Stjr const re_dfa_t *dfa; 2829146040Stjr const re_node_set *nodes; 2830146040Stjr int subexp_idx, type; 2831146040Stjr{ 2832146040Stjr int cls_idx; 2833146040Stjr for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx) 2834146040Stjr { 2835146040Stjr int cls_node = nodes->elems[cls_idx]; 2836146040Stjr const re_token_t *node = dfa->nodes + cls_node; 2837146040Stjr if (node->type == type 2838146040Stjr && node->opr.idx == subexp_idx) 2839146040Stjr return cls_node; 2840146040Stjr } 2841146040Stjr return -1; 2842146040Stjr} 2843146040Stjr 2844146040Stjr/* Check whether the node TOP_NODE at TOP_STR can arrive to the node 2845146040Stjr LAST_NODE at LAST_STR. We record the path onto PATH since it will be 2846146040Stjr heavily reused. 2847146040Stjr Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */ 2848146040Stjr 2849146040Stjrstatic reg_errcode_t 2850146040Stjrcheck_arrival (mctx, path, top_node, top_str, last_node, last_str, 2851146040Stjr type) 2852146040Stjr re_match_context_t *mctx; 2853146040Stjr state_array_t *path; 2854146040Stjr int top_node, top_str, last_node, last_str, type; 2855146040Stjr{ 2856146040Stjr re_dfa_t *const dfa = mctx->dfa; 2857146040Stjr reg_errcode_t err; 2858146040Stjr int subexp_num, backup_cur_idx, str_idx, null_cnt; 2859146040Stjr re_dfastate_t *cur_state = NULL; 2860146040Stjr re_node_set *cur_nodes, next_nodes; 2861146040Stjr re_dfastate_t **backup_state_log; 2862146040Stjr unsigned int context; 2863146040Stjr 2864146040Stjr subexp_num = dfa->nodes[top_node].opr.idx; 2865146040Stjr /* Extend the buffer if we need. */ 2866146040Stjr if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0)) 2867146040Stjr { 2868146040Stjr re_dfastate_t **new_array; 2869146040Stjr int old_alloc = path->alloc; 2870146040Stjr path->alloc += last_str + mctx->max_mb_elem_len + 1; 2871146040Stjr new_array = re_realloc (path->array, re_dfastate_t *, path->alloc); 2872146040Stjr if (new_array == NULL) 2873146040Stjr { 2874146040Stjr path->alloc = old_alloc; 2875146040Stjr return REG_ESPACE; 2876146040Stjr } 2877146040Stjr path->array = new_array; 2878146040Stjr memset (new_array + old_alloc, '\0', 2879146040Stjr sizeof (re_dfastate_t *) * (path->alloc - old_alloc)); 2880146040Stjr } 2881146040Stjr 2882146040Stjr str_idx = path->next_idx == 0 ? top_str : path->next_idx; 2883146040Stjr 2884146040Stjr /* Temporary modify MCTX. */ 2885146040Stjr backup_state_log = mctx->state_log; 2886146040Stjr backup_cur_idx = mctx->input.cur_idx; 2887146040Stjr mctx->state_log = path->array; 2888146040Stjr mctx->input.cur_idx = str_idx; 2889146040Stjr 2890146040Stjr /* Setup initial node set. */ 2891146040Stjr context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags); 2892146040Stjr if (str_idx == top_str) 2893146040Stjr { 2894146040Stjr err = re_node_set_init_1 (&next_nodes, top_node); 2895146040Stjr if (BE (err != REG_NOERROR, 0)) 2896146040Stjr return err; 2897146040Stjr err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type); 2898146040Stjr if (BE (err != REG_NOERROR, 0)) 2899146040Stjr { 2900146040Stjr re_node_set_free (&next_nodes); 2901146040Stjr return err; 2902146040Stjr } 2903146040Stjr } 2904146040Stjr else 2905146040Stjr { 2906146040Stjr cur_state = mctx->state_log[str_idx]; 2907146040Stjr if (cur_state && cur_state->has_backref) 2908146040Stjr { 2909146040Stjr err = re_node_set_init_copy (&next_nodes, &cur_state->nodes); 2910146040Stjr if (BE ( err != REG_NOERROR, 0)) 2911146040Stjr return err; 2912146040Stjr } 2913146040Stjr else 2914146040Stjr re_node_set_init_empty (&next_nodes); 2915146040Stjr } 2916146040Stjr if (str_idx == top_str || (cur_state && cur_state->has_backref)) 2917146040Stjr { 2918146040Stjr if (next_nodes.nelem) 2919146040Stjr { 2920146040Stjr err = expand_bkref_cache (mctx, &next_nodes, str_idx, 2921146040Stjr subexp_num, type); 2922146040Stjr if (BE ( err != REG_NOERROR, 0)) 2923146040Stjr { 2924146040Stjr re_node_set_free (&next_nodes); 2925146040Stjr return err; 2926146040Stjr } 2927146040Stjr } 2928146040Stjr cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context); 2929146040Stjr if (BE (cur_state == NULL && err != REG_NOERROR, 0)) 2930146040Stjr { 2931146040Stjr re_node_set_free (&next_nodes); 2932146040Stjr return err; 2933146040Stjr } 2934146040Stjr mctx->state_log[str_idx] = cur_state; 2935146040Stjr } 2936146040Stjr 2937146040Stjr for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;) 2938146040Stjr { 2939146040Stjr re_node_set_empty (&next_nodes); 2940146040Stjr if (mctx->state_log[str_idx + 1]) 2941146040Stjr { 2942146040Stjr err = re_node_set_merge (&next_nodes, 2943146040Stjr &mctx->state_log[str_idx + 1]->nodes); 2944146040Stjr if (BE (err != REG_NOERROR, 0)) 2945146040Stjr { 2946146040Stjr re_node_set_free (&next_nodes); 2947146040Stjr return err; 2948146040Stjr } 2949146040Stjr } 2950146040Stjr if (cur_state) 2951146040Stjr { 2952146040Stjr err = check_arrival_add_next_nodes (mctx, str_idx, 2953146040Stjr &cur_state->non_eps_nodes, &next_nodes); 2954146040Stjr if (BE (err != REG_NOERROR, 0)) 2955146040Stjr { 2956146040Stjr re_node_set_free (&next_nodes); 2957146040Stjr return err; 2958146040Stjr } 2959146040Stjr } 2960146040Stjr ++str_idx; 2961146040Stjr if (next_nodes.nelem) 2962146040Stjr { 2963146040Stjr err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type); 2964146040Stjr if (BE (err != REG_NOERROR, 0)) 2965146040Stjr { 2966146040Stjr re_node_set_free (&next_nodes); 2967146040Stjr return err; 2968146040Stjr } 2969146040Stjr err = expand_bkref_cache (mctx, &next_nodes, str_idx, 2970146040Stjr subexp_num, type); 2971146040Stjr if (BE ( err != REG_NOERROR, 0)) 2972146040Stjr { 2973146040Stjr re_node_set_free (&next_nodes); 2974146040Stjr return err; 2975146040Stjr } 2976146040Stjr } 2977146040Stjr context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags); 2978146040Stjr cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context); 2979146040Stjr if (BE (cur_state == NULL && err != REG_NOERROR, 0)) 2980146040Stjr { 2981146040Stjr re_node_set_free (&next_nodes); 2982146040Stjr return err; 2983146040Stjr } 2984146040Stjr mctx->state_log[str_idx] = cur_state; 2985146040Stjr null_cnt = cur_state == NULL ? null_cnt + 1 : 0; 2986146040Stjr } 2987146040Stjr re_node_set_free (&next_nodes); 2988146040Stjr cur_nodes = (mctx->state_log[last_str] == NULL ? NULL 2989146040Stjr : &mctx->state_log[last_str]->nodes); 2990146040Stjr path->next_idx = str_idx; 2991146040Stjr 2992146040Stjr /* Fix MCTX. */ 2993146040Stjr mctx->state_log = backup_state_log; 2994146040Stjr mctx->input.cur_idx = backup_cur_idx; 2995146040Stjr 2996146040Stjr /* Then check the current node set has the node LAST_NODE. */ 2997146040Stjr if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node)) 2998146040Stjr return REG_NOERROR; 2999146040Stjr 3000146040Stjr return REG_NOMATCH; 3001146040Stjr} 3002146040Stjr 3003146040Stjr/* Helper functions for check_arrival. */ 3004146040Stjr 3005146040Stjr/* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them 3006146040Stjr to NEXT_NODES. 3007146040Stjr TODO: This function is similar to the functions transit_state*(), 3008146040Stjr however this function has many additional works. 3009146040Stjr Can't we unify them? */ 3010146040Stjr 3011146040Stjrstatic reg_errcode_t 3012146040Stjrcheck_arrival_add_next_nodes (mctx, str_idx, cur_nodes, next_nodes) 3013146040Stjr re_match_context_t *mctx; 3014146040Stjr int str_idx; 3015146040Stjr re_node_set *cur_nodes, *next_nodes; 3016146040Stjr{ 3017146040Stjr re_dfa_t *const dfa = mctx->dfa; 3018146040Stjr int result; 3019146040Stjr int cur_idx; 3020146040Stjr reg_errcode_t err; 3021146040Stjr re_node_set union_set; 3022146040Stjr re_node_set_init_empty (&union_set); 3023146040Stjr for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx) 3024146040Stjr { 3025146040Stjr int naccepted = 0; 3026146040Stjr int cur_node = cur_nodes->elems[cur_idx]; 3027146040Stjr#ifdef DEBUG 3028146040Stjr re_token_type_t type = dfa->nodes[cur_node].type; 3029146040Stjr assert (!IS_EPSILON_NODE (type)); 3030146040Stjr#endif 3031146040Stjr#ifdef RE_ENABLE_I18N 3032146040Stjr /* If the node may accept `multi byte'. */ 3033146040Stjr if (dfa->nodes[cur_node].accept_mb) 3034146040Stjr { 3035146040Stjr naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input, 3036146040Stjr str_idx); 3037146040Stjr if (naccepted > 1) 3038146040Stjr { 3039146040Stjr re_dfastate_t *dest_state; 3040146040Stjr int next_node = dfa->nexts[cur_node]; 3041146040Stjr int next_idx = str_idx + naccepted; 3042146040Stjr dest_state = mctx->state_log[next_idx]; 3043146040Stjr re_node_set_empty (&union_set); 3044146040Stjr if (dest_state) 3045146040Stjr { 3046146040Stjr err = re_node_set_merge (&union_set, &dest_state->nodes); 3047146040Stjr if (BE (err != REG_NOERROR, 0)) 3048146040Stjr { 3049146040Stjr re_node_set_free (&union_set); 3050146040Stjr return err; 3051146040Stjr } 3052146040Stjr } 3053146040Stjr result = re_node_set_insert (&union_set, next_node); 3054146040Stjr if (BE (result < 0, 0)) 3055146040Stjr { 3056146040Stjr re_node_set_free (&union_set); 3057146040Stjr return REG_ESPACE; 3058146040Stjr } 3059146040Stjr mctx->state_log[next_idx] = re_acquire_state (&err, dfa, 3060146040Stjr &union_set); 3061146040Stjr if (BE (mctx->state_log[next_idx] == NULL 3062146040Stjr && err != REG_NOERROR, 0)) 3063146040Stjr { 3064146040Stjr re_node_set_free (&union_set); 3065146040Stjr return err; 3066146040Stjr } 3067146040Stjr } 3068146040Stjr } 3069146040Stjr#endif /* RE_ENABLE_I18N */ 3070146040Stjr if (naccepted 3071146040Stjr || check_node_accept (mctx, dfa->nodes + cur_node, str_idx)) 3072146040Stjr { 3073146040Stjr result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]); 3074146040Stjr if (BE (result < 0, 0)) 3075146040Stjr { 3076146040Stjr re_node_set_free (&union_set); 3077146040Stjr return REG_ESPACE; 3078146040Stjr } 3079146040Stjr } 3080146040Stjr } 3081146040Stjr re_node_set_free (&union_set); 3082146040Stjr return REG_NOERROR; 3083146040Stjr} 3084146040Stjr 3085146040Stjr/* For all the nodes in CUR_NODES, add the epsilon closures of them to 3086146040Stjr CUR_NODES, however exclude the nodes which are: 3087146040Stjr - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN. 3088146040Stjr - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN. 3089146040Stjr*/ 3090146040Stjr 3091146040Stjrstatic reg_errcode_t 3092146040Stjrcheck_arrival_expand_ecl (dfa, cur_nodes, ex_subexp, type) 3093146040Stjr re_dfa_t *dfa; 3094146040Stjr re_node_set *cur_nodes; 3095146040Stjr int ex_subexp, type; 3096146040Stjr{ 3097146040Stjr reg_errcode_t err; 3098146040Stjr int idx, outside_node; 3099146040Stjr re_node_set new_nodes; 3100146040Stjr#ifdef DEBUG 3101146040Stjr assert (cur_nodes->nelem); 3102146040Stjr#endif 3103146040Stjr err = re_node_set_alloc (&new_nodes, cur_nodes->nelem); 3104146040Stjr if (BE (err != REG_NOERROR, 0)) 3105146040Stjr return err; 3106146040Stjr /* Create a new node set NEW_NODES with the nodes which are epsilon 3107146040Stjr closures of the node in CUR_NODES. */ 3108146040Stjr 3109146040Stjr for (idx = 0; idx < cur_nodes->nelem; ++idx) 3110146040Stjr { 3111146040Stjr int cur_node = cur_nodes->elems[idx]; 3112146040Stjr re_node_set *eclosure = dfa->eclosures + cur_node; 3113146040Stjr outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type); 3114146040Stjr if (outside_node == -1) 3115146040Stjr { 3116146040Stjr /* There are no problematic nodes, just merge them. */ 3117146040Stjr err = re_node_set_merge (&new_nodes, eclosure); 3118146040Stjr if (BE (err != REG_NOERROR, 0)) 3119146040Stjr { 3120146040Stjr re_node_set_free (&new_nodes); 3121146040Stjr return err; 3122146040Stjr } 3123146040Stjr } 3124146040Stjr else 3125146040Stjr { 3126146040Stjr /* There are problematic nodes, re-calculate incrementally. */ 3127146040Stjr err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node, 3128146040Stjr ex_subexp, type); 3129146040Stjr if (BE (err != REG_NOERROR, 0)) 3130146040Stjr { 3131146040Stjr re_node_set_free (&new_nodes); 3132146040Stjr return err; 3133146040Stjr } 3134146040Stjr } 3135146040Stjr } 3136146040Stjr re_node_set_free (cur_nodes); 3137146040Stjr *cur_nodes = new_nodes; 3138146040Stjr return REG_NOERROR; 3139146040Stjr} 3140146040Stjr 3141146040Stjr/* Helper function for check_arrival_expand_ecl. 3142146040Stjr Check incrementally the epsilon closure of TARGET, and if it isn't 3143146040Stjr problematic append it to DST_NODES. */ 3144146040Stjr 3145146040Stjrstatic reg_errcode_t 3146146040Stjrcheck_arrival_expand_ecl_sub (dfa, dst_nodes, target, ex_subexp, type) 3147146040Stjr re_dfa_t *dfa; 3148146040Stjr int target, ex_subexp, type; 3149146040Stjr re_node_set *dst_nodes; 3150146040Stjr{ 3151146040Stjr int cur_node; 3152146040Stjr for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);) 3153146040Stjr { 3154146040Stjr int err; 3155146040Stjr 3156146040Stjr if (dfa->nodes[cur_node].type == type 3157146040Stjr && dfa->nodes[cur_node].opr.idx == ex_subexp) 3158146040Stjr { 3159146040Stjr if (type == OP_CLOSE_SUBEXP) 3160146040Stjr { 3161146040Stjr err = re_node_set_insert (dst_nodes, cur_node); 3162146040Stjr if (BE (err == -1, 0)) 3163146040Stjr return REG_ESPACE; 3164146040Stjr } 3165146040Stjr break; 3166146040Stjr } 3167146040Stjr err = re_node_set_insert (dst_nodes, cur_node); 3168146040Stjr if (BE (err == -1, 0)) 3169146040Stjr return REG_ESPACE; 3170146040Stjr if (dfa->edests[cur_node].nelem == 0) 3171146040Stjr break; 3172146040Stjr if (dfa->edests[cur_node].nelem == 2) 3173146040Stjr { 3174146040Stjr err = check_arrival_expand_ecl_sub (dfa, dst_nodes, 3175146040Stjr dfa->edests[cur_node].elems[1], 3176146040Stjr ex_subexp, type); 3177146040Stjr if (BE (err != REG_NOERROR, 0)) 3178146040Stjr return err; 3179146040Stjr } 3180146040Stjr cur_node = dfa->edests[cur_node].elems[0]; 3181146040Stjr } 3182146040Stjr return REG_NOERROR; 3183146040Stjr} 3184146040Stjr 3185146040Stjr 3186146040Stjr/* For all the back references in the current state, calculate the 3187146040Stjr destination of the back references by the appropriate entry 3188146040Stjr in MCTX->BKREF_ENTS. */ 3189146040Stjr 3190146040Stjrstatic reg_errcode_t 3191146040Stjrexpand_bkref_cache (mctx, cur_nodes, cur_str, subexp_num, 3192146040Stjr type) 3193146040Stjr re_match_context_t *mctx; 3194146040Stjr int cur_str, subexp_num, type; 3195146040Stjr re_node_set *cur_nodes; 3196146040Stjr{ 3197146040Stjr re_dfa_t *const dfa = mctx->dfa; 3198146040Stjr reg_errcode_t err; 3199146040Stjr int cache_idx_start = search_cur_bkref_entry (mctx, cur_str); 3200146040Stjr struct re_backref_cache_entry *ent; 3201146040Stjr 3202146040Stjr if (cache_idx_start == -1) 3203146040Stjr return REG_NOERROR; 3204146040Stjr 3205146040Stjr restart: 3206146040Stjr ent = mctx->bkref_ents + cache_idx_start; 3207146040Stjr do 3208146040Stjr { 3209146040Stjr int to_idx, next_node; 3210146040Stjr 3211146040Stjr /* Is this entry ENT is appropriate? */ 3212146040Stjr if (!re_node_set_contains (cur_nodes, ent->node)) 3213146040Stjr continue; /* No. */ 3214146040Stjr 3215146040Stjr to_idx = cur_str + ent->subexp_to - ent->subexp_from; 3216146040Stjr /* Calculate the destination of the back reference, and append it 3217146040Stjr to MCTX->STATE_LOG. */ 3218146040Stjr if (to_idx == cur_str) 3219146040Stjr { 3220146040Stjr /* The backreference did epsilon transit, we must re-check all the 3221146040Stjr node in the current state. */ 3222146040Stjr re_node_set new_dests; 3223146040Stjr reg_errcode_t err2, err3; 3224146040Stjr next_node = dfa->edests[ent->node].elems[0]; 3225146040Stjr if (re_node_set_contains (cur_nodes, next_node)) 3226146040Stjr continue; 3227146040Stjr err = re_node_set_init_1 (&new_dests, next_node); 3228146040Stjr err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type); 3229146040Stjr err3 = re_node_set_merge (cur_nodes, &new_dests); 3230146040Stjr re_node_set_free (&new_dests); 3231146040Stjr if (BE (err != REG_NOERROR || err2 != REG_NOERROR 3232146040Stjr || err3 != REG_NOERROR, 0)) 3233146040Stjr { 3234146040Stjr err = (err != REG_NOERROR ? err 3235146040Stjr : (err2 != REG_NOERROR ? err2 : err3)); 3236146040Stjr return err; 3237146040Stjr } 3238146040Stjr /* TODO: It is still inefficient... */ 3239146040Stjr goto restart; 3240146040Stjr } 3241146040Stjr else 3242146040Stjr { 3243146040Stjr re_node_set union_set; 3244146040Stjr next_node = dfa->nexts[ent->node]; 3245146040Stjr if (mctx->state_log[to_idx]) 3246146040Stjr { 3247146040Stjr int ret; 3248146040Stjr if (re_node_set_contains (&mctx->state_log[to_idx]->nodes, 3249146040Stjr next_node)) 3250146040Stjr continue; 3251146040Stjr err = re_node_set_init_copy (&union_set, 3252146040Stjr &mctx->state_log[to_idx]->nodes); 3253146040Stjr ret = re_node_set_insert (&union_set, next_node); 3254146040Stjr if (BE (err != REG_NOERROR || ret < 0, 0)) 3255146040Stjr { 3256146040Stjr re_node_set_free (&union_set); 3257146040Stjr err = err != REG_NOERROR ? err : REG_ESPACE; 3258146040Stjr return err; 3259146040Stjr } 3260146040Stjr } 3261146040Stjr else 3262146040Stjr { 3263146040Stjr err = re_node_set_init_1 (&union_set, next_node); 3264146040Stjr if (BE (err != REG_NOERROR, 0)) 3265146040Stjr return err; 3266146040Stjr } 3267146040Stjr mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set); 3268146040Stjr re_node_set_free (&union_set); 3269146040Stjr if (BE (mctx->state_log[to_idx] == NULL 3270146040Stjr && err != REG_NOERROR, 0)) 3271146040Stjr return err; 3272146040Stjr } 3273146040Stjr } 3274146040Stjr while (ent++->more); 3275146040Stjr return REG_NOERROR; 3276146040Stjr} 3277146040Stjr 3278146040Stjr/* Build transition table for the state. 3279146040Stjr Return 1 if succeeded, otherwise return NULL. */ 3280146040Stjr 3281146040Stjrstatic int 3282146040Stjrbuild_trtable (dfa, state) 3283146040Stjr re_dfa_t *dfa; 3284146040Stjr re_dfastate_t *state; 3285146040Stjr{ 3286146040Stjr reg_errcode_t err; 3287146040Stjr int i, j, ch, need_word_trtable = 0; 3288146040Stjr unsigned int elem, mask; 3289146040Stjr int dests_node_malloced = 0, dest_states_malloced = 0; 3290146040Stjr int ndests; /* Number of the destination states from `state'. */ 3291146040Stjr re_dfastate_t **trtable; 3292146040Stjr re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl; 3293146040Stjr re_node_set follows, *dests_node; 3294146040Stjr bitset *dests_ch; 3295146040Stjr bitset acceptable; 3296146040Stjr 3297146040Stjr /* We build DFA states which corresponds to the destination nodes 3298146040Stjr from `state'. `dests_node[i]' represents the nodes which i-th 3299146040Stjr destination state contains, and `dests_ch[i]' represents the 3300146040Stjr characters which i-th destination state accepts. */ 3301146040Stjr#ifdef _LIBC 3302146040Stjr if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX)) 3303146040Stjr dests_node = (re_node_set *) 3304146040Stjr alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX); 3305146040Stjr else 3306146040Stjr#endif 3307146040Stjr { 3308146040Stjr dests_node = (re_node_set *) 3309146040Stjr malloc ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX); 3310146040Stjr if (BE (dests_node == NULL, 0)) 3311146040Stjr return 0; 3312146040Stjr dests_node_malloced = 1; 3313146040Stjr } 3314146040Stjr dests_ch = (bitset *) (dests_node + SBC_MAX); 3315146040Stjr 3316146040Stjr /* Initialize transiton table. */ 3317146040Stjr state->word_trtable = state->trtable = NULL; 3318146040Stjr 3319146040Stjr /* At first, group all nodes belonging to `state' into several 3320146040Stjr destinations. */ 3321146040Stjr ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch); 3322146040Stjr if (BE (ndests <= 0, 0)) 3323146040Stjr { 3324146040Stjr if (dests_node_malloced) 3325146040Stjr free (dests_node); 3326146040Stjr /* Return 0 in case of an error, 1 otherwise. */ 3327146040Stjr if (ndests == 0) 3328146040Stjr { 3329146040Stjr state->trtable = (re_dfastate_t **) 3330146040Stjr calloc (sizeof (re_dfastate_t *), SBC_MAX); 3331146040Stjr return 1; 3332146040Stjr } 3333146040Stjr return 0; 3334146040Stjr } 3335146040Stjr 3336146040Stjr err = re_node_set_alloc (&follows, ndests + 1); 3337146040Stjr if (BE (err != REG_NOERROR, 0)) 3338146040Stjr goto out_free; 3339146040Stjr 3340146040Stjr#ifdef _LIBC 3341146040Stjr if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX 3342146040Stjr + ndests * 3 * sizeof (re_dfastate_t *))) 3343146040Stjr dest_states = (re_dfastate_t **) 3344146040Stjr alloca (ndests * 3 * sizeof (re_dfastate_t *)); 3345146040Stjr else 3346146040Stjr#endif 3347146040Stjr { 3348146040Stjr dest_states = (re_dfastate_t **) 3349146040Stjr malloc (ndests * 3 * sizeof (re_dfastate_t *)); 3350146040Stjr if (BE (dest_states == NULL, 0)) 3351146040Stjr { 3352146040Stjrout_free: 3353146040Stjr if (dest_states_malloced) 3354146040Stjr free (dest_states); 3355146040Stjr re_node_set_free (&follows); 3356146040Stjr for (i = 0; i < ndests; ++i) 3357146040Stjr re_node_set_free (dests_node + i); 3358146040Stjr if (dests_node_malloced) 3359146040Stjr free (dests_node); 3360146040Stjr return 0; 3361146040Stjr } 3362146040Stjr dest_states_malloced = 1; 3363146040Stjr } 3364146040Stjr dest_states_word = dest_states + ndests; 3365146040Stjr dest_states_nl = dest_states_word + ndests; 3366146040Stjr bitset_empty (acceptable); 3367146040Stjr 3368146040Stjr /* Then build the states for all destinations. */ 3369146040Stjr for (i = 0; i < ndests; ++i) 3370146040Stjr { 3371146040Stjr int next_node; 3372146040Stjr re_node_set_empty (&follows); 3373146040Stjr /* Merge the follows of this destination states. */ 3374146040Stjr for (j = 0; j < dests_node[i].nelem; ++j) 3375146040Stjr { 3376146040Stjr next_node = dfa->nexts[dests_node[i].elems[j]]; 3377146040Stjr if (next_node != -1) 3378146040Stjr { 3379146040Stjr err = re_node_set_merge (&follows, dfa->eclosures + next_node); 3380146040Stjr if (BE (err != REG_NOERROR, 0)) 3381146040Stjr goto out_free; 3382146040Stjr } 3383146040Stjr } 3384146040Stjr dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0); 3385146040Stjr if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0)) 3386146040Stjr goto out_free; 3387146040Stjr /* If the new state has context constraint, 3388146040Stjr build appropriate states for these contexts. */ 3389146040Stjr if (dest_states[i]->has_constraint) 3390146040Stjr { 3391146040Stjr dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows, 3392146040Stjr CONTEXT_WORD); 3393146040Stjr if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0)) 3394146040Stjr goto out_free; 3395146040Stjr 3396146040Stjr if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1) 3397146040Stjr need_word_trtable = 1; 3398146040Stjr 3399146040Stjr dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows, 3400146040Stjr CONTEXT_NEWLINE); 3401146040Stjr if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0)) 3402146040Stjr goto out_free; 3403146040Stjr } 3404146040Stjr else 3405146040Stjr { 3406146040Stjr dest_states_word[i] = dest_states[i]; 3407146040Stjr dest_states_nl[i] = dest_states[i]; 3408146040Stjr } 3409146040Stjr bitset_merge (acceptable, dests_ch[i]); 3410146040Stjr } 3411146040Stjr 3412146040Stjr if (!BE (need_word_trtable, 0)) 3413146040Stjr { 3414146040Stjr /* We don't care about whether the following character is a word 3415146040Stjr character, or we are in a single-byte character set so we can 3416146040Stjr discern by looking at the character code: allocate a 3417146040Stjr 256-entry transition table. */ 3418146040Stjr trtable = state->trtable = 3419146040Stjr (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX); 3420146040Stjr if (BE (trtable == NULL, 0)) 3421146040Stjr goto out_free; 3422146040Stjr 3423146040Stjr /* For all characters ch...: */ 3424146040Stjr for (i = 0; i < BITSET_UINTS; ++i) 3425146040Stjr for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1; 3426146040Stjr elem; 3427146040Stjr mask <<= 1, elem >>= 1, ++ch) 3428146040Stjr if (BE (elem & 1, 0)) 3429146040Stjr { 3430146040Stjr /* There must be exactly one destination which accepts 3431146040Stjr character ch. See group_nodes_into_DFAstates. */ 3432146040Stjr for (j = 0; (dests_ch[j][i] & mask) == 0; ++j) 3433146040Stjr ; 3434146040Stjr 3435146040Stjr /* j-th destination accepts the word character ch. */ 3436146040Stjr if (dfa->word_char[i] & mask) 3437146040Stjr trtable[ch] = dest_states_word[j]; 3438146040Stjr else 3439146040Stjr trtable[ch] = dest_states[j]; 3440146040Stjr } 3441146040Stjr } 3442146040Stjr else 3443146040Stjr { 3444146040Stjr /* We care about whether the following character is a word 3445146040Stjr character, and we are in a multi-byte character set: discern 3446146040Stjr by looking at the character code: build two 256-entry 3447146040Stjr transition tables, one starting at trtable[0] and one 3448146040Stjr starting at trtable[SBC_MAX]. */ 3449146040Stjr trtable = state->word_trtable = 3450146040Stjr (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX); 3451146040Stjr if (BE (trtable == NULL, 0)) 3452146040Stjr goto out_free; 3453146040Stjr 3454146040Stjr /* For all characters ch...: */ 3455146040Stjr for (i = 0; i < BITSET_UINTS; ++i) 3456146040Stjr for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1; 3457146040Stjr elem; 3458146040Stjr mask <<= 1, elem >>= 1, ++ch) 3459146040Stjr if (BE (elem & 1, 0)) 3460146040Stjr { 3461146040Stjr /* There must be exactly one destination which accepts 3462146040Stjr character ch. See group_nodes_into_DFAstates. */ 3463146040Stjr for (j = 0; (dests_ch[j][i] & mask) == 0; ++j) 3464146040Stjr ; 3465146040Stjr 3466146040Stjr /* j-th destination accepts the word character ch. */ 3467146040Stjr trtable[ch] = dest_states[j]; 3468146040Stjr trtable[ch + SBC_MAX] = dest_states_word[j]; 3469146040Stjr } 3470146040Stjr } 3471146040Stjr 3472146040Stjr /* new line */ 3473146040Stjr if (bitset_contain (acceptable, NEWLINE_CHAR)) 3474146040Stjr { 3475146040Stjr /* The current state accepts newline character. */ 3476146040Stjr for (j = 0; j < ndests; ++j) 3477146040Stjr if (bitset_contain (dests_ch[j], NEWLINE_CHAR)) 3478146040Stjr { 3479146040Stjr /* k-th destination accepts newline character. */ 3480146040Stjr trtable[NEWLINE_CHAR] = dest_states_nl[j]; 3481146040Stjr if (need_word_trtable) 3482146040Stjr trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j]; 3483146040Stjr /* There must be only one destination which accepts 3484146040Stjr newline. See group_nodes_into_DFAstates. */ 3485146040Stjr break; 3486146040Stjr } 3487146040Stjr } 3488146040Stjr 3489146040Stjr if (dest_states_malloced) 3490146040Stjr free (dest_states); 3491146040Stjr 3492146040Stjr re_node_set_free (&follows); 3493146040Stjr for (i = 0; i < ndests; ++i) 3494146040Stjr re_node_set_free (dests_node + i); 3495146040Stjr 3496146040Stjr if (dests_node_malloced) 3497146040Stjr free (dests_node); 3498146040Stjr 3499146040Stjr return 1; 3500146040Stjr} 3501146040Stjr 3502146040Stjr/* Group all nodes belonging to STATE into several destinations. 3503146040Stjr Then for all destinations, set the nodes belonging to the destination 3504146040Stjr to DESTS_NODE[i] and set the characters accepted by the destination 3505146040Stjr to DEST_CH[i]. This function return the number of destinations. */ 3506146040Stjr 3507146040Stjrstatic int 3508146040Stjrgroup_nodes_into_DFAstates (dfa, state, dests_node, dests_ch) 3509146040Stjr re_dfa_t *dfa; 3510146040Stjr const re_dfastate_t *state; 3511146040Stjr re_node_set *dests_node; 3512146040Stjr bitset *dests_ch; 3513146040Stjr{ 3514146040Stjr reg_errcode_t err; 3515146040Stjr int result; 3516146040Stjr int i, j, k; 3517146040Stjr int ndests; /* Number of the destinations from `state'. */ 3518146040Stjr bitset accepts; /* Characters a node can accept. */ 3519146040Stjr const re_node_set *cur_nodes = &state->nodes; 3520146040Stjr bitset_empty (accepts); 3521146040Stjr ndests = 0; 3522146040Stjr 3523146040Stjr /* For all the nodes belonging to `state', */ 3524146040Stjr for (i = 0; i < cur_nodes->nelem; ++i) 3525146040Stjr { 3526146040Stjr re_token_t *node = &dfa->nodes[cur_nodes->elems[i]]; 3527146040Stjr re_token_type_t type = node->type; 3528146040Stjr unsigned int constraint = node->constraint; 3529146040Stjr 3530146040Stjr /* Enumerate all single byte character this node can accept. */ 3531146040Stjr if (type == CHARACTER) 3532146040Stjr bitset_set (accepts, node->opr.c); 3533146040Stjr else if (type == SIMPLE_BRACKET) 3534146040Stjr { 3535146040Stjr bitset_merge (accepts, node->opr.sbcset); 3536146040Stjr } 3537146040Stjr else if (type == OP_PERIOD) 3538146040Stjr { 3539146040Stjr#ifdef RE_ENABLE_I18N 3540146040Stjr if (dfa->mb_cur_max > 1) 3541146040Stjr bitset_merge (accepts, dfa->sb_char); 3542146040Stjr else 3543146040Stjr#endif 3544146040Stjr bitset_set_all (accepts); 3545146040Stjr if (!(dfa->syntax & RE_DOT_NEWLINE)) 3546146040Stjr bitset_clear (accepts, '\n'); 3547146040Stjr if (dfa->syntax & RE_DOT_NOT_NULL) 3548146040Stjr bitset_clear (accepts, '\0'); 3549146040Stjr } 3550146040Stjr#ifdef RE_ENABLE_I18N 3551146040Stjr else if (type == OP_UTF8_PERIOD) 3552146040Stjr { 3553146040Stjr memset (accepts, 255, sizeof (unsigned int) * BITSET_UINTS / 2); 3554146040Stjr if (!(dfa->syntax & RE_DOT_NEWLINE)) 3555146040Stjr bitset_clear (accepts, '\n'); 3556146040Stjr if (dfa->syntax & RE_DOT_NOT_NULL) 3557146040Stjr bitset_clear (accepts, '\0'); 3558146040Stjr } 3559146040Stjr#endif 3560146040Stjr else 3561146040Stjr continue; 3562146040Stjr 3563146040Stjr /* Check the `accepts' and sift the characters which are not 3564146040Stjr match it the context. */ 3565146040Stjr if (constraint) 3566146040Stjr { 3567146040Stjr if (constraint & NEXT_NEWLINE_CONSTRAINT) 3568146040Stjr { 3569146040Stjr int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR); 3570146040Stjr bitset_empty (accepts); 3571146040Stjr if (accepts_newline) 3572146040Stjr bitset_set (accepts, NEWLINE_CHAR); 3573146040Stjr else 3574146040Stjr continue; 3575146040Stjr } 3576146040Stjr if (constraint & NEXT_ENDBUF_CONSTRAINT) 3577146040Stjr { 3578146040Stjr bitset_empty (accepts); 3579146040Stjr continue; 3580146040Stjr } 3581146040Stjr 3582146040Stjr if (constraint & NEXT_WORD_CONSTRAINT) 3583146040Stjr { 3584146040Stjr unsigned int any_set = 0; 3585146040Stjr if (type == CHARACTER && !node->word_char) 3586146040Stjr { 3587146040Stjr bitset_empty (accepts); 3588146040Stjr continue; 3589146040Stjr } 3590146040Stjr#ifdef RE_ENABLE_I18N 3591146040Stjr if (dfa->mb_cur_max > 1) 3592146040Stjr for (j = 0; j < BITSET_UINTS; ++j) 3593146040Stjr any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j])); 3594146040Stjr else 3595146040Stjr#endif 3596146040Stjr for (j = 0; j < BITSET_UINTS; ++j) 3597146040Stjr any_set |= (accepts[j] &= dfa->word_char[j]); 3598146040Stjr if (!any_set) 3599146040Stjr continue; 3600146040Stjr } 3601146040Stjr if (constraint & NEXT_NOTWORD_CONSTRAINT) 3602146040Stjr { 3603146040Stjr unsigned int any_set = 0; 3604146040Stjr if (type == CHARACTER && node->word_char) 3605146040Stjr { 3606146040Stjr bitset_empty (accepts); 3607146040Stjr continue; 3608146040Stjr } 3609146040Stjr#ifdef RE_ENABLE_I18N 3610146040Stjr if (dfa->mb_cur_max > 1) 3611146040Stjr for (j = 0; j < BITSET_UINTS; ++j) 3612146040Stjr any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j])); 3613146040Stjr else 3614146040Stjr#endif 3615146040Stjr for (j = 0; j < BITSET_UINTS; ++j) 3616146040Stjr any_set |= (accepts[j] &= ~dfa->word_char[j]); 3617146040Stjr if (!any_set) 3618146040Stjr continue; 3619146040Stjr } 3620146040Stjr } 3621146040Stjr 3622146040Stjr /* Then divide `accepts' into DFA states, or create a new 3623146040Stjr state. Above, we make sure that accepts is not empty. */ 3624146040Stjr for (j = 0; j < ndests; ++j) 3625146040Stjr { 3626146040Stjr bitset intersec; /* Intersection sets, see below. */ 3627146040Stjr bitset remains; 3628146040Stjr /* Flags, see below. */ 3629146040Stjr int has_intersec, not_subset, not_consumed; 3630146040Stjr 3631146040Stjr /* Optimization, skip if this state doesn't accept the character. */ 3632146040Stjr if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c)) 3633146040Stjr continue; 3634146040Stjr 3635146040Stjr /* Enumerate the intersection set of this state and `accepts'. */ 3636146040Stjr has_intersec = 0; 3637146040Stjr for (k = 0; k < BITSET_UINTS; ++k) 3638146040Stjr has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k]; 3639146040Stjr /* And skip if the intersection set is empty. */ 3640146040Stjr if (!has_intersec) 3641146040Stjr continue; 3642146040Stjr 3643146040Stjr /* Then check if this state is a subset of `accepts'. */ 3644146040Stjr not_subset = not_consumed = 0; 3645146040Stjr for (k = 0; k < BITSET_UINTS; ++k) 3646146040Stjr { 3647146040Stjr not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k]; 3648146040Stjr not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k]; 3649146040Stjr } 3650146040Stjr 3651146040Stjr /* If this state isn't a subset of `accepts', create a 3652146040Stjr new group state, which has the `remains'. */ 3653146040Stjr if (not_subset) 3654146040Stjr { 3655146040Stjr bitset_copy (dests_ch[ndests], remains); 3656146040Stjr bitset_copy (dests_ch[j], intersec); 3657146040Stjr err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]); 3658146040Stjr if (BE (err != REG_NOERROR, 0)) 3659146040Stjr goto error_return; 3660146040Stjr ++ndests; 3661146040Stjr } 3662146040Stjr 3663146040Stjr /* Put the position in the current group. */ 3664146040Stjr result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]); 3665146040Stjr if (BE (result < 0, 0)) 3666146040Stjr goto error_return; 3667146040Stjr 3668146040Stjr /* If all characters are consumed, go to next node. */ 3669146040Stjr if (!not_consumed) 3670146040Stjr break; 3671146040Stjr } 3672146040Stjr /* Some characters remain, create a new group. */ 3673146040Stjr if (j == ndests) 3674146040Stjr { 3675146040Stjr bitset_copy (dests_ch[ndests], accepts); 3676146040Stjr err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]); 3677146040Stjr if (BE (err != REG_NOERROR, 0)) 3678146040Stjr goto error_return; 3679146040Stjr ++ndests; 3680146040Stjr bitset_empty (accepts); 3681146040Stjr } 3682146040Stjr } 3683146040Stjr return ndests; 3684146040Stjr error_return: 3685146040Stjr for (j = 0; j < ndests; ++j) 3686146040Stjr re_node_set_free (dests_node + j); 3687146040Stjr return -1; 3688146040Stjr} 3689146040Stjr 3690146040Stjr#ifdef RE_ENABLE_I18N 3691146040Stjr/* Check how many bytes the node `dfa->nodes[node_idx]' accepts. 3692146040Stjr Return the number of the bytes the node accepts. 3693146040Stjr STR_IDX is the current index of the input string. 3694146040Stjr 3695146040Stjr This function handles the nodes which can accept one character, or 3696146040Stjr one collating element like '.', '[a-z]', opposite to the other nodes 3697146040Stjr can only accept one byte. */ 3698146040Stjr 3699146040Stjrstatic int 3700146040Stjrcheck_node_accept_bytes (dfa, node_idx, input, str_idx) 3701146040Stjr re_dfa_t *dfa; 3702146040Stjr int node_idx, str_idx; 3703146040Stjr const re_string_t *input; 3704146040Stjr{ 3705146040Stjr const re_token_t *node = dfa->nodes + node_idx; 3706146040Stjr int char_len, elem_len; 3707146040Stjr int i; 3708146040Stjr 3709146040Stjr if (BE (node->type == OP_UTF8_PERIOD, 0)) 3710146040Stjr { 3711146040Stjr unsigned char c = re_string_byte_at (input, str_idx), d; 3712146040Stjr if (BE (c < 0xc2, 1)) 3713146040Stjr return 0; 3714146040Stjr 3715146040Stjr if (str_idx + 2 > input->len) 3716146040Stjr return 0; 3717146040Stjr 3718146040Stjr d = re_string_byte_at (input, str_idx + 1); 3719146040Stjr if (c < 0xe0) 3720146040Stjr return (d < 0x80 || d > 0xbf) ? 0 : 2; 3721146040Stjr else if (c < 0xf0) 3722146040Stjr { 3723146040Stjr char_len = 3; 3724146040Stjr if (c == 0xe0 && d < 0xa0) 3725146040Stjr return 0; 3726146040Stjr } 3727146040Stjr else if (c < 0xf8) 3728146040Stjr { 3729146040Stjr char_len = 4; 3730146040Stjr if (c == 0xf0 && d < 0x90) 3731146040Stjr return 0; 3732146040Stjr } 3733146040Stjr else if (c < 0xfc) 3734146040Stjr { 3735146040Stjr char_len = 5; 3736146040Stjr if (c == 0xf8 && d < 0x88) 3737146040Stjr return 0; 3738146040Stjr } 3739146040Stjr else if (c < 0xfe) 3740146040Stjr { 3741146040Stjr char_len = 6; 3742146040Stjr if (c == 0xfc && d < 0x84) 3743146040Stjr return 0; 3744146040Stjr } 3745146040Stjr else 3746146040Stjr return 0; 3747146040Stjr 3748146040Stjr if (str_idx + char_len > input->len) 3749146040Stjr return 0; 3750146040Stjr 3751146040Stjr for (i = 1; i < char_len; ++i) 3752146040Stjr { 3753146040Stjr d = re_string_byte_at (input, str_idx + i); 3754146040Stjr if (d < 0x80 || d > 0xbf) 3755146040Stjr return 0; 3756146040Stjr } 3757146040Stjr return char_len; 3758146040Stjr } 3759146040Stjr 3760146040Stjr char_len = re_string_char_size_at (input, str_idx); 3761146040Stjr if (node->type == OP_PERIOD) 3762146040Stjr { 3763146040Stjr if (char_len <= 1) 3764146040Stjr return 0; 3765146040Stjr /* FIXME: I don't think this if is needed, as both '\n' 3766146040Stjr and '\0' are char_len == 1. */ 3767146040Stjr /* '.' accepts any one character except the following two cases. */ 3768146040Stjr if ((!(dfa->syntax & RE_DOT_NEWLINE) && 3769146040Stjr re_string_byte_at (input, str_idx) == '\n') || 3770146040Stjr ((dfa->syntax & RE_DOT_NOT_NULL) && 3771146040Stjr re_string_byte_at (input, str_idx) == '\0')) 3772146040Stjr return 0; 3773146040Stjr return char_len; 3774146040Stjr } 3775146040Stjr 3776146040Stjr elem_len = re_string_elem_size_at (input, str_idx); 3777146040Stjr if ((elem_len <= 1 && char_len <= 1) || char_len == 0) 3778146040Stjr return 0; 3779146040Stjr 3780146040Stjr if (node->type == COMPLEX_BRACKET) 3781146040Stjr { 3782146040Stjr const re_charset_t *cset = node->opr.mbcset; 3783146040Stjr# ifdef _LIBC 3784146040Stjr const unsigned char *pin 3785146040Stjr = ((const unsigned char *) re_string_get_buffer (input) + str_idx); 3786146040Stjr int j; 3787146040Stjr uint32_t nrules; 3788146040Stjr# endif /* _LIBC */ 3789146040Stjr int match_len = 0; 3790146040Stjr wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars) 3791146040Stjr ? re_string_wchar_at (input, str_idx) : 0); 3792146040Stjr 3793146040Stjr /* match with multibyte character? */ 3794146040Stjr for (i = 0; i < cset->nmbchars; ++i) 3795146040Stjr if (wc == cset->mbchars[i]) 3796146040Stjr { 3797146040Stjr match_len = char_len; 3798146040Stjr goto check_node_accept_bytes_match; 3799146040Stjr } 3800146040Stjr /* match with character_class? */ 3801146040Stjr for (i = 0; i < cset->nchar_classes; ++i) 3802146040Stjr { 3803146040Stjr wctype_t wt = cset->char_classes[i]; 3804146040Stjr if (__iswctype (wc, wt)) 3805146040Stjr { 3806146040Stjr match_len = char_len; 3807146040Stjr goto check_node_accept_bytes_match; 3808146040Stjr } 3809146040Stjr } 3810146040Stjr 3811146040Stjr# ifdef _LIBC 3812146040Stjr nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3813146040Stjr if (nrules != 0) 3814146040Stjr { 3815146040Stjr unsigned int in_collseq = 0; 3816146040Stjr const int32_t *table, *indirect; 3817146040Stjr const unsigned char *weights, *extra; 3818146040Stjr const char *collseqwc; 3819146040Stjr int32_t idx; 3820146040Stjr /* This #include defines a local function! */ 3821146040Stjr# include <locale/weight.h> 3822146040Stjr 3823146040Stjr /* match with collating_symbol? */ 3824146040Stjr if (cset->ncoll_syms) 3825146040Stjr extra = (const unsigned char *) 3826146040Stjr _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB); 3827146040Stjr for (i = 0; i < cset->ncoll_syms; ++i) 3828146040Stjr { 3829146040Stjr const unsigned char *coll_sym = extra + cset->coll_syms[i]; 3830146040Stjr /* Compare the length of input collating element and 3831146040Stjr the length of current collating element. */ 3832146040Stjr if (*coll_sym != elem_len) 3833146040Stjr continue; 3834146040Stjr /* Compare each bytes. */ 3835146040Stjr for (j = 0; j < *coll_sym; j++) 3836146040Stjr if (pin[j] != coll_sym[1 + j]) 3837146040Stjr break; 3838146040Stjr if (j == *coll_sym) 3839146040Stjr { 3840146040Stjr /* Match if every bytes is equal. */ 3841146040Stjr match_len = j; 3842146040Stjr goto check_node_accept_bytes_match; 3843146040Stjr } 3844146040Stjr } 3845146040Stjr 3846146040Stjr if (cset->nranges) 3847146040Stjr { 3848146040Stjr if (elem_len <= char_len) 3849146040Stjr { 3850146040Stjr collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC); 3851146040Stjr in_collseq = __collseq_table_lookup (collseqwc, wc); 3852146040Stjr } 3853146040Stjr else 3854146040Stjr in_collseq = find_collation_sequence_value (pin, elem_len); 3855146040Stjr } 3856146040Stjr /* match with range expression? */ 3857146040Stjr for (i = 0; i < cset->nranges; ++i) 3858146040Stjr if (cset->range_starts[i] <= in_collseq 3859146040Stjr && in_collseq <= cset->range_ends[i]) 3860146040Stjr { 3861146040Stjr match_len = elem_len; 3862146040Stjr goto check_node_accept_bytes_match; 3863146040Stjr } 3864146040Stjr 3865146040Stjr /* match with equivalence_class? */ 3866146040Stjr if (cset->nequiv_classes) 3867146040Stjr { 3868146040Stjr const unsigned char *cp = pin; 3869146040Stjr table = (const int32_t *) 3870146040Stjr _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); 3871146040Stjr weights = (const unsigned char *) 3872146040Stjr _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB); 3873146040Stjr extra = (const unsigned char *) 3874146040Stjr _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB); 3875146040Stjr indirect = (const int32_t *) 3876146040Stjr _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB); 3877146040Stjr idx = findidx (&cp); 3878146040Stjr if (idx > 0) 3879146040Stjr for (i = 0; i < cset->nequiv_classes; ++i) 3880146040Stjr { 3881146040Stjr int32_t equiv_class_idx = cset->equiv_classes[i]; 3882146040Stjr size_t weight_len = weights[idx]; 3883146040Stjr if (weight_len == weights[equiv_class_idx]) 3884146040Stjr { 3885146040Stjr int cnt = 0; 3886146040Stjr while (cnt <= weight_len 3887146040Stjr && (weights[equiv_class_idx + 1 + cnt] 3888146040Stjr == weights[idx + 1 + cnt])) 3889146040Stjr ++cnt; 3890146040Stjr if (cnt > weight_len) 3891146040Stjr { 3892146040Stjr match_len = elem_len; 3893146040Stjr goto check_node_accept_bytes_match; 3894146040Stjr } 3895146040Stjr } 3896146040Stjr } 3897146040Stjr } 3898146040Stjr } 3899146040Stjr else 3900146040Stjr# endif /* _LIBC */ 3901146040Stjr { 3902146040Stjr /* match with range expression? */ 3903146040Stjr#if __GNUC__ >= 2 3904146040Stjr wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'}; 3905146040Stjr#else 3906146040Stjr wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'}; 3907146040Stjr cmp_buf[2] = wc; 3908146040Stjr#endif 3909146040Stjr for (i = 0; i < cset->nranges; ++i) 3910146040Stjr { 3911146040Stjr cmp_buf[0] = cset->range_starts[i]; 3912146040Stjr cmp_buf[4] = cset->range_ends[i]; 3913146040Stjr if (wcscoll (cmp_buf, cmp_buf + 2) <= 0 3914146040Stjr && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0) 3915146040Stjr { 3916146040Stjr match_len = char_len; 3917146040Stjr goto check_node_accept_bytes_match; 3918146040Stjr } 3919146040Stjr } 3920146040Stjr } 3921146040Stjr check_node_accept_bytes_match: 3922146040Stjr if (!cset->non_match) 3923146040Stjr return match_len; 3924146040Stjr else 3925146040Stjr { 3926146040Stjr if (match_len > 0) 3927146040Stjr return 0; 3928146040Stjr else 3929146040Stjr return (elem_len > char_len) ? elem_len : char_len; 3930146040Stjr } 3931146040Stjr } 3932146040Stjr return 0; 3933146040Stjr} 3934146040Stjr 3935146040Stjr# ifdef _LIBC 3936146040Stjrstatic unsigned int 3937146040Stjrfind_collation_sequence_value (mbs, mbs_len) 3938146040Stjr const unsigned char *mbs; 3939146040Stjr size_t mbs_len; 3940146040Stjr{ 3941146040Stjr uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3942146040Stjr if (nrules == 0) 3943146040Stjr { 3944146040Stjr if (mbs_len == 1) 3945146040Stjr { 3946146040Stjr /* No valid character. Match it as a single byte character. */ 3947146040Stjr const unsigned char *collseq = (const unsigned char *) 3948146040Stjr _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB); 3949146040Stjr return collseq[mbs[0]]; 3950146040Stjr } 3951146040Stjr return UINT_MAX; 3952146040Stjr } 3953146040Stjr else 3954146040Stjr { 3955146040Stjr int32_t idx; 3956146040Stjr const unsigned char *extra = (const unsigned char *) 3957146040Stjr _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB); 3958146040Stjr int32_t extrasize = (const unsigned char *) 3959146040Stjr _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra; 3960146040Stjr 3961146040Stjr for (idx = 0; idx < extrasize;) 3962146040Stjr { 3963146040Stjr int mbs_cnt, found = 0; 3964146040Stjr int32_t elem_mbs_len; 3965146040Stjr /* Skip the name of collating element name. */ 3966146040Stjr idx = idx + extra[idx] + 1; 3967146040Stjr elem_mbs_len = extra[idx++]; 3968146040Stjr if (mbs_len == elem_mbs_len) 3969146040Stjr { 3970146040Stjr for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt) 3971146040Stjr if (extra[idx + mbs_cnt] != mbs[mbs_cnt]) 3972146040Stjr break; 3973146040Stjr if (mbs_cnt == elem_mbs_len) 3974146040Stjr /* Found the entry. */ 3975146040Stjr found = 1; 3976146040Stjr } 3977146040Stjr /* Skip the byte sequence of the collating element. */ 3978146040Stjr idx += elem_mbs_len; 3979146040Stjr /* Adjust for the alignment. */ 3980146040Stjr idx = (idx + 3) & ~3; 3981146040Stjr /* Skip the collation sequence value. */ 3982146040Stjr idx += sizeof (uint32_t); 3983146040Stjr /* Skip the wide char sequence of the collating element. */ 3984146040Stjr idx = idx + sizeof (uint32_t) * (extra[idx] + 1); 3985146040Stjr /* If we found the entry, return the sequence value. */ 3986146040Stjr if (found) 3987146040Stjr return *(uint32_t *) (extra + idx); 3988146040Stjr /* Skip the collation sequence value. */ 3989146040Stjr idx += sizeof (uint32_t); 3990146040Stjr } 3991146040Stjr return UINT_MAX; 3992146040Stjr } 3993146040Stjr} 3994146040Stjr# endif /* _LIBC */ 3995146040Stjr#endif /* RE_ENABLE_I18N */ 3996146040Stjr 3997146040Stjr/* Check whether the node accepts the byte which is IDX-th 3998146040Stjr byte of the INPUT. */ 3999146040Stjr 4000146040Stjrstatic int 4001146040Stjrcheck_node_accept (mctx, node, idx) 4002146040Stjr const re_match_context_t *mctx; 4003146040Stjr const re_token_t *node; 4004146040Stjr int idx; 4005146040Stjr{ 4006146040Stjr unsigned char ch; 4007146040Stjr ch = re_string_byte_at (&mctx->input, idx); 4008146040Stjr switch (node->type) 4009146040Stjr { 4010146040Stjr case CHARACTER: 4011146040Stjr if (node->opr.c != ch) 4012146040Stjr return 0; 4013146040Stjr break; 4014146040Stjr 4015146040Stjr case SIMPLE_BRACKET: 4016146040Stjr if (!bitset_contain (node->opr.sbcset, ch)) 4017146040Stjr return 0; 4018146040Stjr break; 4019146040Stjr 4020146040Stjr#ifdef RE_ENABLE_I18N 4021146040Stjr case OP_UTF8_PERIOD: 4022146040Stjr if (ch >= 0x80) 4023146040Stjr return 0; 4024146040Stjr /* FALLTHROUGH */ 4025146040Stjr#endif 4026146040Stjr case OP_PERIOD: 4027146040Stjr if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE)) 4028146040Stjr || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL))) 4029146040Stjr return 0; 4030146040Stjr break; 4031146040Stjr 4032146040Stjr default: 4033146040Stjr return 0; 4034146040Stjr } 4035146040Stjr 4036146040Stjr if (node->constraint) 4037146040Stjr { 4038146040Stjr /* The node has constraints. Check whether the current context 4039146040Stjr satisfies the constraints. */ 4040146040Stjr unsigned int context = re_string_context_at (&mctx->input, idx, 4041146040Stjr mctx->eflags); 4042146040Stjr if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context)) 4043146040Stjr return 0; 4044146040Stjr } 4045146040Stjr 4046146040Stjr return 1; 4047146040Stjr} 4048146040Stjr 4049146040Stjr/* Extend the buffers, if the buffers have run out. */ 4050146040Stjr 4051146040Stjrstatic reg_errcode_t 4052146040Stjrextend_buffers (mctx) 4053146040Stjr re_match_context_t *mctx; 4054146040Stjr{ 4055146040Stjr reg_errcode_t ret; 4056146040Stjr re_string_t *pstr = &mctx->input; 4057146040Stjr 4058146040Stjr /* Double the lengthes of the buffers. */ 4059146040Stjr ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2); 4060146040Stjr if (BE (ret != REG_NOERROR, 0)) 4061146040Stjr return ret; 4062146040Stjr 4063146040Stjr if (mctx->state_log != NULL) 4064146040Stjr { 4065146040Stjr /* And double the length of state_log. */ 4066146040Stjr /* XXX We have no indication of the size of this buffer. If this 4067146040Stjr allocation fail we have no indication that the state_log array 4068146040Stjr does not have the right size. */ 4069146040Stjr re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *, 4070146040Stjr pstr->bufs_len + 1); 4071146040Stjr if (BE (new_array == NULL, 0)) 4072146040Stjr return REG_ESPACE; 4073146040Stjr mctx->state_log = new_array; 4074146040Stjr } 4075146040Stjr 4076146040Stjr /* Then reconstruct the buffers. */ 4077146040Stjr if (pstr->icase) 4078146040Stjr { 4079146040Stjr#ifdef RE_ENABLE_I18N 4080146040Stjr if (pstr->mb_cur_max > 1) 4081146040Stjr { 4082146040Stjr ret = build_wcs_upper_buffer (pstr); 4083146040Stjr if (BE (ret != REG_NOERROR, 0)) 4084146040Stjr return ret; 4085146040Stjr } 4086146040Stjr else 4087146040Stjr#endif /* RE_ENABLE_I18N */ 4088146040Stjr build_upper_buffer (pstr); 4089146040Stjr } 4090146040Stjr else 4091146040Stjr { 4092146040Stjr#ifdef RE_ENABLE_I18N 4093146040Stjr if (pstr->mb_cur_max > 1) 4094146040Stjr build_wcs_buffer (pstr); 4095146040Stjr else 4096146040Stjr#endif /* RE_ENABLE_I18N */ 4097146040Stjr { 4098146040Stjr if (pstr->trans != NULL) 4099146040Stjr re_string_translate_buffer (pstr); 4100146040Stjr } 4101146040Stjr } 4102146040Stjr return REG_NOERROR; 4103146040Stjr} 4104146040Stjr 4105146040Stjr 4106146040Stjr/* Functions for matching context. */ 4107146040Stjr 4108146040Stjr/* Initialize MCTX. */ 4109146040Stjr 4110146040Stjrstatic reg_errcode_t 4111146040Stjrmatch_ctx_init (mctx, eflags, n) 4112146040Stjr re_match_context_t *mctx; 4113146040Stjr int eflags, n; 4114146040Stjr{ 4115146040Stjr mctx->eflags = eflags; 4116146040Stjr mctx->match_last = -1; 4117146040Stjr if (n > 0) 4118146040Stjr { 4119146040Stjr mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n); 4120146040Stjr mctx->sub_tops = re_malloc (re_sub_match_top_t *, n); 4121146040Stjr if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0)) 4122146040Stjr return REG_ESPACE; 4123146040Stjr } 4124146040Stjr /* Already zero-ed by the caller. 4125146040Stjr else 4126146040Stjr mctx->bkref_ents = NULL; 4127146040Stjr mctx->nbkref_ents = 0; 4128146040Stjr mctx->nsub_tops = 0; */ 4129146040Stjr mctx->abkref_ents = n; 4130146040Stjr mctx->max_mb_elem_len = 1; 4131146040Stjr mctx->asub_tops = n; 4132146040Stjr return REG_NOERROR; 4133146040Stjr} 4134146040Stjr 4135146040Stjr/* Clean the entries which depend on the current input in MCTX. 4136146040Stjr This function must be invoked when the matcher changes the start index 4137146040Stjr of the input, or changes the input string. */ 4138146040Stjr 4139146040Stjrstatic void 4140146040Stjrmatch_ctx_clean (mctx) 4141146040Stjr re_match_context_t *mctx; 4142146040Stjr{ 4143146040Stjr int st_idx; 4144146040Stjr for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx) 4145146040Stjr { 4146146040Stjr int sl_idx; 4147146040Stjr re_sub_match_top_t *top = mctx->sub_tops[st_idx]; 4148146040Stjr for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx) 4149146040Stjr { 4150146040Stjr re_sub_match_last_t *last = top->lasts[sl_idx]; 4151146040Stjr re_free (last->path.array); 4152146040Stjr re_free (last); 4153146040Stjr } 4154146040Stjr re_free (top->lasts); 4155146040Stjr if (top->path) 4156146040Stjr { 4157146040Stjr re_free (top->path->array); 4158146040Stjr re_free (top->path); 4159146040Stjr } 4160146040Stjr free (top); 4161146040Stjr } 4162146040Stjr 4163146040Stjr mctx->nsub_tops = 0; 4164146040Stjr mctx->nbkref_ents = 0; 4165146040Stjr} 4166146040Stjr 4167146040Stjr/* Free all the memory associated with MCTX. */ 4168146040Stjr 4169146040Stjrstatic void 4170146040Stjrmatch_ctx_free (mctx) 4171146040Stjr re_match_context_t *mctx; 4172146040Stjr{ 4173146040Stjr /* First, free all the memory associated with MCTX->SUB_TOPS. */ 4174146040Stjr match_ctx_clean (mctx); 4175146040Stjr re_free (mctx->sub_tops); 4176146040Stjr re_free (mctx->bkref_ents); 4177146040Stjr} 4178146040Stjr 4179146040Stjr/* Add a new backreference entry to MCTX. 4180146040Stjr Note that we assume that caller never call this function with duplicate 4181146040Stjr entry, and call with STR_IDX which isn't smaller than any existing entry. 4182146040Stjr*/ 4183146040Stjr 4184146040Stjrstatic reg_errcode_t 4185146040Stjrmatch_ctx_add_entry (mctx, node, str_idx, from, to) 4186146040Stjr re_match_context_t *mctx; 4187146040Stjr int node, str_idx, from, to; 4188146040Stjr{ 4189146040Stjr if (mctx->nbkref_ents >= mctx->abkref_ents) 4190146040Stjr { 4191146040Stjr struct re_backref_cache_entry* new_entry; 4192146040Stjr new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry, 4193146040Stjr mctx->abkref_ents * 2); 4194146040Stjr if (BE (new_entry == NULL, 0)) 4195146040Stjr { 4196146040Stjr re_free (mctx->bkref_ents); 4197146040Stjr return REG_ESPACE; 4198146040Stjr } 4199146040Stjr mctx->bkref_ents = new_entry; 4200146040Stjr memset (mctx->bkref_ents + mctx->nbkref_ents, '\0', 4201146040Stjr sizeof (struct re_backref_cache_entry) * mctx->abkref_ents); 4202146040Stjr mctx->abkref_ents *= 2; 4203146040Stjr } 4204146040Stjr if (mctx->nbkref_ents > 0 4205146040Stjr && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx) 4206146040Stjr mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1; 4207146040Stjr 4208146040Stjr mctx->bkref_ents[mctx->nbkref_ents].node = node; 4209146040Stjr mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx; 4210146040Stjr mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from; 4211146040Stjr mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to; 4212146040Stjr 4213146040Stjr /* This is a cache that saves negative results of check_dst_limits_calc_pos. 4214146040Stjr If bit N is clear, means that this entry won't epsilon-transition to 4215146040Stjr an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If 4216146040Stjr it is set, check_dst_limits_calc_pos_1 will recurse and try to find one 4217146040Stjr such node. 4218146040Stjr 4219146040Stjr A backreference does not epsilon-transition unless it is empty, so set 4220146040Stjr to all zeros if FROM != TO. */ 4221146040Stjr mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map 4222146040Stjr = (from == to ? ~0 : 0); 4223146040Stjr 4224146040Stjr mctx->bkref_ents[mctx->nbkref_ents++].more = 0; 4225146040Stjr if (mctx->max_mb_elem_len < to - from) 4226146040Stjr mctx->max_mb_elem_len = to - from; 4227146040Stjr return REG_NOERROR; 4228146040Stjr} 4229146040Stjr 4230146040Stjr/* Search for the first entry which has the same str_idx, or -1 if none is 4231146040Stjr found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */ 4232146040Stjr 4233146040Stjrstatic int 4234146040Stjrsearch_cur_bkref_entry (mctx, str_idx) 4235146040Stjr re_match_context_t *mctx; 4236146040Stjr int str_idx; 4237146040Stjr{ 4238146040Stjr int left, right, mid, last; 4239146040Stjr last = right = mctx->nbkref_ents; 4240146040Stjr for (left = 0; left < right;) 4241146040Stjr { 4242146040Stjr mid = (left + right) / 2; 4243146040Stjr if (mctx->bkref_ents[mid].str_idx < str_idx) 4244146040Stjr left = mid + 1; 4245146040Stjr else 4246146040Stjr right = mid; 4247146040Stjr } 4248146040Stjr if (left < last && mctx->bkref_ents[left].str_idx == str_idx) 4249146040Stjr return left; 4250146040Stjr else 4251146040Stjr return -1; 4252146040Stjr} 4253146040Stjr 4254146040Stjr/* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches 4255146040Stjr at STR_IDX. */ 4256146040Stjr 4257146040Stjrstatic reg_errcode_t 4258146040Stjrmatch_ctx_add_subtop (mctx, node, str_idx) 4259146040Stjr re_match_context_t *mctx; 4260146040Stjr int node, str_idx; 4261146040Stjr{ 4262146040Stjr#ifdef DEBUG 4263146040Stjr assert (mctx->sub_tops != NULL); 4264146040Stjr assert (mctx->asub_tops > 0); 4265146040Stjr#endif 4266146040Stjr if (BE (mctx->nsub_tops == mctx->asub_tops, 0)) 4267146040Stjr { 4268146040Stjr int new_asub_tops = mctx->asub_tops * 2; 4269146040Stjr re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops, 4270146040Stjr re_sub_match_top_t *, 4271146040Stjr new_asub_tops); 4272146040Stjr if (BE (new_array == NULL, 0)) 4273146040Stjr return REG_ESPACE; 4274146040Stjr mctx->sub_tops = new_array; 4275146040Stjr mctx->asub_tops = new_asub_tops; 4276146040Stjr } 4277146040Stjr mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t)); 4278146040Stjr if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0)) 4279146040Stjr return REG_ESPACE; 4280146040Stjr mctx->sub_tops[mctx->nsub_tops]->node = node; 4281146040Stjr mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx; 4282146040Stjr return REG_NOERROR; 4283146040Stjr} 4284146040Stjr 4285146040Stjr/* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches 4286146040Stjr at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */ 4287146040Stjr 4288146040Stjrstatic re_sub_match_last_t * 4289146040Stjrmatch_ctx_add_sublast (subtop, node, str_idx) 4290146040Stjr re_sub_match_top_t *subtop; 4291146040Stjr int node, str_idx; 4292146040Stjr{ 4293146040Stjr re_sub_match_last_t *new_entry; 4294146040Stjr if (BE (subtop->nlasts == subtop->alasts, 0)) 4295146040Stjr { 4296146040Stjr int new_alasts = 2 * subtop->alasts + 1; 4297146040Stjr re_sub_match_last_t **new_array = re_realloc (subtop->lasts, 4298146040Stjr re_sub_match_last_t *, 4299146040Stjr new_alasts); 4300146040Stjr if (BE (new_array == NULL, 0)) 4301146040Stjr return NULL; 4302146040Stjr subtop->lasts = new_array; 4303146040Stjr subtop->alasts = new_alasts; 4304146040Stjr } 4305146040Stjr new_entry = calloc (1, sizeof (re_sub_match_last_t)); 4306146040Stjr if (BE (new_entry != NULL, 1)) 4307146040Stjr { 4308146040Stjr subtop->lasts[subtop->nlasts] = new_entry; 4309146040Stjr new_entry->node = node; 4310146040Stjr new_entry->str_idx = str_idx; 4311146040Stjr ++subtop->nlasts; 4312146040Stjr } 4313146040Stjr return new_entry; 4314146040Stjr} 4315146040Stjr 4316146040Stjrstatic void 4317146040Stjrsift_ctx_init (sctx, sifted_sts, limited_sts, last_node, last_str_idx) 4318146040Stjr re_sift_context_t *sctx; 4319146040Stjr re_dfastate_t **sifted_sts, **limited_sts; 4320146040Stjr int last_node, last_str_idx; 4321146040Stjr{ 4322146040Stjr sctx->sifted_states = sifted_sts; 4323146040Stjr sctx->limited_states = limited_sts; 4324146040Stjr sctx->last_node = last_node; 4325146040Stjr sctx->last_str_idx = last_str_idx; 4326146040Stjr re_node_set_init_empty (&sctx->limits); 4327146040Stjr} 4328