1146040Stjr/* Extended regular expression matching and search library. 2250724Sjkim Copyright (C) 2002-2007,2009,2010,2011,2012 Free Software Foundation, Inc. 3146040Stjr This file is part of the GNU C Library. 4146040Stjr Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. 5146040Stjr 6146040Stjr The GNU C Library is free software; you can redistribute it and/or 7146040Stjr modify it under the terms of the GNU Lesser General Public 8146040Stjr License as published by the Free Software Foundation; either 9146040Stjr version 2.1 of the License, or (at your option) any later version. 10146040Stjr 11146040Stjr The GNU C Library is distributed in the hope that it will be useful, 12146040Stjr but WITHOUT ANY WARRANTY; without even the implied warranty of 13146040Stjr MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14146040Stjr Lesser General Public License for more details. 15146040Stjr 16146040Stjr You should have received a copy of the GNU Lesser General Public 17250724Sjkim License along with the GNU C Library; if not, see 18250724Sjkim <http://www.gnu.org/licenses/>. */ 19146040Stjr 20146040Stjrstatic reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern, 21250724Sjkim size_t length, reg_syntax_t syntax); 22146040Stjrstatic void re_compile_fastmap_iter (regex_t *bufp, 23146040Stjr const re_dfastate_t *init_state, 24146040Stjr char *fastmap); 25250724Sjkimstatic reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len); 26146040Stjr#ifdef RE_ENABLE_I18N 27146040Stjrstatic void free_charset (re_charset_t *cset); 28146040Stjr#endif /* RE_ENABLE_I18N */ 29146040Stjrstatic void free_workarea_compile (regex_t *preg); 30146040Stjrstatic reg_errcode_t create_initial_state (re_dfa_t *dfa); 31146040Stjr#ifdef RE_ENABLE_I18N 32146040Stjrstatic void optimize_utf8 (re_dfa_t *dfa); 33146040Stjr#endif 34146040Stjrstatic reg_errcode_t analyze (regex_t *preg); 35146040Stjrstatic reg_errcode_t preorder (bin_tree_t *root, 36146040Stjr reg_errcode_t (fn (void *, bin_tree_t *)), 37146040Stjr void *extra); 38146040Stjrstatic reg_errcode_t postorder (bin_tree_t *root, 39146040Stjr reg_errcode_t (fn (void *, bin_tree_t *)), 40146040Stjr void *extra); 41146040Stjrstatic reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node); 42146040Stjrstatic reg_errcode_t lower_subexps (void *extra, bin_tree_t *node); 43146040Stjrstatic bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg, 44146040Stjr bin_tree_t *node); 45146040Stjrstatic reg_errcode_t calc_first (void *extra, bin_tree_t *node); 46146040Stjrstatic reg_errcode_t calc_next (void *extra, bin_tree_t *node); 47146040Stjrstatic reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node); 48250724Sjkimstatic int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint); 49250724Sjkimstatic int search_duplicated_node (const re_dfa_t *dfa, int org_node, 50146040Stjr unsigned int constraint); 51146040Stjrstatic reg_errcode_t calc_eclosure (re_dfa_t *dfa); 52146040Stjrstatic reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, 53146040Stjr int node, int root); 54146040Stjrstatic reg_errcode_t calc_inveclosure (re_dfa_t *dfa); 55146040Stjrstatic int fetch_number (re_string_t *input, re_token_t *token, 56146040Stjr reg_syntax_t syntax); 57146040Stjrstatic int peek_token (re_token_t *token, re_string_t *input, 58250724Sjkim reg_syntax_t syntax) internal_function; 59146040Stjrstatic bin_tree_t *parse (re_string_t *regexp, regex_t *preg, 60146040Stjr reg_syntax_t syntax, reg_errcode_t *err); 61146040Stjrstatic bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg, 62146040Stjr re_token_t *token, reg_syntax_t syntax, 63146040Stjr int nest, reg_errcode_t *err); 64146040Stjrstatic bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg, 65146040Stjr re_token_t *token, reg_syntax_t syntax, 66146040Stjr int nest, reg_errcode_t *err); 67146040Stjrstatic bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg, 68146040Stjr re_token_t *token, reg_syntax_t syntax, 69146040Stjr int nest, reg_errcode_t *err); 70146040Stjrstatic bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg, 71146040Stjr re_token_t *token, reg_syntax_t syntax, 72146040Stjr int nest, reg_errcode_t *err); 73146040Stjrstatic bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp, 74146040Stjr re_dfa_t *dfa, re_token_t *token, 75146040Stjr reg_syntax_t syntax, reg_errcode_t *err); 76146040Stjrstatic bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, 77146040Stjr re_token_t *token, reg_syntax_t syntax, 78146040Stjr reg_errcode_t *err); 79146040Stjrstatic reg_errcode_t parse_bracket_element (bracket_elem_t *elem, 80146040Stjr re_string_t *regexp, 81146040Stjr re_token_t *token, int token_len, 82146040Stjr re_dfa_t *dfa, 83146040Stjr reg_syntax_t syntax, 84146040Stjr int accept_hyphen); 85146040Stjrstatic reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem, 86146040Stjr re_string_t *regexp, 87146040Stjr re_token_t *token); 88146040Stjr#ifdef RE_ENABLE_I18N 89250724Sjkimstatic reg_errcode_t build_equiv_class (bitset_t sbcset, 90146040Stjr re_charset_t *mbcset, 91146040Stjr int *equiv_class_alloc, 92146040Stjr const unsigned char *name); 93250724Sjkimstatic reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans, 94250724Sjkim bitset_t sbcset, 95146040Stjr re_charset_t *mbcset, 96146040Stjr int *char_class_alloc, 97146040Stjr const unsigned char *class_name, 98146040Stjr reg_syntax_t syntax); 99146040Stjr#else /* not RE_ENABLE_I18N */ 100250724Sjkimstatic reg_errcode_t build_equiv_class (bitset_t sbcset, 101146040Stjr const unsigned char *name); 102250724Sjkimstatic reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans, 103250724Sjkim bitset_t sbcset, 104146040Stjr const unsigned char *class_name, 105146040Stjr reg_syntax_t syntax); 106146040Stjr#endif /* not RE_ENABLE_I18N */ 107146040Stjrstatic bin_tree_t *build_charclass_op (re_dfa_t *dfa, 108250724Sjkim RE_TRANSLATE_TYPE trans, 109146040Stjr const unsigned char *class_name, 110146040Stjr const unsigned char *extra, 111146040Stjr int non_match, reg_errcode_t *err); 112146040Stjrstatic bin_tree_t *create_tree (re_dfa_t *dfa, 113146040Stjr bin_tree_t *left, bin_tree_t *right, 114146040Stjr re_token_type_t type); 115146040Stjrstatic bin_tree_t *create_token_tree (re_dfa_t *dfa, 116146040Stjr bin_tree_t *left, bin_tree_t *right, 117146040Stjr const re_token_t *token); 118146040Stjrstatic bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa); 119146040Stjrstatic void free_token (re_token_t *node); 120146040Stjrstatic reg_errcode_t free_tree (void *extra, bin_tree_t *node); 121146040Stjrstatic reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node); 122146040Stjr 123146040Stjr/* This table gives an error message for each of the error codes listed 124146040Stjr in regex.h. Obviously the order here has to be same as there. 125146040Stjr POSIX doesn't require that we do anything for REG_NOERROR, 126146040Stjr but why not be nice? */ 127146040Stjr 128146040Stjrconst char __re_error_msgid[] attribute_hidden = 129146040Stjr { 130146040Stjr#define REG_NOERROR_IDX 0 131146040Stjr gettext_noop ("Success") /* REG_NOERROR */ 132146040Stjr "\0" 133146040Stjr#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success") 134146040Stjr gettext_noop ("No match") /* REG_NOMATCH */ 135146040Stjr "\0" 136146040Stjr#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match") 137146040Stjr gettext_noop ("Invalid regular expression") /* REG_BADPAT */ 138146040Stjr "\0" 139146040Stjr#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression") 140146040Stjr gettext_noop ("Invalid collation character") /* REG_ECOLLATE */ 141146040Stjr "\0" 142146040Stjr#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character") 143146040Stjr gettext_noop ("Invalid character class name") /* REG_ECTYPE */ 144146040Stjr "\0" 145146040Stjr#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name") 146146040Stjr gettext_noop ("Trailing backslash") /* REG_EESCAPE */ 147146040Stjr "\0" 148146040Stjr#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash") 149146040Stjr gettext_noop ("Invalid back reference") /* REG_ESUBREG */ 150146040Stjr "\0" 151146040Stjr#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference") 152146040Stjr gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */ 153146040Stjr "\0" 154146040Stjr#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^") 155146040Stjr gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */ 156146040Stjr "\0" 157146040Stjr#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(") 158146040Stjr gettext_noop ("Unmatched \\{") /* REG_EBRACE */ 159146040Stjr "\0" 160146040Stjr#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{") 161146040Stjr gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */ 162146040Stjr "\0" 163146040Stjr#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}") 164146040Stjr gettext_noop ("Invalid range end") /* REG_ERANGE */ 165146040Stjr "\0" 166146040Stjr#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end") 167146040Stjr gettext_noop ("Memory exhausted") /* REG_ESPACE */ 168146040Stjr "\0" 169146040Stjr#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted") 170146040Stjr gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */ 171146040Stjr "\0" 172146040Stjr#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression") 173146040Stjr gettext_noop ("Premature end of regular expression") /* REG_EEND */ 174146040Stjr "\0" 175146040Stjr#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression") 176146040Stjr gettext_noop ("Regular expression too big") /* REG_ESIZE */ 177146040Stjr "\0" 178146040Stjr#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big") 179146040Stjr gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */ 180146040Stjr }; 181146040Stjr 182146040Stjrconst size_t __re_error_msgid_idx[] attribute_hidden = 183146040Stjr { 184146040Stjr REG_NOERROR_IDX, 185146040Stjr REG_NOMATCH_IDX, 186146040Stjr REG_BADPAT_IDX, 187146040Stjr REG_ECOLLATE_IDX, 188146040Stjr REG_ECTYPE_IDX, 189146040Stjr REG_EESCAPE_IDX, 190146040Stjr REG_ESUBREG_IDX, 191146040Stjr REG_EBRACK_IDX, 192146040Stjr REG_EPAREN_IDX, 193146040Stjr REG_EBRACE_IDX, 194146040Stjr REG_BADBR_IDX, 195146040Stjr REG_ERANGE_IDX, 196146040Stjr REG_ESPACE_IDX, 197146040Stjr REG_BADRPT_IDX, 198146040Stjr REG_EEND_IDX, 199146040Stjr REG_ESIZE_IDX, 200146040Stjr REG_ERPAREN_IDX 201146040Stjr }; 202146040Stjr 203146040Stjr/* Entry points for GNU code. */ 204146040Stjr 205146040Stjr/* re_compile_pattern is the GNU regular expression compiler: it 206146040Stjr compiles PATTERN (of length LENGTH) and puts the result in BUFP. 207146040Stjr Returns 0 if the pattern was valid, otherwise an error string. 208146040Stjr 209146040Stjr Assumes the `allocated' (and perhaps `buffer') and `translate' fields 210146040Stjr are set in BUFP on entry. */ 211146040Stjr 212146040Stjrconst char * 213146040Stjrre_compile_pattern (pattern, length, bufp) 214146040Stjr const char *pattern; 215146040Stjr size_t length; 216146040Stjr struct re_pattern_buffer *bufp; 217146040Stjr{ 218146040Stjr reg_errcode_t ret; 219146040Stjr 220146040Stjr /* And GNU code determines whether or not to get register information 221146040Stjr by passing null for the REGS argument to re_match, etc., not by 222146040Stjr setting no_sub, unless RE_NO_SUB is set. */ 223146040Stjr bufp->no_sub = !!(re_syntax_options & RE_NO_SUB); 224146040Stjr 225146040Stjr /* Match anchors at newline. */ 226146040Stjr bufp->newline_anchor = 1; 227146040Stjr 228146040Stjr ret = re_compile_internal (bufp, pattern, length, re_syntax_options); 229146040Stjr 230146040Stjr if (!ret) 231146040Stjr return NULL; 232146040Stjr return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]); 233146040Stjr} 234146040Stjr#ifdef _LIBC 235146040Stjrweak_alias (__re_compile_pattern, re_compile_pattern) 236146040Stjr#endif 237146040Stjr 238146040Stjr/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can 239146040Stjr also be assigned to arbitrarily: each pattern buffer stores its own 240146040Stjr syntax, so it can be changed between regex compilations. */ 241146040Stjr/* This has no initializer because initialized variables in Emacs 242146040Stjr become read-only after dumping. */ 243146040Stjrreg_syntax_t re_syntax_options; 244146040Stjr 245146040Stjr 246146040Stjr/* Specify the precise syntax of regexps for compilation. This provides 247146040Stjr for compatibility for various utilities which historically have 248146040Stjr different, incompatible syntaxes. 249146040Stjr 250146040Stjr The argument SYNTAX is a bit mask comprised of the various bits 251146040Stjr defined in regex.h. We return the old syntax. */ 252146040Stjr 253146040Stjrreg_syntax_t 254146040Stjrre_set_syntax (syntax) 255146040Stjr reg_syntax_t syntax; 256146040Stjr{ 257146040Stjr reg_syntax_t ret = re_syntax_options; 258146040Stjr 259146040Stjr re_syntax_options = syntax; 260146040Stjr return ret; 261146040Stjr} 262146040Stjr#ifdef _LIBC 263146040Stjrweak_alias (__re_set_syntax, re_set_syntax) 264146040Stjr#endif 265146040Stjr 266146040Stjrint 267146040Stjrre_compile_fastmap (bufp) 268146040Stjr struct re_pattern_buffer *bufp; 269146040Stjr{ 270146040Stjr re_dfa_t *dfa = (re_dfa_t *) bufp->buffer; 271146040Stjr char *fastmap = bufp->fastmap; 272146040Stjr 273146040Stjr memset (fastmap, '\0', sizeof (char) * SBC_MAX); 274146040Stjr re_compile_fastmap_iter (bufp, dfa->init_state, fastmap); 275146040Stjr if (dfa->init_state != dfa->init_state_word) 276146040Stjr re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap); 277146040Stjr if (dfa->init_state != dfa->init_state_nl) 278146040Stjr re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap); 279146040Stjr if (dfa->init_state != dfa->init_state_begbuf) 280146040Stjr re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap); 281146040Stjr bufp->fastmap_accurate = 1; 282146040Stjr return 0; 283146040Stjr} 284146040Stjr#ifdef _LIBC 285146040Stjrweak_alias (__re_compile_fastmap, re_compile_fastmap) 286146040Stjr#endif 287146040Stjr 288146040Stjrstatic inline void 289146040Stjr__attribute ((always_inline)) 290146040Stjrre_set_fastmap (char *fastmap, int icase, int ch) 291146040Stjr{ 292146040Stjr fastmap[ch] = 1; 293146040Stjr if (icase) 294146040Stjr fastmap[tolower (ch)] = 1; 295146040Stjr} 296146040Stjr 297146040Stjr/* Helper function for re_compile_fastmap. 298146040Stjr Compile fastmap for the initial_state INIT_STATE. */ 299146040Stjr 300146040Stjrstatic void 301250724Sjkimre_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, 302250724Sjkim char *fastmap) 303146040Stjr{ 304146040Stjr re_dfa_t *dfa = (re_dfa_t *) bufp->buffer; 305146040Stjr int node_cnt; 306146040Stjr int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE)); 307146040Stjr for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt) 308146040Stjr { 309146040Stjr int node = init_state->nodes.elems[node_cnt]; 310146040Stjr re_token_type_t type = dfa->nodes[node].type; 311146040Stjr 312146040Stjr if (type == CHARACTER) 313146040Stjr { 314146040Stjr re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c); 315146040Stjr#ifdef RE_ENABLE_I18N 316146040Stjr if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) 317146040Stjr { 318146040Stjr unsigned char *buf = alloca (dfa->mb_cur_max), *p; 319146040Stjr wchar_t wc; 320146040Stjr mbstate_t state; 321146040Stjr 322146040Stjr p = buf; 323146040Stjr *p++ = dfa->nodes[node].opr.c; 324146040Stjr while (++node < dfa->nodes_len 325146040Stjr && dfa->nodes[node].type == CHARACTER 326146040Stjr && dfa->nodes[node].mb_partial) 327146040Stjr *p++ = dfa->nodes[node].opr.c; 328250724Sjkim memset (&state, '\0', sizeof (state)); 329250724Sjkim if (__mbrtowc (&wc, (const char *) buf, p - buf, 330250724Sjkim &state) == p - buf 331146040Stjr && (__wcrtomb ((char *) buf, towlower (wc), &state) 332146040Stjr != (size_t) -1)) 333146040Stjr re_set_fastmap (fastmap, 0, buf[0]); 334146040Stjr } 335146040Stjr#endif 336146040Stjr } 337146040Stjr else if (type == SIMPLE_BRACKET) 338146040Stjr { 339250724Sjkim int i, ch; 340250724Sjkim for (i = 0, ch = 0; i < BITSET_WORDS; ++i) 341250724Sjkim { 342250724Sjkim int j; 343250724Sjkim bitset_word_t w = dfa->nodes[node].opr.sbcset[i]; 344250724Sjkim for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) 345250724Sjkim if (w & ((bitset_word_t) 1 << j)) 346250724Sjkim re_set_fastmap (fastmap, icase, ch); 347250724Sjkim } 348146040Stjr } 349146040Stjr#ifdef RE_ENABLE_I18N 350146040Stjr else if (type == COMPLEX_BRACKET) 351146040Stjr { 352250724Sjkim re_charset_t *cset = dfa->nodes[node].opr.mbcset; 353146040Stjr int i; 354250724Sjkim 355146040Stjr# ifdef _LIBC 356250724Sjkim /* See if we have to try all bytes which start multiple collation 357250724Sjkim elements. 358250724Sjkim e.g. In da_DK, we want to catch 'a' since "aa" is a valid 359250724Sjkim collation element, and don't catch 'b' since 'b' is 360250724Sjkim the only collation element which starts from 'b' (and 361250724Sjkim it is caught by SIMPLE_BRACKET). */ 362250724Sjkim if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0 363250724Sjkim && (cset->ncoll_syms || cset->nranges)) 364146040Stjr { 365146040Stjr const int32_t *table = (const int32_t *) 366146040Stjr _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); 367250724Sjkim for (i = 0; i < SBC_MAX; ++i) 368250724Sjkim if (table[i] < 0) 369250724Sjkim re_set_fastmap (fastmap, icase, i); 370146040Stjr } 371250724Sjkim# endif /* _LIBC */ 372250724Sjkim 373250724Sjkim /* See if we have to start the match at all multibyte characters, 374250724Sjkim i.e. where we would not find an invalid sequence. This only 375250724Sjkim applies to multibyte character sets; for single byte character 376250724Sjkim sets, the SIMPLE_BRACKET again suffices. */ 377250724Sjkim if (dfa->mb_cur_max > 1 378250724Sjkim && (cset->nchar_classes || cset->non_match || cset->nranges 379250724Sjkim# ifdef _LIBC 380250724Sjkim || cset->nequiv_classes 381250724Sjkim# endif /* _LIBC */ 382250724Sjkim )) 383250724Sjkim { 384250724Sjkim unsigned char c = 0; 385250724Sjkim do 386250724Sjkim { 387250724Sjkim mbstate_t mbs; 388250724Sjkim memset (&mbs, 0, sizeof (mbs)); 389250724Sjkim if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2) 390250724Sjkim re_set_fastmap (fastmap, false, (int) c); 391250724Sjkim } 392250724Sjkim while (++c != 0); 393146040Stjr } 394250724Sjkim 395250724Sjkim else 396146040Stjr { 397250724Sjkim /* ... Else catch all bytes which can start the mbchars. */ 398250724Sjkim for (i = 0; i < cset->nmbchars; ++i) 399146040Stjr { 400250724Sjkim char buf[256]; 401250724Sjkim mbstate_t state; 402250724Sjkim memset (&state, '\0', sizeof (state)); 403250724Sjkim if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1) 404250724Sjkim re_set_fastmap (fastmap, icase, *(unsigned char *) buf); 405250724Sjkim if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) 406250724Sjkim { 407250724Sjkim if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state) 408250724Sjkim != (size_t) -1) 409250724Sjkim re_set_fastmap (fastmap, false, *(unsigned char *) buf); 410250724Sjkim } 411146040Stjr } 412146040Stjr } 413146040Stjr } 414146040Stjr#endif /* RE_ENABLE_I18N */ 415146040Stjr else if (type == OP_PERIOD 416146040Stjr#ifdef RE_ENABLE_I18N 417146040Stjr || type == OP_UTF8_PERIOD 418146040Stjr#endif /* RE_ENABLE_I18N */ 419146040Stjr || type == END_OF_RE) 420146040Stjr { 421146040Stjr memset (fastmap, '\1', sizeof (char) * SBC_MAX); 422146040Stjr if (type == END_OF_RE) 423146040Stjr bufp->can_be_null = 1; 424146040Stjr return; 425146040Stjr } 426146040Stjr } 427146040Stjr} 428146040Stjr 429146040Stjr/* Entry point for POSIX code. */ 430146040Stjr/* regcomp takes a regular expression as a string and compiles it. 431146040Stjr 432146040Stjr PREG is a regex_t *. We do not expect any fields to be initialized, 433146040Stjr since POSIX says we shouldn't. Thus, we set 434146040Stjr 435146040Stjr `buffer' to the compiled pattern; 436146040Stjr `used' to the length of the compiled pattern; 437146040Stjr `syntax' to RE_SYNTAX_POSIX_EXTENDED if the 438146040Stjr REG_EXTENDED bit in CFLAGS is set; otherwise, to 439146040Stjr RE_SYNTAX_POSIX_BASIC; 440146040Stjr `newline_anchor' to REG_NEWLINE being set in CFLAGS; 441146040Stjr `fastmap' to an allocated space for the fastmap; 442146040Stjr `fastmap_accurate' to zero; 443146040Stjr `re_nsub' to the number of subexpressions in PATTERN. 444146040Stjr 445146040Stjr PATTERN is the address of the pattern string. 446146040Stjr 447146040Stjr CFLAGS is a series of bits which affect compilation. 448146040Stjr 449146040Stjr If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we 450146040Stjr use POSIX basic syntax. 451146040Stjr 452146040Stjr If REG_NEWLINE is set, then . and [^...] don't match newline. 453146040Stjr Also, regexec will try a match beginning after every newline. 454146040Stjr 455146040Stjr If REG_ICASE is set, then we considers upper- and lowercase 456146040Stjr versions of letters to be equivalent when matching. 457146040Stjr 458146040Stjr If REG_NOSUB is set, then when PREG is passed to regexec, that 459146040Stjr routine will report only success or failure, and nothing about the 460146040Stjr registers. 461146040Stjr 462146040Stjr It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for 463146040Stjr the return codes and their meanings.) */ 464146040Stjr 465146040Stjrint 466146040Stjrregcomp (preg, pattern, cflags) 467146040Stjr regex_t *__restrict preg; 468146040Stjr const char *__restrict pattern; 469146040Stjr int cflags; 470146040Stjr{ 471146040Stjr reg_errcode_t ret; 472146040Stjr reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED 473146040Stjr : RE_SYNTAX_POSIX_BASIC); 474146040Stjr 475146040Stjr preg->buffer = NULL; 476146040Stjr preg->allocated = 0; 477146040Stjr preg->used = 0; 478146040Stjr 479146040Stjr /* Try to allocate space for the fastmap. */ 480146040Stjr preg->fastmap = re_malloc (char, SBC_MAX); 481146040Stjr if (BE (preg->fastmap == NULL, 0)) 482146040Stjr return REG_ESPACE; 483146040Stjr 484146040Stjr syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0; 485146040Stjr 486146040Stjr /* If REG_NEWLINE is set, newlines are treated differently. */ 487146040Stjr if (cflags & REG_NEWLINE) 488146040Stjr { /* REG_NEWLINE implies neither . nor [^...] match newline. */ 489146040Stjr syntax &= ~RE_DOT_NEWLINE; 490146040Stjr syntax |= RE_HAT_LISTS_NOT_NEWLINE; 491146040Stjr /* It also changes the matching behavior. */ 492146040Stjr preg->newline_anchor = 1; 493146040Stjr } 494146040Stjr else 495146040Stjr preg->newline_anchor = 0; 496146040Stjr preg->no_sub = !!(cflags & REG_NOSUB); 497146040Stjr preg->translate = NULL; 498146040Stjr 499146040Stjr ret = re_compile_internal (preg, pattern, strlen (pattern), syntax); 500146040Stjr 501146040Stjr /* POSIX doesn't distinguish between an unmatched open-group and an 502146040Stjr unmatched close-group: both are REG_EPAREN. */ 503146040Stjr if (ret == REG_ERPAREN) 504146040Stjr ret = REG_EPAREN; 505146040Stjr 506146040Stjr /* We have already checked preg->fastmap != NULL. */ 507146040Stjr if (BE (ret == REG_NOERROR, 1)) 508146040Stjr /* Compute the fastmap now, since regexec cannot modify the pattern 509146040Stjr buffer. This function never fails in this implementation. */ 510146040Stjr (void) re_compile_fastmap (preg); 511146040Stjr else 512146040Stjr { 513146040Stjr /* Some error occurred while compiling the expression. */ 514146040Stjr re_free (preg->fastmap); 515146040Stjr preg->fastmap = NULL; 516146040Stjr } 517146040Stjr 518146040Stjr return (int) ret; 519146040Stjr} 520146040Stjr#ifdef _LIBC 521146040Stjrweak_alias (__regcomp, regcomp) 522146040Stjr#endif 523146040Stjr 524146040Stjr/* Returns a message corresponding to an error code, ERRCODE, returned 525146040Stjr from either regcomp or regexec. We don't use PREG here. */ 526146040Stjr 527146040Stjrsize_t 528146040Stjrregerror (errcode, preg, errbuf, errbuf_size) 529146040Stjr int errcode; 530250724Sjkim const regex_t *__restrict preg; 531250724Sjkim char *__restrict errbuf; 532146040Stjr size_t errbuf_size; 533146040Stjr{ 534146040Stjr const char *msg; 535146040Stjr size_t msg_size; 536146040Stjr 537146040Stjr if (BE (errcode < 0 538146040Stjr || errcode >= (int) (sizeof (__re_error_msgid_idx) 539146040Stjr / sizeof (__re_error_msgid_idx[0])), 0)) 540146040Stjr /* Only error codes returned by the rest of the code should be passed 541146040Stjr to this routine. If we are given anything else, or if other regex 542146040Stjr code generates an invalid error code, then the program has a bug. 543146040Stjr Dump core so we can fix it. */ 544146040Stjr abort (); 545146040Stjr 546146040Stjr msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]); 547146040Stjr 548146040Stjr msg_size = strlen (msg) + 1; /* Includes the null. */ 549146040Stjr 550146040Stjr if (BE (errbuf_size != 0, 1)) 551146040Stjr { 552146040Stjr if (BE (msg_size > errbuf_size, 0)) 553146040Stjr { 554146040Stjr#if defined HAVE_MEMPCPY || defined _LIBC 555146040Stjr *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0'; 556146040Stjr#else 557146040Stjr memcpy (errbuf, msg, errbuf_size - 1); 558146040Stjr errbuf[errbuf_size - 1] = 0; 559146040Stjr#endif 560146040Stjr } 561146040Stjr else 562146040Stjr memcpy (errbuf, msg, msg_size); 563146040Stjr } 564146040Stjr 565146040Stjr return msg_size; 566146040Stjr} 567146040Stjr#ifdef _LIBC 568146040Stjrweak_alias (__regerror, regerror) 569146040Stjr#endif 570146040Stjr 571146040Stjr 572146040Stjr#ifdef RE_ENABLE_I18N 573146040Stjr/* This static array is used for the map to single-byte characters when 574146040Stjr UTF-8 is used. Otherwise we would allocate memory just to initialize 575146040Stjr it the same all the time. UTF-8 is the preferred encoding so this is 576146040Stjr a worthwhile optimization. */ 577250724Sjkimstatic const bitset_t utf8_sb_map = 578146040Stjr{ 579146040Stjr /* Set the first 128 bits. */ 580250724Sjkim [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX 581146040Stjr}; 582146040Stjr#endif 583146040Stjr 584146040Stjr 585146040Stjrstatic void 586146040Stjrfree_dfa_content (re_dfa_t *dfa) 587146040Stjr{ 588146040Stjr int i, j; 589146040Stjr 590146040Stjr if (dfa->nodes) 591146040Stjr for (i = 0; i < dfa->nodes_len; ++i) 592146040Stjr free_token (dfa->nodes + i); 593146040Stjr re_free (dfa->nexts); 594146040Stjr for (i = 0; i < dfa->nodes_len; ++i) 595146040Stjr { 596146040Stjr if (dfa->eclosures != NULL) 597146040Stjr re_node_set_free (dfa->eclosures + i); 598146040Stjr if (dfa->inveclosures != NULL) 599146040Stjr re_node_set_free (dfa->inveclosures + i); 600146040Stjr if (dfa->edests != NULL) 601146040Stjr re_node_set_free (dfa->edests + i); 602146040Stjr } 603146040Stjr re_free (dfa->edests); 604146040Stjr re_free (dfa->eclosures); 605146040Stjr re_free (dfa->inveclosures); 606146040Stjr re_free (dfa->nodes); 607146040Stjr 608146040Stjr if (dfa->state_table) 609146040Stjr for (i = 0; i <= dfa->state_hash_mask; ++i) 610146040Stjr { 611146040Stjr struct re_state_table_entry *entry = dfa->state_table + i; 612146040Stjr for (j = 0; j < entry->num; ++j) 613146040Stjr { 614146040Stjr re_dfastate_t *state = entry->array[j]; 615146040Stjr free_state (state); 616146040Stjr } 617250724Sjkim re_free (entry->array); 618146040Stjr } 619146040Stjr re_free (dfa->state_table); 620146040Stjr#ifdef RE_ENABLE_I18N 621146040Stjr if (dfa->sb_char != utf8_sb_map) 622146040Stjr re_free (dfa->sb_char); 623146040Stjr#endif 624146040Stjr re_free (dfa->subexp_map); 625146040Stjr#ifdef DEBUG 626146040Stjr re_free (dfa->re_str); 627146040Stjr#endif 628146040Stjr 629146040Stjr re_free (dfa); 630146040Stjr} 631146040Stjr 632146040Stjr 633146040Stjr/* Free dynamically allocated space used by PREG. */ 634146040Stjr 635146040Stjrvoid 636146040Stjrregfree (preg) 637146040Stjr regex_t *preg; 638146040Stjr{ 639146040Stjr re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 640146040Stjr if (BE (dfa != NULL, 1)) 641146040Stjr free_dfa_content (dfa); 642146040Stjr preg->buffer = NULL; 643146040Stjr preg->allocated = 0; 644146040Stjr 645146040Stjr re_free (preg->fastmap); 646146040Stjr preg->fastmap = NULL; 647146040Stjr 648146040Stjr re_free (preg->translate); 649146040Stjr preg->translate = NULL; 650146040Stjr} 651146040Stjr#ifdef _LIBC 652146040Stjrweak_alias (__regfree, regfree) 653146040Stjr#endif 654146040Stjr 655146040Stjr/* Entry points compatible with 4.2 BSD regex library. We don't define 656146040Stjr them unless specifically requested. */ 657146040Stjr 658146040Stjr#if defined _REGEX_RE_COMP || defined _LIBC 659146040Stjr 660146040Stjr/* BSD has one and only one pattern buffer. */ 661146040Stjrstatic struct re_pattern_buffer re_comp_buf; 662146040Stjr 663146040Stjrchar * 664146040Stjr# ifdef _LIBC 665146040Stjr/* Make these definitions weak in libc, so POSIX programs can redefine 666146040Stjr these names if they don't use our functions, and still use 667146040Stjr regcomp/regexec above without link errors. */ 668146040Stjrweak_function 669146040Stjr# endif 670146040Stjrre_comp (s) 671146040Stjr const char *s; 672146040Stjr{ 673146040Stjr reg_errcode_t ret; 674146040Stjr char *fastmap; 675146040Stjr 676146040Stjr if (!s) 677146040Stjr { 678146040Stjr if (!re_comp_buf.buffer) 679146040Stjr return gettext ("No previous regular expression"); 680146040Stjr return 0; 681146040Stjr } 682146040Stjr 683146040Stjr if (re_comp_buf.buffer) 684146040Stjr { 685146040Stjr fastmap = re_comp_buf.fastmap; 686146040Stjr re_comp_buf.fastmap = NULL; 687146040Stjr __regfree (&re_comp_buf); 688146040Stjr memset (&re_comp_buf, '\0', sizeof (re_comp_buf)); 689146040Stjr re_comp_buf.fastmap = fastmap; 690146040Stjr } 691146040Stjr 692146040Stjr if (re_comp_buf.fastmap == NULL) 693146040Stjr { 694146040Stjr re_comp_buf.fastmap = (char *) malloc (SBC_MAX); 695146040Stjr if (re_comp_buf.fastmap == NULL) 696146040Stjr return (char *) gettext (__re_error_msgid 697146040Stjr + __re_error_msgid_idx[(int) REG_ESPACE]); 698146040Stjr } 699146040Stjr 700146040Stjr /* Since `re_exec' always passes NULL for the `regs' argument, we 701146040Stjr don't need to initialize the pattern buffer fields which affect it. */ 702146040Stjr 703146040Stjr /* Match anchors at newlines. */ 704146040Stjr re_comp_buf.newline_anchor = 1; 705146040Stjr 706146040Stjr ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options); 707146040Stjr 708146040Stjr if (!ret) 709146040Stjr return NULL; 710146040Stjr 711146040Stjr /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */ 712146040Stjr return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]); 713146040Stjr} 714146040Stjr 715146040Stjr#ifdef _LIBC 716146040Stjrlibc_freeres_fn (free_mem) 717146040Stjr{ 718146040Stjr __regfree (&re_comp_buf); 719146040Stjr} 720146040Stjr#endif 721146040Stjr 722146040Stjr#endif /* _REGEX_RE_COMP */ 723146040Stjr 724146040Stjr/* Internal entry point. 725146040Stjr Compile the regular expression PATTERN, whose length is LENGTH. 726146040Stjr SYNTAX indicate regular expression's syntax. */ 727146040Stjr 728146040Stjrstatic reg_errcode_t 729250724Sjkimre_compile_internal (regex_t *preg, const char * pattern, size_t length, 730250724Sjkim reg_syntax_t syntax) 731146040Stjr{ 732146040Stjr reg_errcode_t err = REG_NOERROR; 733146040Stjr re_dfa_t *dfa; 734146040Stjr re_string_t regexp; 735146040Stjr 736146040Stjr /* Initialize the pattern buffer. */ 737146040Stjr preg->fastmap_accurate = 0; 738146040Stjr preg->syntax = syntax; 739146040Stjr preg->not_bol = preg->not_eol = 0; 740146040Stjr preg->used = 0; 741146040Stjr preg->re_nsub = 0; 742146040Stjr preg->can_be_null = 0; 743146040Stjr preg->regs_allocated = REGS_UNALLOCATED; 744146040Stjr 745146040Stjr /* Initialize the dfa. */ 746146040Stjr dfa = (re_dfa_t *) preg->buffer; 747146040Stjr if (BE (preg->allocated < sizeof (re_dfa_t), 0)) 748146040Stjr { 749146040Stjr /* If zero allocated, but buffer is non-null, try to realloc 750146040Stjr enough space. This loses if buffer's address is bogus, but 751146040Stjr that is the user's responsibility. If ->buffer is NULL this 752146040Stjr is a simple allocation. */ 753146040Stjr dfa = re_realloc (preg->buffer, re_dfa_t, 1); 754146040Stjr if (dfa == NULL) 755146040Stjr return REG_ESPACE; 756146040Stjr preg->allocated = sizeof (re_dfa_t); 757146040Stjr preg->buffer = (unsigned char *) dfa; 758146040Stjr } 759146040Stjr preg->used = sizeof (re_dfa_t); 760146040Stjr 761146040Stjr err = init_dfa (dfa, length); 762146040Stjr if (BE (err != REG_NOERROR, 0)) 763146040Stjr { 764146040Stjr free_dfa_content (dfa); 765146040Stjr preg->buffer = NULL; 766146040Stjr preg->allocated = 0; 767146040Stjr return err; 768146040Stjr } 769146040Stjr#ifdef DEBUG 770250724Sjkim /* Note: length+1 will not overflow since it is checked in init_dfa. */ 771146040Stjr dfa->re_str = re_malloc (char, length + 1); 772146040Stjr strncpy (dfa->re_str, pattern, length + 1); 773146040Stjr#endif 774146040Stjr 775250724Sjkim __libc_lock_init (dfa->lock); 776250724Sjkim 777146040Stjr err = re_string_construct (®exp, pattern, length, preg->translate, 778146040Stjr syntax & RE_ICASE, dfa); 779146040Stjr if (BE (err != REG_NOERROR, 0)) 780146040Stjr { 781146040Stjr re_compile_internal_free_return: 782146040Stjr free_workarea_compile (preg); 783146040Stjr re_string_destruct (®exp); 784146040Stjr free_dfa_content (dfa); 785146040Stjr preg->buffer = NULL; 786146040Stjr preg->allocated = 0; 787146040Stjr return err; 788146040Stjr } 789146040Stjr 790146040Stjr /* Parse the regular expression, and build a structure tree. */ 791146040Stjr preg->re_nsub = 0; 792146040Stjr dfa->str_tree = parse (®exp, preg, syntax, &err); 793146040Stjr if (BE (dfa->str_tree == NULL, 0)) 794146040Stjr goto re_compile_internal_free_return; 795146040Stjr 796146040Stjr /* Analyze the tree and create the nfa. */ 797146040Stjr err = analyze (preg); 798146040Stjr if (BE (err != REG_NOERROR, 0)) 799146040Stjr goto re_compile_internal_free_return; 800146040Stjr 801146040Stjr#ifdef RE_ENABLE_I18N 802146040Stjr /* If possible, do searching in single byte encoding to speed things up. */ 803146040Stjr if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL) 804146040Stjr optimize_utf8 (dfa); 805146040Stjr#endif 806146040Stjr 807146040Stjr /* Then create the initial state of the dfa. */ 808146040Stjr err = create_initial_state (dfa); 809146040Stjr 810146040Stjr /* Release work areas. */ 811146040Stjr free_workarea_compile (preg); 812146040Stjr re_string_destruct (®exp); 813146040Stjr 814146040Stjr if (BE (err != REG_NOERROR, 0)) 815146040Stjr { 816146040Stjr free_dfa_content (dfa); 817146040Stjr preg->buffer = NULL; 818146040Stjr preg->allocated = 0; 819146040Stjr } 820146040Stjr 821146040Stjr return err; 822146040Stjr} 823146040Stjr 824146040Stjr/* Initialize DFA. We use the length of the regular expression PAT_LEN 825146040Stjr as the initial length of some arrays. */ 826146040Stjr 827146040Stjrstatic reg_errcode_t 828250724Sjkiminit_dfa (re_dfa_t *dfa, size_t pat_len) 829146040Stjr{ 830250724Sjkim unsigned int table_size; 831146040Stjr#ifndef _LIBC 832146040Stjr char *codeset_name; 833146040Stjr#endif 834146040Stjr 835146040Stjr memset (dfa, '\0', sizeof (re_dfa_t)); 836146040Stjr 837146040Stjr /* Force allocation of str_tree_storage the first time. */ 838146040Stjr dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE; 839146040Stjr 840250724Sjkim /* Avoid overflows. */ 841250724Sjkim if (pat_len == SIZE_MAX) 842250724Sjkim return REG_ESPACE; 843250724Sjkim 844146040Stjr dfa->nodes_alloc = pat_len + 1; 845146040Stjr dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc); 846146040Stjr 847146040Stjr /* table_size = 2 ^ ceil(log pat_len) */ 848250724Sjkim for (table_size = 1; ; table_size <<= 1) 849146040Stjr if (table_size > pat_len) 850146040Stjr break; 851146040Stjr 852146040Stjr dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size); 853146040Stjr dfa->state_hash_mask = table_size - 1; 854146040Stjr 855146040Stjr dfa->mb_cur_max = MB_CUR_MAX; 856146040Stjr#ifdef _LIBC 857146040Stjr if (dfa->mb_cur_max == 6 858146040Stjr && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0) 859146040Stjr dfa->is_utf8 = 1; 860146040Stjr dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII) 861146040Stjr != 0); 862146040Stjr#else 863146040Stjr# ifdef HAVE_LANGINFO_CODESET 864146040Stjr codeset_name = nl_langinfo (CODESET); 865146040Stjr# else 866146040Stjr codeset_name = getenv ("LC_ALL"); 867146040Stjr if (codeset_name == NULL || codeset_name[0] == '\0') 868146040Stjr codeset_name = getenv ("LC_CTYPE"); 869146040Stjr if (codeset_name == NULL || codeset_name[0] == '\0') 870146040Stjr codeset_name = getenv ("LANG"); 871146040Stjr if (codeset_name == NULL) 872146040Stjr codeset_name = ""; 873146040Stjr else if (strchr (codeset_name, '.') != NULL) 874146040Stjr codeset_name = strchr (codeset_name, '.') + 1; 875146040Stjr# endif 876146040Stjr 877146040Stjr if (strcasecmp (codeset_name, "UTF-8") == 0 878146040Stjr || strcasecmp (codeset_name, "UTF8") == 0) 879146040Stjr dfa->is_utf8 = 1; 880146040Stjr 881146040Stjr /* We check exhaustively in the loop below if this charset is a 882146040Stjr superset of ASCII. */ 883146040Stjr dfa->map_notascii = 0; 884146040Stjr#endif 885146040Stjr 886146040Stjr#ifdef RE_ENABLE_I18N 887146040Stjr if (dfa->mb_cur_max > 1) 888146040Stjr { 889146040Stjr if (dfa->is_utf8) 890146040Stjr dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map; 891146040Stjr else 892146040Stjr { 893146040Stjr int i, j, ch; 894146040Stjr 895250724Sjkim dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); 896146040Stjr if (BE (dfa->sb_char == NULL, 0)) 897146040Stjr return REG_ESPACE; 898146040Stjr 899250724Sjkim /* Set the bits corresponding to single byte chars. */ 900250724Sjkim for (i = 0, ch = 0; i < BITSET_WORDS; ++i) 901250724Sjkim for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) 902146040Stjr { 903250724Sjkim wint_t wch = __btowc (ch); 904146040Stjr if (wch != WEOF) 905250724Sjkim dfa->sb_char[i] |= (bitset_word_t) 1 << j; 906146040Stjr# ifndef _LIBC 907250724Sjkim if (isascii (ch) && wch != ch) 908146040Stjr dfa->map_notascii = 1; 909146040Stjr# endif 910146040Stjr } 911146040Stjr } 912146040Stjr } 913146040Stjr#endif 914146040Stjr 915146040Stjr if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0)) 916146040Stjr return REG_ESPACE; 917146040Stjr return REG_NOERROR; 918146040Stjr} 919146040Stjr 920146040Stjr/* Initialize WORD_CHAR table, which indicate which character is 921146040Stjr "word". In this case "word" means that it is the word construction 922146040Stjr character used by some operators like "\<", "\>", etc. */ 923146040Stjr 924146040Stjrstatic void 925250724Sjkiminternal_function 926250724Sjkiminit_word_char (re_dfa_t *dfa) 927146040Stjr{ 928146040Stjr dfa->word_ops_used = 1; 929250724Sjkim int i = 0; 930250724Sjkim int ch = 0; 931250724Sjkim if (BE (dfa->map_notascii == 0, 1)) 932250724Sjkim { 933250724Sjkim if (sizeof (dfa->word_char[0]) == 8) 934250724Sjkim { 935250724Sjkim /* The extra temporaries here avoid "implicitly truncated" 936250724Sjkim warnings in the case when this is dead code, i.e. 32-bit. */ 937250724Sjkim const uint64_t wc0 = UINT64_C (0x03ff000000000000); 938250724Sjkim const uint64_t wc1 = UINT64_C (0x07fffffe87fffffe); 939250724Sjkim dfa->word_char[0] = wc0; 940250724Sjkim dfa->word_char[1] = wc1; 941250724Sjkim i = 2; 942250724Sjkim } 943250724Sjkim else if (sizeof (dfa->word_char[0]) == 4) 944250724Sjkim { 945250724Sjkim dfa->word_char[0] = UINT32_C (0x00000000); 946250724Sjkim dfa->word_char[1] = UINT32_C (0x03ff0000); 947250724Sjkim dfa->word_char[2] = UINT32_C (0x87fffffe); 948250724Sjkim dfa->word_char[3] = UINT32_C (0x07fffffe); 949250724Sjkim i = 4; 950250724Sjkim } 951250724Sjkim else 952250724Sjkim abort (); 953250724Sjkim ch = 128; 954250724Sjkim 955250724Sjkim if (BE (dfa->is_utf8, 1)) 956250724Sjkim { 957250724Sjkim memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8); 958250724Sjkim return; 959250724Sjkim } 960250724Sjkim } 961250724Sjkim 962250724Sjkim for (; i < BITSET_WORDS; ++i) 963250724Sjkim for (int j = 0; j < BITSET_WORD_BITS; ++j, ++ch) 964146040Stjr if (isalnum (ch) || ch == '_') 965250724Sjkim dfa->word_char[i] |= (bitset_word_t) 1 << j; 966146040Stjr} 967146040Stjr 968146040Stjr/* Free the work area which are only used while compiling. */ 969146040Stjr 970146040Stjrstatic void 971250724Sjkimfree_workarea_compile (regex_t *preg) 972146040Stjr{ 973146040Stjr re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 974146040Stjr bin_tree_storage_t *storage, *next; 975146040Stjr for (storage = dfa->str_tree_storage; storage; storage = next) 976146040Stjr { 977146040Stjr next = storage->next; 978146040Stjr re_free (storage); 979146040Stjr } 980146040Stjr dfa->str_tree_storage = NULL; 981146040Stjr dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE; 982146040Stjr dfa->str_tree = NULL; 983146040Stjr re_free (dfa->org_indices); 984146040Stjr dfa->org_indices = NULL; 985146040Stjr} 986146040Stjr 987146040Stjr/* Create initial states for all contexts. */ 988146040Stjr 989146040Stjrstatic reg_errcode_t 990250724Sjkimcreate_initial_state (re_dfa_t *dfa) 991146040Stjr{ 992146040Stjr int first, i; 993146040Stjr reg_errcode_t err; 994146040Stjr re_node_set init_nodes; 995146040Stjr 996146040Stjr /* Initial states have the epsilon closure of the node which is 997146040Stjr the first node of the regular expression. */ 998146040Stjr first = dfa->str_tree->first->node_idx; 999146040Stjr dfa->init_node = first; 1000146040Stjr err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first); 1001146040Stjr if (BE (err != REG_NOERROR, 0)) 1002146040Stjr return err; 1003146040Stjr 1004146040Stjr /* The back-references which are in initial states can epsilon transit, 1005146040Stjr since in this case all of the subexpressions can be null. 1006146040Stjr Then we add epsilon closures of the nodes which are the next nodes of 1007146040Stjr the back-references. */ 1008146040Stjr if (dfa->nbackref > 0) 1009146040Stjr for (i = 0; i < init_nodes.nelem; ++i) 1010146040Stjr { 1011146040Stjr int node_idx = init_nodes.elems[i]; 1012146040Stjr re_token_type_t type = dfa->nodes[node_idx].type; 1013146040Stjr 1014146040Stjr int clexp_idx; 1015146040Stjr if (type != OP_BACK_REF) 1016146040Stjr continue; 1017146040Stjr for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx) 1018146040Stjr { 1019146040Stjr re_token_t *clexp_node; 1020146040Stjr clexp_node = dfa->nodes + init_nodes.elems[clexp_idx]; 1021146040Stjr if (clexp_node->type == OP_CLOSE_SUBEXP 1022146040Stjr && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx) 1023146040Stjr break; 1024146040Stjr } 1025146040Stjr if (clexp_idx == init_nodes.nelem) 1026146040Stjr continue; 1027146040Stjr 1028146040Stjr if (type == OP_BACK_REF) 1029146040Stjr { 1030146040Stjr int dest_idx = dfa->edests[node_idx].elems[0]; 1031146040Stjr if (!re_node_set_contains (&init_nodes, dest_idx)) 1032146040Stjr { 1033250724Sjkim reg_errcode_t err = re_node_set_merge (&init_nodes, 1034250724Sjkim dfa->eclosures 1035250724Sjkim + dest_idx); 1036250724Sjkim if (err != REG_NOERROR) 1037250724Sjkim return err; 1038146040Stjr i = 0; 1039146040Stjr } 1040146040Stjr } 1041146040Stjr } 1042146040Stjr 1043146040Stjr /* It must be the first time to invoke acquire_state. */ 1044146040Stjr dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0); 1045146040Stjr /* We don't check ERR here, since the initial state must not be NULL. */ 1046146040Stjr if (BE (dfa->init_state == NULL, 0)) 1047146040Stjr return err; 1048146040Stjr if (dfa->init_state->has_constraint) 1049146040Stjr { 1050146040Stjr dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes, 1051146040Stjr CONTEXT_WORD); 1052146040Stjr dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes, 1053146040Stjr CONTEXT_NEWLINE); 1054146040Stjr dfa->init_state_begbuf = re_acquire_state_context (&err, dfa, 1055146040Stjr &init_nodes, 1056146040Stjr CONTEXT_NEWLINE 1057146040Stjr | CONTEXT_BEGBUF); 1058146040Stjr if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL 1059146040Stjr || dfa->init_state_begbuf == NULL, 0)) 1060146040Stjr return err; 1061146040Stjr } 1062146040Stjr else 1063146040Stjr dfa->init_state_word = dfa->init_state_nl 1064146040Stjr = dfa->init_state_begbuf = dfa->init_state; 1065146040Stjr 1066146040Stjr re_node_set_free (&init_nodes); 1067146040Stjr return REG_NOERROR; 1068146040Stjr} 1069146040Stjr 1070146040Stjr#ifdef RE_ENABLE_I18N 1071146040Stjr/* If it is possible to do searching in single byte encoding instead of UTF-8 1072146040Stjr to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change 1073146040Stjr DFA nodes where needed. */ 1074146040Stjr 1075146040Stjrstatic void 1076250724Sjkimoptimize_utf8 (re_dfa_t *dfa) 1077146040Stjr{ 1078146040Stjr int node, i, mb_chars = 0, has_period = 0; 1079146040Stjr 1080146040Stjr for (node = 0; node < dfa->nodes_len; ++node) 1081146040Stjr switch (dfa->nodes[node].type) 1082146040Stjr { 1083146040Stjr case CHARACTER: 1084146040Stjr if (dfa->nodes[node].opr.c >= 0x80) 1085146040Stjr mb_chars = 1; 1086146040Stjr break; 1087146040Stjr case ANCHOR: 1088250724Sjkim switch (dfa->nodes[node].opr.ctx_type) 1089146040Stjr { 1090146040Stjr case LINE_FIRST: 1091146040Stjr case LINE_LAST: 1092146040Stjr case BUF_FIRST: 1093146040Stjr case BUF_LAST: 1094146040Stjr break; 1095146040Stjr default: 1096250724Sjkim /* Word anchors etc. cannot be handled. It's okay to test 1097250724Sjkim opr.ctx_type since constraints (for all DFA nodes) are 1098250724Sjkim created by ORing one or more opr.ctx_type values. */ 1099146040Stjr return; 1100146040Stjr } 1101146040Stjr break; 1102146040Stjr case OP_PERIOD: 1103250724Sjkim has_period = 1; 1104250724Sjkim break; 1105146040Stjr case OP_BACK_REF: 1106146040Stjr case OP_ALT: 1107146040Stjr case END_OF_RE: 1108146040Stjr case OP_DUP_ASTERISK: 1109146040Stjr case OP_OPEN_SUBEXP: 1110146040Stjr case OP_CLOSE_SUBEXP: 1111146040Stjr break; 1112146040Stjr case COMPLEX_BRACKET: 1113146040Stjr return; 1114146040Stjr case SIMPLE_BRACKET: 1115250724Sjkim /* Just double check. The non-ASCII range starts at 0x80. */ 1116250724Sjkim assert (0x80 % BITSET_WORD_BITS == 0); 1117250724Sjkim for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i) 1118146040Stjr if (dfa->nodes[node].opr.sbcset[i]) 1119146040Stjr return; 1120146040Stjr break; 1121146040Stjr default: 1122146040Stjr abort (); 1123146040Stjr } 1124146040Stjr 1125146040Stjr if (mb_chars || has_period) 1126146040Stjr for (node = 0; node < dfa->nodes_len; ++node) 1127146040Stjr { 1128146040Stjr if (dfa->nodes[node].type == CHARACTER 1129146040Stjr && dfa->nodes[node].opr.c >= 0x80) 1130146040Stjr dfa->nodes[node].mb_partial = 0; 1131146040Stjr else if (dfa->nodes[node].type == OP_PERIOD) 1132146040Stjr dfa->nodes[node].type = OP_UTF8_PERIOD; 1133146040Stjr } 1134146040Stjr 1135146040Stjr /* The search can be in single byte locale. */ 1136146040Stjr dfa->mb_cur_max = 1; 1137146040Stjr dfa->is_utf8 = 0; 1138146040Stjr dfa->has_mb_node = dfa->nbackref > 0 || has_period; 1139146040Stjr} 1140146040Stjr#endif 1141146040Stjr 1142146040Stjr/* Analyze the structure tree, and calculate "first", "next", "edest", 1143146040Stjr "eclosure", and "inveclosure". */ 1144146040Stjr 1145146040Stjrstatic reg_errcode_t 1146250724Sjkimanalyze (regex_t *preg) 1147146040Stjr{ 1148146040Stjr re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 1149146040Stjr reg_errcode_t ret; 1150146040Stjr 1151146040Stjr /* Allocate arrays. */ 1152146040Stjr dfa->nexts = re_malloc (int, dfa->nodes_alloc); 1153146040Stjr dfa->org_indices = re_malloc (int, dfa->nodes_alloc); 1154146040Stjr dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc); 1155146040Stjr dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc); 1156146040Stjr if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL 1157146040Stjr || dfa->eclosures == NULL, 0)) 1158146040Stjr return REG_ESPACE; 1159146040Stjr 1160146040Stjr dfa->subexp_map = re_malloc (int, preg->re_nsub); 1161146040Stjr if (dfa->subexp_map != NULL) 1162146040Stjr { 1163146040Stjr int i; 1164146040Stjr for (i = 0; i < preg->re_nsub; i++) 1165146040Stjr dfa->subexp_map[i] = i; 1166146040Stjr preorder (dfa->str_tree, optimize_subexps, dfa); 1167146040Stjr for (i = 0; i < preg->re_nsub; i++) 1168146040Stjr if (dfa->subexp_map[i] != i) 1169146040Stjr break; 1170146040Stjr if (i == preg->re_nsub) 1171146040Stjr { 1172146040Stjr free (dfa->subexp_map); 1173146040Stjr dfa->subexp_map = NULL; 1174146040Stjr } 1175146040Stjr } 1176146040Stjr 1177146040Stjr ret = postorder (dfa->str_tree, lower_subexps, preg); 1178146040Stjr if (BE (ret != REG_NOERROR, 0)) 1179146040Stjr return ret; 1180146040Stjr ret = postorder (dfa->str_tree, calc_first, dfa); 1181146040Stjr if (BE (ret != REG_NOERROR, 0)) 1182146040Stjr return ret; 1183146040Stjr preorder (dfa->str_tree, calc_next, dfa); 1184146040Stjr ret = preorder (dfa->str_tree, link_nfa_nodes, dfa); 1185146040Stjr if (BE (ret != REG_NOERROR, 0)) 1186146040Stjr return ret; 1187146040Stjr ret = calc_eclosure (dfa); 1188146040Stjr if (BE (ret != REG_NOERROR, 0)) 1189146040Stjr return ret; 1190146040Stjr 1191146040Stjr /* We only need this during the prune_impossible_nodes pass in regexec.c; 1192146040Stjr skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */ 1193146040Stjr if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match) 1194146040Stjr || dfa->nbackref) 1195146040Stjr { 1196146040Stjr dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len); 1197146040Stjr if (BE (dfa->inveclosures == NULL, 0)) 1198250724Sjkim return REG_ESPACE; 1199146040Stjr ret = calc_inveclosure (dfa); 1200146040Stjr } 1201146040Stjr 1202146040Stjr return ret; 1203146040Stjr} 1204146040Stjr 1205146040Stjr/* Our parse trees are very unbalanced, so we cannot use a stack to 1206146040Stjr implement parse tree visits. Instead, we use parent pointers and 1207146040Stjr some hairy code in these two functions. */ 1208146040Stjrstatic reg_errcode_t 1209250724Sjkimpostorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)), 1210250724Sjkim void *extra) 1211146040Stjr{ 1212146040Stjr bin_tree_t *node, *prev; 1213146040Stjr 1214146040Stjr for (node = root; ; ) 1215146040Stjr { 1216146040Stjr /* Descend down the tree, preferably to the left (or to the right 1217146040Stjr if that's the only child). */ 1218146040Stjr while (node->left || node->right) 1219146040Stjr if (node->left) 1220250724Sjkim node = node->left; 1221250724Sjkim else 1222250724Sjkim node = node->right; 1223146040Stjr 1224146040Stjr do 1225146040Stjr { 1226146040Stjr reg_errcode_t err = fn (extra, node); 1227146040Stjr if (BE (err != REG_NOERROR, 0)) 1228146040Stjr return err; 1229250724Sjkim if (node->parent == NULL) 1230146040Stjr return REG_NOERROR; 1231146040Stjr prev = node; 1232146040Stjr node = node->parent; 1233146040Stjr } 1234146040Stjr /* Go up while we have a node that is reached from the right. */ 1235146040Stjr while (node->right == prev || node->right == NULL); 1236146040Stjr node = node->right; 1237146040Stjr } 1238146040Stjr} 1239146040Stjr 1240146040Stjrstatic reg_errcode_t 1241250724Sjkimpreorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)), 1242250724Sjkim void *extra) 1243146040Stjr{ 1244146040Stjr bin_tree_t *node; 1245146040Stjr 1246146040Stjr for (node = root; ; ) 1247146040Stjr { 1248146040Stjr reg_errcode_t err = fn (extra, node); 1249146040Stjr if (BE (err != REG_NOERROR, 0)) 1250146040Stjr return err; 1251146040Stjr 1252146040Stjr /* Go to the left node, or up and to the right. */ 1253146040Stjr if (node->left) 1254146040Stjr node = node->left; 1255146040Stjr else 1256146040Stjr { 1257146040Stjr bin_tree_t *prev = NULL; 1258146040Stjr while (node->right == prev || node->right == NULL) 1259146040Stjr { 1260146040Stjr prev = node; 1261146040Stjr node = node->parent; 1262146040Stjr if (!node) 1263250724Sjkim return REG_NOERROR; 1264146040Stjr } 1265146040Stjr node = node->right; 1266146040Stjr } 1267146040Stjr } 1268146040Stjr} 1269146040Stjr 1270146040Stjr/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell 1271146040Stjr re_search_internal to map the inner one's opr.idx to this one's. Adjust 1272146040Stjr backreferences as well. Requires a preorder visit. */ 1273146040Stjrstatic reg_errcode_t 1274250724Sjkimoptimize_subexps (void *extra, bin_tree_t *node) 1275146040Stjr{ 1276146040Stjr re_dfa_t *dfa = (re_dfa_t *) extra; 1277146040Stjr 1278146040Stjr if (node->token.type == OP_BACK_REF && dfa->subexp_map) 1279146040Stjr { 1280146040Stjr int idx = node->token.opr.idx; 1281146040Stjr node->token.opr.idx = dfa->subexp_map[idx]; 1282146040Stjr dfa->used_bkref_map |= 1 << node->token.opr.idx; 1283146040Stjr } 1284146040Stjr 1285146040Stjr else if (node->token.type == SUBEXP 1286250724Sjkim && node->left && node->left->token.type == SUBEXP) 1287146040Stjr { 1288146040Stjr int other_idx = node->left->token.opr.idx; 1289146040Stjr 1290146040Stjr node->left = node->left->left; 1291146040Stjr if (node->left) 1292250724Sjkim node->left->parent = node; 1293146040Stjr 1294146040Stjr dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx]; 1295250724Sjkim if (other_idx < BITSET_WORD_BITS) 1296250724Sjkim dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx); 1297146040Stjr } 1298146040Stjr 1299146040Stjr return REG_NOERROR; 1300146040Stjr} 1301146040Stjr 1302146040Stjr/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation 1303146040Stjr of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */ 1304146040Stjrstatic reg_errcode_t 1305250724Sjkimlower_subexps (void *extra, bin_tree_t *node) 1306146040Stjr{ 1307146040Stjr regex_t *preg = (regex_t *) extra; 1308146040Stjr reg_errcode_t err = REG_NOERROR; 1309146040Stjr 1310146040Stjr if (node->left && node->left->token.type == SUBEXP) 1311146040Stjr { 1312146040Stjr node->left = lower_subexp (&err, preg, node->left); 1313146040Stjr if (node->left) 1314146040Stjr node->left->parent = node; 1315146040Stjr } 1316146040Stjr if (node->right && node->right->token.type == SUBEXP) 1317146040Stjr { 1318146040Stjr node->right = lower_subexp (&err, preg, node->right); 1319146040Stjr if (node->right) 1320146040Stjr node->right->parent = node; 1321146040Stjr } 1322146040Stjr 1323146040Stjr return err; 1324146040Stjr} 1325146040Stjr 1326146040Stjrstatic bin_tree_t * 1327250724Sjkimlower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node) 1328146040Stjr{ 1329146040Stjr re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 1330146040Stjr bin_tree_t *body = node->left; 1331146040Stjr bin_tree_t *op, *cls, *tree1, *tree; 1332146040Stjr 1333146040Stjr if (preg->no_sub 1334146040Stjr /* We do not optimize empty subexpressions, because otherwise we may 1335146040Stjr have bad CONCAT nodes with NULL children. This is obviously not 1336146040Stjr very common, so we do not lose much. An example that triggers 1337146040Stjr this case is the sed "script" /\(\)/x. */ 1338146040Stjr && node->left != NULL 1339250724Sjkim && (node->token.opr.idx >= BITSET_WORD_BITS 1340250724Sjkim || !(dfa->used_bkref_map 1341250724Sjkim & ((bitset_word_t) 1 << node->token.opr.idx)))) 1342146040Stjr return node->left; 1343146040Stjr 1344146040Stjr /* Convert the SUBEXP node to the concatenation of an 1345146040Stjr OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */ 1346146040Stjr op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP); 1347146040Stjr cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP); 1348146040Stjr tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls; 1349146040Stjr tree = create_tree (dfa, op, tree1, CONCAT); 1350146040Stjr if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0)) 1351146040Stjr { 1352146040Stjr *err = REG_ESPACE; 1353146040Stjr return NULL; 1354146040Stjr } 1355146040Stjr 1356146040Stjr op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx; 1357146040Stjr op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp; 1358146040Stjr return tree; 1359146040Stjr} 1360146040Stjr 1361146040Stjr/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton 1362146040Stjr nodes. Requires a postorder visit. */ 1363146040Stjrstatic reg_errcode_t 1364250724Sjkimcalc_first (void *extra, bin_tree_t *node) 1365146040Stjr{ 1366146040Stjr re_dfa_t *dfa = (re_dfa_t *) extra; 1367146040Stjr if (node->token.type == CONCAT) 1368146040Stjr { 1369146040Stjr node->first = node->left->first; 1370146040Stjr node->node_idx = node->left->node_idx; 1371146040Stjr } 1372146040Stjr else 1373146040Stjr { 1374146040Stjr node->first = node; 1375146040Stjr node->node_idx = re_dfa_add_node (dfa, node->token); 1376146040Stjr if (BE (node->node_idx == -1, 0)) 1377250724Sjkim return REG_ESPACE; 1378250724Sjkim if (node->token.type == ANCHOR) 1379250724Sjkim dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type; 1380146040Stjr } 1381146040Stjr return REG_NOERROR; 1382146040Stjr} 1383146040Stjr 1384146040Stjr/* Pass 2: compute NEXT on the tree. Preorder visit. */ 1385146040Stjrstatic reg_errcode_t 1386250724Sjkimcalc_next (void *extra, bin_tree_t *node) 1387146040Stjr{ 1388146040Stjr switch (node->token.type) 1389146040Stjr { 1390146040Stjr case OP_DUP_ASTERISK: 1391146040Stjr node->left->next = node; 1392146040Stjr break; 1393146040Stjr case CONCAT: 1394146040Stjr node->left->next = node->right->first; 1395146040Stjr node->right->next = node->next; 1396146040Stjr break; 1397146040Stjr default: 1398146040Stjr if (node->left) 1399146040Stjr node->left->next = node->next; 1400146040Stjr if (node->right) 1401250724Sjkim node->right->next = node->next; 1402146040Stjr break; 1403146040Stjr } 1404146040Stjr return REG_NOERROR; 1405146040Stjr} 1406146040Stjr 1407146040Stjr/* Pass 3: link all DFA nodes to their NEXT node (any order will do). */ 1408146040Stjrstatic reg_errcode_t 1409250724Sjkimlink_nfa_nodes (void *extra, bin_tree_t *node) 1410146040Stjr{ 1411146040Stjr re_dfa_t *dfa = (re_dfa_t *) extra; 1412146040Stjr int idx = node->node_idx; 1413146040Stjr reg_errcode_t err = REG_NOERROR; 1414146040Stjr 1415146040Stjr switch (node->token.type) 1416146040Stjr { 1417146040Stjr case CONCAT: 1418146040Stjr break; 1419146040Stjr 1420146040Stjr case END_OF_RE: 1421146040Stjr assert (node->next == NULL); 1422146040Stjr break; 1423146040Stjr 1424146040Stjr case OP_DUP_ASTERISK: 1425146040Stjr case OP_ALT: 1426146040Stjr { 1427146040Stjr int left, right; 1428146040Stjr dfa->has_plural_match = 1; 1429146040Stjr if (node->left != NULL) 1430146040Stjr left = node->left->first->node_idx; 1431146040Stjr else 1432146040Stjr left = node->next->node_idx; 1433146040Stjr if (node->right != NULL) 1434146040Stjr right = node->right->first->node_idx; 1435146040Stjr else 1436146040Stjr right = node->next->node_idx; 1437146040Stjr assert (left > -1); 1438146040Stjr assert (right > -1); 1439146040Stjr err = re_node_set_init_2 (dfa->edests + idx, left, right); 1440146040Stjr } 1441146040Stjr break; 1442146040Stjr 1443146040Stjr case ANCHOR: 1444146040Stjr case OP_OPEN_SUBEXP: 1445146040Stjr case OP_CLOSE_SUBEXP: 1446146040Stjr err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx); 1447146040Stjr break; 1448146040Stjr 1449146040Stjr case OP_BACK_REF: 1450146040Stjr dfa->nexts[idx] = node->next->node_idx; 1451146040Stjr if (node->token.type == OP_BACK_REF) 1452250724Sjkim err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]); 1453146040Stjr break; 1454146040Stjr 1455146040Stjr default: 1456146040Stjr assert (!IS_EPSILON_NODE (node->token.type)); 1457146040Stjr dfa->nexts[idx] = node->next->node_idx; 1458146040Stjr break; 1459146040Stjr } 1460146040Stjr 1461146040Stjr return err; 1462146040Stjr} 1463146040Stjr 1464146040Stjr/* Duplicate the epsilon closure of the node ROOT_NODE. 1465146040Stjr Note that duplicated nodes have constraint INIT_CONSTRAINT in addition 1466146040Stjr to their own constraint. */ 1467146040Stjr 1468146040Stjrstatic reg_errcode_t 1469250724Sjkiminternal_function 1470250724Sjkimduplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node, 1471250724Sjkim int root_node, unsigned int init_constraint) 1472146040Stjr{ 1473146040Stjr int org_node, clone_node, ret; 1474146040Stjr unsigned int constraint = init_constraint; 1475146040Stjr for (org_node = top_org_node, clone_node = top_clone_node;;) 1476146040Stjr { 1477146040Stjr int org_dest, clone_dest; 1478146040Stjr if (dfa->nodes[org_node].type == OP_BACK_REF) 1479146040Stjr { 1480146040Stjr /* If the back reference epsilon-transit, its destination must 1481146040Stjr also have the constraint. Then duplicate the epsilon closure 1482146040Stjr of the destination of the back reference, and store it in 1483146040Stjr edests of the back reference. */ 1484146040Stjr org_dest = dfa->nexts[org_node]; 1485146040Stjr re_node_set_empty (dfa->edests + clone_node); 1486250724Sjkim clone_dest = duplicate_node (dfa, org_dest, constraint); 1487250724Sjkim if (BE (clone_dest == -1, 0)) 1488250724Sjkim return REG_ESPACE; 1489146040Stjr dfa->nexts[clone_node] = dfa->nexts[org_node]; 1490146040Stjr ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1491146040Stjr if (BE (ret < 0, 0)) 1492146040Stjr return REG_ESPACE; 1493146040Stjr } 1494146040Stjr else if (dfa->edests[org_node].nelem == 0) 1495146040Stjr { 1496146040Stjr /* In case of the node can't epsilon-transit, don't duplicate the 1497146040Stjr destination and store the original destination as the 1498146040Stjr destination of the node. */ 1499146040Stjr dfa->nexts[clone_node] = dfa->nexts[org_node]; 1500146040Stjr break; 1501146040Stjr } 1502146040Stjr else if (dfa->edests[org_node].nelem == 1) 1503146040Stjr { 1504146040Stjr /* In case of the node can epsilon-transit, and it has only one 1505146040Stjr destination. */ 1506146040Stjr org_dest = dfa->edests[org_node].elems[0]; 1507146040Stjr re_node_set_empty (dfa->edests + clone_node); 1508250724Sjkim /* If the node is root_node itself, it means the epsilon clsoure 1509250724Sjkim has a loop. Then tie it to the destination of the root_node. */ 1510250724Sjkim if (org_node == root_node && clone_node != org_node) 1511146040Stjr { 1512250724Sjkim ret = re_node_set_insert (dfa->edests + clone_node, org_dest); 1513250724Sjkim if (BE (ret < 0, 0)) 1514250724Sjkim return REG_ESPACE; 1515250724Sjkim break; 1516146040Stjr } 1517250724Sjkim /* In case of the node has another constraint, add it. */ 1518250724Sjkim constraint |= dfa->nodes[org_node].constraint; 1519250724Sjkim clone_dest = duplicate_node (dfa, org_dest, constraint); 1520250724Sjkim if (BE (clone_dest == -1, 0)) 1521250724Sjkim return REG_ESPACE; 1522146040Stjr ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1523146040Stjr if (BE (ret < 0, 0)) 1524146040Stjr return REG_ESPACE; 1525146040Stjr } 1526146040Stjr else /* dfa->edests[org_node].nelem == 2 */ 1527146040Stjr { 1528146040Stjr /* In case of the node can epsilon-transit, and it has two 1529146040Stjr destinations. In the bin_tree_t and DFA, that's '|' and '*'. */ 1530146040Stjr org_dest = dfa->edests[org_node].elems[0]; 1531146040Stjr re_node_set_empty (dfa->edests + clone_node); 1532146040Stjr /* Search for a duplicated node which satisfies the constraint. */ 1533146040Stjr clone_dest = search_duplicated_node (dfa, org_dest, constraint); 1534146040Stjr if (clone_dest == -1) 1535146040Stjr { 1536250724Sjkim /* There is no such duplicated node, create a new one. */ 1537250724Sjkim reg_errcode_t err; 1538250724Sjkim clone_dest = duplicate_node (dfa, org_dest, constraint); 1539250724Sjkim if (BE (clone_dest == -1, 0)) 1540250724Sjkim return REG_ESPACE; 1541146040Stjr ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1542146040Stjr if (BE (ret < 0, 0)) 1543146040Stjr return REG_ESPACE; 1544146040Stjr err = duplicate_node_closure (dfa, org_dest, clone_dest, 1545146040Stjr root_node, constraint); 1546146040Stjr if (BE (err != REG_NOERROR, 0)) 1547146040Stjr return err; 1548146040Stjr } 1549146040Stjr else 1550146040Stjr { 1551250724Sjkim /* There is a duplicated node which satisfies the constraint, 1552146040Stjr use it to avoid infinite loop. */ 1553146040Stjr ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1554146040Stjr if (BE (ret < 0, 0)) 1555146040Stjr return REG_ESPACE; 1556146040Stjr } 1557146040Stjr 1558146040Stjr org_dest = dfa->edests[org_node].elems[1]; 1559250724Sjkim clone_dest = duplicate_node (dfa, org_dest, constraint); 1560250724Sjkim if (BE (clone_dest == -1, 0)) 1561250724Sjkim return REG_ESPACE; 1562146040Stjr ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1563146040Stjr if (BE (ret < 0, 0)) 1564146040Stjr return REG_ESPACE; 1565146040Stjr } 1566146040Stjr org_node = org_dest; 1567146040Stjr clone_node = clone_dest; 1568146040Stjr } 1569146040Stjr return REG_NOERROR; 1570146040Stjr} 1571146040Stjr 1572146040Stjr/* Search for a node which is duplicated from the node ORG_NODE, and 1573146040Stjr satisfies the constraint CONSTRAINT. */ 1574146040Stjr 1575146040Stjrstatic int 1576250724Sjkimsearch_duplicated_node (const re_dfa_t *dfa, int org_node, 1577250724Sjkim unsigned int constraint) 1578146040Stjr{ 1579146040Stjr int idx; 1580146040Stjr for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx) 1581146040Stjr { 1582146040Stjr if (org_node == dfa->org_indices[idx] 1583146040Stjr && constraint == dfa->nodes[idx].constraint) 1584146040Stjr return idx; /* Found. */ 1585146040Stjr } 1586146040Stjr return -1; /* Not found. */ 1587146040Stjr} 1588146040Stjr 1589146040Stjr/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT. 1590250724Sjkim Return the index of the new node, or -1 if insufficient storage is 1591250724Sjkim available. */ 1592146040Stjr 1593250724Sjkimstatic int 1594250724Sjkimduplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint) 1595146040Stjr{ 1596146040Stjr int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]); 1597250724Sjkim if (BE (dup_idx != -1, 1)) 1598250724Sjkim { 1599250724Sjkim dfa->nodes[dup_idx].constraint = constraint; 1600250724Sjkim dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint; 1601250724Sjkim dfa->nodes[dup_idx].duplicated = 1; 1602146040Stjr 1603250724Sjkim /* Store the index of the original node. */ 1604250724Sjkim dfa->org_indices[dup_idx] = org_idx; 1605250724Sjkim } 1606250724Sjkim return dup_idx; 1607146040Stjr} 1608146040Stjr 1609146040Stjrstatic reg_errcode_t 1610250724Sjkimcalc_inveclosure (re_dfa_t *dfa) 1611146040Stjr{ 1612146040Stjr int src, idx, ret; 1613146040Stjr for (idx = 0; idx < dfa->nodes_len; ++idx) 1614146040Stjr re_node_set_init_empty (dfa->inveclosures + idx); 1615146040Stjr 1616146040Stjr for (src = 0; src < dfa->nodes_len; ++src) 1617146040Stjr { 1618146040Stjr int *elems = dfa->eclosures[src].elems; 1619146040Stjr for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx) 1620146040Stjr { 1621146040Stjr ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src); 1622146040Stjr if (BE (ret == -1, 0)) 1623146040Stjr return REG_ESPACE; 1624146040Stjr } 1625146040Stjr } 1626146040Stjr 1627146040Stjr return REG_NOERROR; 1628146040Stjr} 1629146040Stjr 1630146040Stjr/* Calculate "eclosure" for all the node in DFA. */ 1631146040Stjr 1632146040Stjrstatic reg_errcode_t 1633250724Sjkimcalc_eclosure (re_dfa_t *dfa) 1634146040Stjr{ 1635146040Stjr int node_idx, incomplete; 1636146040Stjr#ifdef DEBUG 1637146040Stjr assert (dfa->nodes_len > 0); 1638146040Stjr#endif 1639146040Stjr incomplete = 0; 1640146040Stjr /* For each nodes, calculate epsilon closure. */ 1641146040Stjr for (node_idx = 0; ; ++node_idx) 1642146040Stjr { 1643146040Stjr reg_errcode_t err; 1644146040Stjr re_node_set eclosure_elem; 1645146040Stjr if (node_idx == dfa->nodes_len) 1646146040Stjr { 1647146040Stjr if (!incomplete) 1648146040Stjr break; 1649146040Stjr incomplete = 0; 1650146040Stjr node_idx = 0; 1651146040Stjr } 1652146040Stjr 1653146040Stjr#ifdef DEBUG 1654146040Stjr assert (dfa->eclosures[node_idx].nelem != -1); 1655146040Stjr#endif 1656146040Stjr 1657146040Stjr /* If we have already calculated, skip it. */ 1658146040Stjr if (dfa->eclosures[node_idx].nelem != 0) 1659146040Stjr continue; 1660146040Stjr /* Calculate epsilon closure of `node_idx'. */ 1661146040Stjr err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1); 1662146040Stjr if (BE (err != REG_NOERROR, 0)) 1663146040Stjr return err; 1664146040Stjr 1665146040Stjr if (dfa->eclosures[node_idx].nelem == 0) 1666146040Stjr { 1667146040Stjr incomplete = 1; 1668146040Stjr re_node_set_free (&eclosure_elem); 1669146040Stjr } 1670146040Stjr } 1671146040Stjr return REG_NOERROR; 1672146040Stjr} 1673146040Stjr 1674146040Stjr/* Calculate epsilon closure of NODE. */ 1675146040Stjr 1676146040Stjrstatic reg_errcode_t 1677250724Sjkimcalc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root) 1678146040Stjr{ 1679146040Stjr reg_errcode_t err; 1680250724Sjkim int i; 1681146040Stjr re_node_set eclosure; 1682250724Sjkim int ret; 1683250724Sjkim int incomplete = 0; 1684146040Stjr err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1); 1685146040Stjr if (BE (err != REG_NOERROR, 0)) 1686146040Stjr return err; 1687146040Stjr 1688146040Stjr /* This indicates that we are calculating this node now. 1689146040Stjr We reference this value to avoid infinite loop. */ 1690146040Stjr dfa->eclosures[node].nelem = -1; 1691146040Stjr 1692250724Sjkim /* If the current node has constraints, duplicate all nodes 1693250724Sjkim since they must inherit the constraints. */ 1694250724Sjkim if (dfa->nodes[node].constraint 1695146040Stjr && dfa->edests[node].nelem 1696146040Stjr && !dfa->nodes[dfa->edests[node].elems[0]].duplicated) 1697146040Stjr { 1698250724Sjkim err = duplicate_node_closure (dfa, node, node, node, 1699250724Sjkim dfa->nodes[node].constraint); 1700146040Stjr if (BE (err != REG_NOERROR, 0)) 1701146040Stjr return err; 1702146040Stjr } 1703146040Stjr 1704146040Stjr /* Expand each epsilon destination nodes. */ 1705146040Stjr if (IS_EPSILON_NODE(dfa->nodes[node].type)) 1706146040Stjr for (i = 0; i < dfa->edests[node].nelem; ++i) 1707146040Stjr { 1708146040Stjr re_node_set eclosure_elem; 1709146040Stjr int edest = dfa->edests[node].elems[i]; 1710146040Stjr /* If calculating the epsilon closure of `edest' is in progress, 1711146040Stjr return intermediate result. */ 1712146040Stjr if (dfa->eclosures[edest].nelem == -1) 1713146040Stjr { 1714146040Stjr incomplete = 1; 1715146040Stjr continue; 1716146040Stjr } 1717146040Stjr /* If we haven't calculated the epsilon closure of `edest' yet, 1718146040Stjr calculate now. Otherwise use calculated epsilon closure. */ 1719146040Stjr if (dfa->eclosures[edest].nelem == 0) 1720146040Stjr { 1721146040Stjr err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0); 1722146040Stjr if (BE (err != REG_NOERROR, 0)) 1723146040Stjr return err; 1724146040Stjr } 1725146040Stjr else 1726146040Stjr eclosure_elem = dfa->eclosures[edest]; 1727146040Stjr /* Merge the epsilon closure of `edest'. */ 1728250724Sjkim err = re_node_set_merge (&eclosure, &eclosure_elem); 1729250724Sjkim if (BE (err != REG_NOERROR, 0)) 1730250724Sjkim return err; 1731146040Stjr /* If the epsilon closure of `edest' is incomplete, 1732146040Stjr the epsilon closure of this node is also incomplete. */ 1733146040Stjr if (dfa->eclosures[edest].nelem == 0) 1734146040Stjr { 1735146040Stjr incomplete = 1; 1736146040Stjr re_node_set_free (&eclosure_elem); 1737146040Stjr } 1738146040Stjr } 1739146040Stjr 1740250724Sjkim /* An epsilon closure includes itself. */ 1741250724Sjkim ret = re_node_set_insert (&eclosure, node); 1742250724Sjkim if (BE (ret < 0, 0)) 1743250724Sjkim return REG_ESPACE; 1744146040Stjr if (incomplete && !root) 1745146040Stjr dfa->eclosures[node].nelem = 0; 1746146040Stjr else 1747146040Stjr dfa->eclosures[node] = eclosure; 1748146040Stjr *new_set = eclosure; 1749146040Stjr return REG_NOERROR; 1750146040Stjr} 1751146040Stjr 1752146040Stjr/* Functions for token which are used in the parser. */ 1753146040Stjr 1754146040Stjr/* Fetch a token from INPUT. 1755146040Stjr We must not use this function inside bracket expressions. */ 1756146040Stjr 1757146040Stjrstatic void 1758250724Sjkiminternal_function 1759250724Sjkimfetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax) 1760146040Stjr{ 1761146040Stjr re_string_skip_bytes (input, peek_token (result, input, syntax)); 1762146040Stjr} 1763146040Stjr 1764146040Stjr/* Peek a token from INPUT, and return the length of the token. 1765146040Stjr We must not use this function inside bracket expressions. */ 1766146040Stjr 1767146040Stjrstatic int 1768250724Sjkiminternal_function 1769250724Sjkimpeek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax) 1770146040Stjr{ 1771146040Stjr unsigned char c; 1772146040Stjr 1773146040Stjr if (re_string_eoi (input)) 1774146040Stjr { 1775146040Stjr token->type = END_OF_RE; 1776146040Stjr return 0; 1777146040Stjr } 1778146040Stjr 1779146040Stjr c = re_string_peek_byte (input, 0); 1780146040Stjr token->opr.c = c; 1781146040Stjr 1782146040Stjr token->word_char = 0; 1783146040Stjr#ifdef RE_ENABLE_I18N 1784146040Stjr token->mb_partial = 0; 1785146040Stjr if (input->mb_cur_max > 1 && 1786146040Stjr !re_string_first_byte (input, re_string_cur_idx (input))) 1787146040Stjr { 1788146040Stjr token->type = CHARACTER; 1789146040Stjr token->mb_partial = 1; 1790146040Stjr return 1; 1791146040Stjr } 1792146040Stjr#endif 1793146040Stjr if (c == '\\') 1794146040Stjr { 1795146040Stjr unsigned char c2; 1796146040Stjr if (re_string_cur_idx (input) + 1 >= re_string_length (input)) 1797146040Stjr { 1798146040Stjr token->type = BACK_SLASH; 1799146040Stjr return 1; 1800146040Stjr } 1801146040Stjr 1802146040Stjr c2 = re_string_peek_byte_case (input, 1); 1803146040Stjr token->opr.c = c2; 1804146040Stjr token->type = CHARACTER; 1805146040Stjr#ifdef RE_ENABLE_I18N 1806146040Stjr if (input->mb_cur_max > 1) 1807146040Stjr { 1808146040Stjr wint_t wc = re_string_wchar_at (input, 1809146040Stjr re_string_cur_idx (input) + 1); 1810146040Stjr token->word_char = IS_WIDE_WORD_CHAR (wc) != 0; 1811146040Stjr } 1812146040Stjr else 1813146040Stjr#endif 1814146040Stjr token->word_char = IS_WORD_CHAR (c2) != 0; 1815146040Stjr 1816146040Stjr switch (c2) 1817146040Stjr { 1818146040Stjr case '|': 1819146040Stjr if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR)) 1820146040Stjr token->type = OP_ALT; 1821146040Stjr break; 1822146040Stjr case '1': case '2': case '3': case '4': case '5': 1823146040Stjr case '6': case '7': case '8': case '9': 1824146040Stjr if (!(syntax & RE_NO_BK_REFS)) 1825146040Stjr { 1826146040Stjr token->type = OP_BACK_REF; 1827146040Stjr token->opr.idx = c2 - '1'; 1828146040Stjr } 1829146040Stjr break; 1830146040Stjr case '<': 1831146040Stjr if (!(syntax & RE_NO_GNU_OPS)) 1832146040Stjr { 1833146040Stjr token->type = ANCHOR; 1834146040Stjr token->opr.ctx_type = WORD_FIRST; 1835146040Stjr } 1836146040Stjr break; 1837146040Stjr case '>': 1838146040Stjr if (!(syntax & RE_NO_GNU_OPS)) 1839146040Stjr { 1840146040Stjr token->type = ANCHOR; 1841146040Stjr token->opr.ctx_type = WORD_LAST; 1842146040Stjr } 1843146040Stjr break; 1844146040Stjr case 'b': 1845146040Stjr if (!(syntax & RE_NO_GNU_OPS)) 1846146040Stjr { 1847146040Stjr token->type = ANCHOR; 1848146040Stjr token->opr.ctx_type = WORD_DELIM; 1849146040Stjr } 1850146040Stjr break; 1851146040Stjr case 'B': 1852146040Stjr if (!(syntax & RE_NO_GNU_OPS)) 1853146040Stjr { 1854146040Stjr token->type = ANCHOR; 1855146040Stjr token->opr.ctx_type = NOT_WORD_DELIM; 1856146040Stjr } 1857146040Stjr break; 1858146040Stjr case 'w': 1859146040Stjr if (!(syntax & RE_NO_GNU_OPS)) 1860146040Stjr token->type = OP_WORD; 1861146040Stjr break; 1862146040Stjr case 'W': 1863146040Stjr if (!(syntax & RE_NO_GNU_OPS)) 1864146040Stjr token->type = OP_NOTWORD; 1865146040Stjr break; 1866146040Stjr case 's': 1867146040Stjr if (!(syntax & RE_NO_GNU_OPS)) 1868146040Stjr token->type = OP_SPACE; 1869146040Stjr break; 1870146040Stjr case 'S': 1871146040Stjr if (!(syntax & RE_NO_GNU_OPS)) 1872146040Stjr token->type = OP_NOTSPACE; 1873146040Stjr break; 1874146040Stjr case '`': 1875146040Stjr if (!(syntax & RE_NO_GNU_OPS)) 1876146040Stjr { 1877146040Stjr token->type = ANCHOR; 1878146040Stjr token->opr.ctx_type = BUF_FIRST; 1879146040Stjr } 1880146040Stjr break; 1881146040Stjr case '\'': 1882146040Stjr if (!(syntax & RE_NO_GNU_OPS)) 1883146040Stjr { 1884146040Stjr token->type = ANCHOR; 1885146040Stjr token->opr.ctx_type = BUF_LAST; 1886146040Stjr } 1887146040Stjr break; 1888146040Stjr case '(': 1889146040Stjr if (!(syntax & RE_NO_BK_PARENS)) 1890146040Stjr token->type = OP_OPEN_SUBEXP; 1891146040Stjr break; 1892146040Stjr case ')': 1893146040Stjr if (!(syntax & RE_NO_BK_PARENS)) 1894146040Stjr token->type = OP_CLOSE_SUBEXP; 1895146040Stjr break; 1896146040Stjr case '+': 1897146040Stjr if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM)) 1898146040Stjr token->type = OP_DUP_PLUS; 1899146040Stjr break; 1900146040Stjr case '?': 1901146040Stjr if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM)) 1902146040Stjr token->type = OP_DUP_QUESTION; 1903146040Stjr break; 1904146040Stjr case '{': 1905146040Stjr if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES))) 1906146040Stjr token->type = OP_OPEN_DUP_NUM; 1907146040Stjr break; 1908146040Stjr case '}': 1909146040Stjr if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES))) 1910146040Stjr token->type = OP_CLOSE_DUP_NUM; 1911146040Stjr break; 1912146040Stjr default: 1913146040Stjr break; 1914146040Stjr } 1915146040Stjr return 2; 1916146040Stjr } 1917146040Stjr 1918146040Stjr token->type = CHARACTER; 1919146040Stjr#ifdef RE_ENABLE_I18N 1920146040Stjr if (input->mb_cur_max > 1) 1921146040Stjr { 1922146040Stjr wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input)); 1923146040Stjr token->word_char = IS_WIDE_WORD_CHAR (wc) != 0; 1924146040Stjr } 1925146040Stjr else 1926146040Stjr#endif 1927146040Stjr token->word_char = IS_WORD_CHAR (token->opr.c); 1928146040Stjr 1929146040Stjr switch (c) 1930146040Stjr { 1931146040Stjr case '\n': 1932146040Stjr if (syntax & RE_NEWLINE_ALT) 1933146040Stjr token->type = OP_ALT; 1934146040Stjr break; 1935146040Stjr case '|': 1936146040Stjr if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR)) 1937146040Stjr token->type = OP_ALT; 1938146040Stjr break; 1939146040Stjr case '*': 1940146040Stjr token->type = OP_DUP_ASTERISK; 1941146040Stjr break; 1942146040Stjr case '+': 1943146040Stjr if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM)) 1944146040Stjr token->type = OP_DUP_PLUS; 1945146040Stjr break; 1946146040Stjr case '?': 1947146040Stjr if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM)) 1948146040Stjr token->type = OP_DUP_QUESTION; 1949146040Stjr break; 1950146040Stjr case '{': 1951146040Stjr if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) 1952146040Stjr token->type = OP_OPEN_DUP_NUM; 1953146040Stjr break; 1954146040Stjr case '}': 1955146040Stjr if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) 1956146040Stjr token->type = OP_CLOSE_DUP_NUM; 1957146040Stjr break; 1958146040Stjr case '(': 1959146040Stjr if (syntax & RE_NO_BK_PARENS) 1960146040Stjr token->type = OP_OPEN_SUBEXP; 1961146040Stjr break; 1962146040Stjr case ')': 1963146040Stjr if (syntax & RE_NO_BK_PARENS) 1964146040Stjr token->type = OP_CLOSE_SUBEXP; 1965146040Stjr break; 1966146040Stjr case '[': 1967146040Stjr token->type = OP_OPEN_BRACKET; 1968146040Stjr break; 1969146040Stjr case '.': 1970146040Stjr token->type = OP_PERIOD; 1971146040Stjr break; 1972146040Stjr case '^': 1973146040Stjr if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) && 1974146040Stjr re_string_cur_idx (input) != 0) 1975146040Stjr { 1976146040Stjr char prev = re_string_peek_byte (input, -1); 1977146040Stjr if (!(syntax & RE_NEWLINE_ALT) || prev != '\n') 1978146040Stjr break; 1979146040Stjr } 1980146040Stjr token->type = ANCHOR; 1981146040Stjr token->opr.ctx_type = LINE_FIRST; 1982146040Stjr break; 1983146040Stjr case '$': 1984146040Stjr if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) && 1985146040Stjr re_string_cur_idx (input) + 1 != re_string_length (input)) 1986146040Stjr { 1987146040Stjr re_token_t next; 1988146040Stjr re_string_skip_bytes (input, 1); 1989146040Stjr peek_token (&next, input, syntax); 1990146040Stjr re_string_skip_bytes (input, -1); 1991146040Stjr if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP) 1992146040Stjr break; 1993146040Stjr } 1994146040Stjr token->type = ANCHOR; 1995146040Stjr token->opr.ctx_type = LINE_LAST; 1996146040Stjr break; 1997146040Stjr default: 1998146040Stjr break; 1999146040Stjr } 2000146040Stjr return 1; 2001146040Stjr} 2002146040Stjr 2003146040Stjr/* Peek a token from INPUT, and return the length of the token. 2004146040Stjr We must not use this function out of bracket expressions. */ 2005146040Stjr 2006146040Stjrstatic int 2007250724Sjkiminternal_function 2008250724Sjkimpeek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax) 2009146040Stjr{ 2010146040Stjr unsigned char c; 2011146040Stjr if (re_string_eoi (input)) 2012146040Stjr { 2013146040Stjr token->type = END_OF_RE; 2014146040Stjr return 0; 2015146040Stjr } 2016146040Stjr c = re_string_peek_byte (input, 0); 2017146040Stjr token->opr.c = c; 2018146040Stjr 2019146040Stjr#ifdef RE_ENABLE_I18N 2020146040Stjr if (input->mb_cur_max > 1 && 2021146040Stjr !re_string_first_byte (input, re_string_cur_idx (input))) 2022146040Stjr { 2023146040Stjr token->type = CHARACTER; 2024146040Stjr return 1; 2025146040Stjr } 2026146040Stjr#endif /* RE_ENABLE_I18N */ 2027146040Stjr 2028146040Stjr if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) 2029146040Stjr && re_string_cur_idx (input) + 1 < re_string_length (input)) 2030146040Stjr { 2031146040Stjr /* In this case, '\' escape a character. */ 2032146040Stjr unsigned char c2; 2033146040Stjr re_string_skip_bytes (input, 1); 2034146040Stjr c2 = re_string_peek_byte (input, 0); 2035146040Stjr token->opr.c = c2; 2036146040Stjr token->type = CHARACTER; 2037146040Stjr return 1; 2038146040Stjr } 2039146040Stjr if (c == '[') /* '[' is a special char in a bracket exps. */ 2040146040Stjr { 2041146040Stjr unsigned char c2; 2042146040Stjr int token_len; 2043146040Stjr if (re_string_cur_idx (input) + 1 < re_string_length (input)) 2044146040Stjr c2 = re_string_peek_byte (input, 1); 2045146040Stjr else 2046146040Stjr c2 = 0; 2047146040Stjr token->opr.c = c2; 2048146040Stjr token_len = 2; 2049146040Stjr switch (c2) 2050146040Stjr { 2051146040Stjr case '.': 2052146040Stjr token->type = OP_OPEN_COLL_ELEM; 2053146040Stjr break; 2054146040Stjr case '=': 2055146040Stjr token->type = OP_OPEN_EQUIV_CLASS; 2056146040Stjr break; 2057146040Stjr case ':': 2058146040Stjr if (syntax & RE_CHAR_CLASSES) 2059146040Stjr { 2060146040Stjr token->type = OP_OPEN_CHAR_CLASS; 2061146040Stjr break; 2062146040Stjr } 2063146040Stjr /* else fall through. */ 2064146040Stjr default: 2065146040Stjr token->type = CHARACTER; 2066146040Stjr token->opr.c = c; 2067146040Stjr token_len = 1; 2068146040Stjr break; 2069146040Stjr } 2070146040Stjr return token_len; 2071146040Stjr } 2072146040Stjr switch (c) 2073146040Stjr { 2074146040Stjr case '-': 2075146040Stjr token->type = OP_CHARSET_RANGE; 2076146040Stjr break; 2077146040Stjr case ']': 2078146040Stjr token->type = OP_CLOSE_BRACKET; 2079146040Stjr break; 2080146040Stjr case '^': 2081146040Stjr token->type = OP_NON_MATCH_LIST; 2082146040Stjr break; 2083146040Stjr default: 2084146040Stjr token->type = CHARACTER; 2085146040Stjr } 2086146040Stjr return 1; 2087146040Stjr} 2088146040Stjr 2089146040Stjr/* Functions for parser. */ 2090146040Stjr 2091146040Stjr/* Entry point of the parser. 2092146040Stjr Parse the regular expression REGEXP and return the structure tree. 2093146040Stjr If an error is occured, ERR is set by error code, and return NULL. 2094146040Stjr This function build the following tree, from regular expression <reg_exp>: 2095146040Stjr CAT 2096146040Stjr / \ 2097146040Stjr / \ 2098146040Stjr <reg_exp> EOR 2099146040Stjr 2100146040Stjr CAT means concatenation. 2101146040Stjr EOR means end of regular expression. */ 2102146040Stjr 2103146040Stjrstatic bin_tree_t * 2104250724Sjkimparse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax, 2105250724Sjkim reg_errcode_t *err) 2106146040Stjr{ 2107146040Stjr re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2108146040Stjr bin_tree_t *tree, *eor, *root; 2109146040Stjr re_token_t current_token; 2110146040Stjr dfa->syntax = syntax; 2111146040Stjr fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE); 2112146040Stjr tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err); 2113146040Stjr if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2114146040Stjr return NULL; 2115146040Stjr eor = create_tree (dfa, NULL, NULL, END_OF_RE); 2116146040Stjr if (tree != NULL) 2117146040Stjr root = create_tree (dfa, tree, eor, CONCAT); 2118146040Stjr else 2119146040Stjr root = eor; 2120146040Stjr if (BE (eor == NULL || root == NULL, 0)) 2121146040Stjr { 2122146040Stjr *err = REG_ESPACE; 2123146040Stjr return NULL; 2124146040Stjr } 2125146040Stjr return root; 2126146040Stjr} 2127146040Stjr 2128146040Stjr/* This function build the following tree, from regular expression 2129146040Stjr <branch1>|<branch2>: 2130146040Stjr ALT 2131146040Stjr / \ 2132146040Stjr / \ 2133146040Stjr <branch1> <branch2> 2134146040Stjr 2135146040Stjr ALT means alternative, which represents the operator `|'. */ 2136146040Stjr 2137146040Stjrstatic bin_tree_t * 2138250724Sjkimparse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, 2139250724Sjkim reg_syntax_t syntax, int nest, reg_errcode_t *err) 2140146040Stjr{ 2141146040Stjr re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2142146040Stjr bin_tree_t *tree, *branch = NULL; 2143146040Stjr tree = parse_branch (regexp, preg, token, syntax, nest, err); 2144146040Stjr if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2145146040Stjr return NULL; 2146146040Stjr 2147146040Stjr while (token->type == OP_ALT) 2148146040Stjr { 2149146040Stjr fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE); 2150146040Stjr if (token->type != OP_ALT && token->type != END_OF_RE 2151146040Stjr && (nest == 0 || token->type != OP_CLOSE_SUBEXP)) 2152146040Stjr { 2153146040Stjr branch = parse_branch (regexp, preg, token, syntax, nest, err); 2154146040Stjr if (BE (*err != REG_NOERROR && branch == NULL, 0)) 2155146040Stjr return NULL; 2156146040Stjr } 2157146040Stjr else 2158146040Stjr branch = NULL; 2159146040Stjr tree = create_tree (dfa, tree, branch, OP_ALT); 2160146040Stjr if (BE (tree == NULL, 0)) 2161146040Stjr { 2162146040Stjr *err = REG_ESPACE; 2163146040Stjr return NULL; 2164146040Stjr } 2165146040Stjr } 2166146040Stjr return tree; 2167146040Stjr} 2168146040Stjr 2169146040Stjr/* This function build the following tree, from regular expression 2170146040Stjr <exp1><exp2>: 2171146040Stjr CAT 2172146040Stjr / \ 2173146040Stjr / \ 2174146040Stjr <exp1> <exp2> 2175146040Stjr 2176146040Stjr CAT means concatenation. */ 2177146040Stjr 2178146040Stjrstatic bin_tree_t * 2179250724Sjkimparse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token, 2180250724Sjkim reg_syntax_t syntax, int nest, reg_errcode_t *err) 2181146040Stjr{ 2182146040Stjr bin_tree_t *tree, *exp; 2183146040Stjr re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2184146040Stjr tree = parse_expression (regexp, preg, token, syntax, nest, err); 2185146040Stjr if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2186146040Stjr return NULL; 2187146040Stjr 2188146040Stjr while (token->type != OP_ALT && token->type != END_OF_RE 2189146040Stjr && (nest == 0 || token->type != OP_CLOSE_SUBEXP)) 2190146040Stjr { 2191146040Stjr exp = parse_expression (regexp, preg, token, syntax, nest, err); 2192146040Stjr if (BE (*err != REG_NOERROR && exp == NULL, 0)) 2193146040Stjr { 2194250724Sjkim if (tree != NULL) 2195250724Sjkim postorder (tree, free_tree, NULL); 2196146040Stjr return NULL; 2197146040Stjr } 2198146040Stjr if (tree != NULL && exp != NULL) 2199146040Stjr { 2200250724Sjkim bin_tree_t *newtree = create_tree (dfa, tree, exp, CONCAT); 2201250724Sjkim if (newtree == NULL) 2202146040Stjr { 2203250724Sjkim postorder (exp, free_tree, NULL); 2204250724Sjkim postorder (tree, free_tree, NULL); 2205146040Stjr *err = REG_ESPACE; 2206146040Stjr return NULL; 2207146040Stjr } 2208250724Sjkim tree = newtree; 2209146040Stjr } 2210146040Stjr else if (tree == NULL) 2211146040Stjr tree = exp; 2212146040Stjr /* Otherwise exp == NULL, we don't need to create new tree. */ 2213146040Stjr } 2214146040Stjr return tree; 2215146040Stjr} 2216146040Stjr 2217146040Stjr/* This function build the following tree, from regular expression a*: 2218146040Stjr * 2219146040Stjr | 2220146040Stjr a 2221146040Stjr*/ 2222146040Stjr 2223146040Stjrstatic bin_tree_t * 2224250724Sjkimparse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token, 2225250724Sjkim reg_syntax_t syntax, int nest, reg_errcode_t *err) 2226146040Stjr{ 2227146040Stjr re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2228146040Stjr bin_tree_t *tree; 2229146040Stjr switch (token->type) 2230146040Stjr { 2231146040Stjr case CHARACTER: 2232146040Stjr tree = create_token_tree (dfa, NULL, NULL, token); 2233146040Stjr if (BE (tree == NULL, 0)) 2234146040Stjr { 2235146040Stjr *err = REG_ESPACE; 2236146040Stjr return NULL; 2237146040Stjr } 2238146040Stjr#ifdef RE_ENABLE_I18N 2239146040Stjr if (dfa->mb_cur_max > 1) 2240146040Stjr { 2241146040Stjr while (!re_string_eoi (regexp) 2242146040Stjr && !re_string_first_byte (regexp, re_string_cur_idx (regexp))) 2243146040Stjr { 2244146040Stjr bin_tree_t *mbc_remain; 2245146040Stjr fetch_token (token, regexp, syntax); 2246146040Stjr mbc_remain = create_token_tree (dfa, NULL, NULL, token); 2247146040Stjr tree = create_tree (dfa, tree, mbc_remain, CONCAT); 2248146040Stjr if (BE (mbc_remain == NULL || tree == NULL, 0)) 2249146040Stjr { 2250146040Stjr *err = REG_ESPACE; 2251146040Stjr return NULL; 2252146040Stjr } 2253146040Stjr } 2254146040Stjr } 2255146040Stjr#endif 2256146040Stjr break; 2257146040Stjr case OP_OPEN_SUBEXP: 2258146040Stjr tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err); 2259146040Stjr if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2260146040Stjr return NULL; 2261146040Stjr break; 2262146040Stjr case OP_OPEN_BRACKET: 2263146040Stjr tree = parse_bracket_exp (regexp, dfa, token, syntax, err); 2264146040Stjr if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2265146040Stjr return NULL; 2266146040Stjr break; 2267146040Stjr case OP_BACK_REF: 2268146040Stjr if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1)) 2269146040Stjr { 2270146040Stjr *err = REG_ESUBREG; 2271146040Stjr return NULL; 2272146040Stjr } 2273146040Stjr dfa->used_bkref_map |= 1 << token->opr.idx; 2274146040Stjr tree = create_token_tree (dfa, NULL, NULL, token); 2275146040Stjr if (BE (tree == NULL, 0)) 2276146040Stjr { 2277146040Stjr *err = REG_ESPACE; 2278146040Stjr return NULL; 2279146040Stjr } 2280146040Stjr ++dfa->nbackref; 2281146040Stjr dfa->has_mb_node = 1; 2282146040Stjr break; 2283146040Stjr case OP_OPEN_DUP_NUM: 2284146040Stjr if (syntax & RE_CONTEXT_INVALID_DUP) 2285146040Stjr { 2286146040Stjr *err = REG_BADRPT; 2287146040Stjr return NULL; 2288146040Stjr } 2289146040Stjr /* FALLTHROUGH */ 2290146040Stjr case OP_DUP_ASTERISK: 2291146040Stjr case OP_DUP_PLUS: 2292146040Stjr case OP_DUP_QUESTION: 2293146040Stjr if (syntax & RE_CONTEXT_INVALID_OPS) 2294146040Stjr { 2295146040Stjr *err = REG_BADRPT; 2296146040Stjr return NULL; 2297146040Stjr } 2298146040Stjr else if (syntax & RE_CONTEXT_INDEP_OPS) 2299146040Stjr { 2300146040Stjr fetch_token (token, regexp, syntax); 2301146040Stjr return parse_expression (regexp, preg, token, syntax, nest, err); 2302146040Stjr } 2303146040Stjr /* else fall through */ 2304146040Stjr case OP_CLOSE_SUBEXP: 2305146040Stjr if ((token->type == OP_CLOSE_SUBEXP) && 2306146040Stjr !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)) 2307146040Stjr { 2308146040Stjr *err = REG_ERPAREN; 2309146040Stjr return NULL; 2310146040Stjr } 2311146040Stjr /* else fall through */ 2312146040Stjr case OP_CLOSE_DUP_NUM: 2313146040Stjr /* We treat it as a normal character. */ 2314146040Stjr 2315146040Stjr /* Then we can these characters as normal characters. */ 2316146040Stjr token->type = CHARACTER; 2317146040Stjr /* mb_partial and word_char bits should be initialized already 2318146040Stjr by peek_token. */ 2319146040Stjr tree = create_token_tree (dfa, NULL, NULL, token); 2320146040Stjr if (BE (tree == NULL, 0)) 2321146040Stjr { 2322146040Stjr *err = REG_ESPACE; 2323146040Stjr return NULL; 2324146040Stjr } 2325146040Stjr break; 2326146040Stjr case ANCHOR: 2327146040Stjr if ((token->opr.ctx_type 2328146040Stjr & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST)) 2329146040Stjr && dfa->word_ops_used == 0) 2330146040Stjr init_word_char (dfa); 2331146040Stjr if (token->opr.ctx_type == WORD_DELIM 2332250724Sjkim || token->opr.ctx_type == NOT_WORD_DELIM) 2333146040Stjr { 2334146040Stjr bin_tree_t *tree_first, *tree_last; 2335146040Stjr if (token->opr.ctx_type == WORD_DELIM) 2336146040Stjr { 2337146040Stjr token->opr.ctx_type = WORD_FIRST; 2338146040Stjr tree_first = create_token_tree (dfa, NULL, NULL, token); 2339146040Stjr token->opr.ctx_type = WORD_LAST; 2340250724Sjkim } 2341250724Sjkim else 2342250724Sjkim { 2343146040Stjr token->opr.ctx_type = INSIDE_WORD; 2344146040Stjr tree_first = create_token_tree (dfa, NULL, NULL, token); 2345146040Stjr token->opr.ctx_type = INSIDE_NOTWORD; 2346250724Sjkim } 2347146040Stjr tree_last = create_token_tree (dfa, NULL, NULL, token); 2348146040Stjr tree = create_tree (dfa, tree_first, tree_last, OP_ALT); 2349146040Stjr if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0)) 2350146040Stjr { 2351146040Stjr *err = REG_ESPACE; 2352146040Stjr return NULL; 2353146040Stjr } 2354146040Stjr } 2355146040Stjr else 2356146040Stjr { 2357146040Stjr tree = create_token_tree (dfa, NULL, NULL, token); 2358146040Stjr if (BE (tree == NULL, 0)) 2359146040Stjr { 2360146040Stjr *err = REG_ESPACE; 2361146040Stjr return NULL; 2362146040Stjr } 2363146040Stjr } 2364146040Stjr /* We must return here, since ANCHORs can't be followed 2365146040Stjr by repetition operators. 2366146040Stjr eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>", 2367146040Stjr it must not be "<ANCHOR(^)><REPEAT(*)>". */ 2368146040Stjr fetch_token (token, regexp, syntax); 2369146040Stjr return tree; 2370146040Stjr case OP_PERIOD: 2371146040Stjr tree = create_token_tree (dfa, NULL, NULL, token); 2372146040Stjr if (BE (tree == NULL, 0)) 2373146040Stjr { 2374146040Stjr *err = REG_ESPACE; 2375146040Stjr return NULL; 2376146040Stjr } 2377146040Stjr if (dfa->mb_cur_max > 1) 2378146040Stjr dfa->has_mb_node = 1; 2379146040Stjr break; 2380146040Stjr case OP_WORD: 2381146040Stjr case OP_NOTWORD: 2382146040Stjr tree = build_charclass_op (dfa, regexp->trans, 2383146040Stjr (const unsigned char *) "alnum", 2384146040Stjr (const unsigned char *) "_", 2385146040Stjr token->type == OP_NOTWORD, err); 2386146040Stjr if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2387146040Stjr return NULL; 2388146040Stjr break; 2389146040Stjr case OP_SPACE: 2390146040Stjr case OP_NOTSPACE: 2391146040Stjr tree = build_charclass_op (dfa, regexp->trans, 2392146040Stjr (const unsigned char *) "space", 2393146040Stjr (const unsigned char *) "", 2394146040Stjr token->type == OP_NOTSPACE, err); 2395146040Stjr if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2396146040Stjr return NULL; 2397146040Stjr break; 2398146040Stjr case OP_ALT: 2399146040Stjr case END_OF_RE: 2400146040Stjr return NULL; 2401146040Stjr case BACK_SLASH: 2402146040Stjr *err = REG_EESCAPE; 2403146040Stjr return NULL; 2404146040Stjr default: 2405146040Stjr /* Must not happen? */ 2406146040Stjr#ifdef DEBUG 2407146040Stjr assert (0); 2408146040Stjr#endif 2409146040Stjr return NULL; 2410146040Stjr } 2411146040Stjr fetch_token (token, regexp, syntax); 2412146040Stjr 2413146040Stjr while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS 2414146040Stjr || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM) 2415146040Stjr { 2416146040Stjr tree = parse_dup_op (tree, regexp, dfa, token, syntax, err); 2417146040Stjr if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2418146040Stjr return NULL; 2419146040Stjr /* In BRE consecutive duplications are not allowed. */ 2420146040Stjr if ((syntax & RE_CONTEXT_INVALID_DUP) 2421146040Stjr && (token->type == OP_DUP_ASTERISK 2422146040Stjr || token->type == OP_OPEN_DUP_NUM)) 2423146040Stjr { 2424146040Stjr *err = REG_BADRPT; 2425146040Stjr return NULL; 2426146040Stjr } 2427146040Stjr } 2428146040Stjr 2429146040Stjr return tree; 2430146040Stjr} 2431146040Stjr 2432146040Stjr/* This function build the following tree, from regular expression 2433146040Stjr (<reg_exp>): 2434146040Stjr SUBEXP 2435146040Stjr | 2436146040Stjr <reg_exp> 2437146040Stjr*/ 2438146040Stjr 2439146040Stjrstatic bin_tree_t * 2440250724Sjkimparse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, 2441250724Sjkim reg_syntax_t syntax, int nest, reg_errcode_t *err) 2442146040Stjr{ 2443146040Stjr re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2444146040Stjr bin_tree_t *tree; 2445146040Stjr size_t cur_nsub; 2446146040Stjr cur_nsub = preg->re_nsub++; 2447146040Stjr 2448146040Stjr fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE); 2449146040Stjr 2450146040Stjr /* The subexpression may be a null string. */ 2451146040Stjr if (token->type == OP_CLOSE_SUBEXP) 2452146040Stjr tree = NULL; 2453146040Stjr else 2454146040Stjr { 2455146040Stjr tree = parse_reg_exp (regexp, preg, token, syntax, nest, err); 2456146040Stjr if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0)) 2457250724Sjkim { 2458250724Sjkim if (tree != NULL) 2459250724Sjkim postorder (tree, free_tree, NULL); 2460250724Sjkim *err = REG_EPAREN; 2461250724Sjkim } 2462146040Stjr if (BE (*err != REG_NOERROR, 0)) 2463146040Stjr return NULL; 2464146040Stjr } 2465146040Stjr 2466250724Sjkim if (cur_nsub <= '9' - '1') 2467250724Sjkim dfa->completed_bkref_map |= 1 << cur_nsub; 2468250724Sjkim 2469146040Stjr tree = create_tree (dfa, tree, NULL, SUBEXP); 2470146040Stjr if (BE (tree == NULL, 0)) 2471146040Stjr { 2472146040Stjr *err = REG_ESPACE; 2473146040Stjr return NULL; 2474146040Stjr } 2475146040Stjr tree->token.opr.idx = cur_nsub; 2476146040Stjr return tree; 2477146040Stjr} 2478146040Stjr 2479146040Stjr/* This function parse repetition operators like "*", "+", "{1,3}" etc. */ 2480146040Stjr 2481146040Stjrstatic bin_tree_t * 2482250724Sjkimparse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa, 2483250724Sjkim re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err) 2484146040Stjr{ 2485146040Stjr bin_tree_t *tree = NULL, *old_tree = NULL; 2486146040Stjr int i, start, end, start_idx = re_string_cur_idx (regexp); 2487146040Stjr re_token_t start_token = *token; 2488146040Stjr 2489146040Stjr if (token->type == OP_OPEN_DUP_NUM) 2490146040Stjr { 2491146040Stjr end = 0; 2492146040Stjr start = fetch_number (regexp, token, syntax); 2493146040Stjr if (start == -1) 2494146040Stjr { 2495146040Stjr if (token->type == CHARACTER && token->opr.c == ',') 2496146040Stjr start = 0; /* We treat "{,m}" as "{0,m}". */ 2497146040Stjr else 2498146040Stjr { 2499146040Stjr *err = REG_BADBR; /* <re>{} is invalid. */ 2500146040Stjr return NULL; 2501146040Stjr } 2502146040Stjr } 2503146040Stjr if (BE (start != -2, 1)) 2504146040Stjr { 2505146040Stjr /* We treat "{n}" as "{n,n}". */ 2506146040Stjr end = ((token->type == OP_CLOSE_DUP_NUM) ? start 2507146040Stjr : ((token->type == CHARACTER && token->opr.c == ',') 2508146040Stjr ? fetch_number (regexp, token, syntax) : -2)); 2509146040Stjr } 2510146040Stjr if (BE (start == -2 || end == -2, 0)) 2511146040Stjr { 2512146040Stjr /* Invalid sequence. */ 2513146040Stjr if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0)) 2514146040Stjr { 2515146040Stjr if (token->type == END_OF_RE) 2516146040Stjr *err = REG_EBRACE; 2517146040Stjr else 2518146040Stjr *err = REG_BADBR; 2519146040Stjr 2520146040Stjr return NULL; 2521146040Stjr } 2522146040Stjr 2523146040Stjr /* If the syntax bit is set, rollback. */ 2524146040Stjr re_string_set_index (regexp, start_idx); 2525146040Stjr *token = start_token; 2526146040Stjr token->type = CHARACTER; 2527146040Stjr /* mb_partial and word_char bits should be already initialized by 2528146040Stjr peek_token. */ 2529146040Stjr return elem; 2530146040Stjr } 2531146040Stjr 2532250724Sjkim if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0)) 2533146040Stjr { 2534146040Stjr /* First number greater than second. */ 2535146040Stjr *err = REG_BADBR; 2536146040Stjr return NULL; 2537146040Stjr } 2538146040Stjr } 2539146040Stjr else 2540146040Stjr { 2541146040Stjr start = (token->type == OP_DUP_PLUS) ? 1 : 0; 2542146040Stjr end = (token->type == OP_DUP_QUESTION) ? 1 : -1; 2543146040Stjr } 2544146040Stjr 2545146040Stjr fetch_token (token, regexp, syntax); 2546146040Stjr 2547146040Stjr if (BE (elem == NULL, 0)) 2548146040Stjr return NULL; 2549146040Stjr if (BE (start == 0 && end == 0, 0)) 2550146040Stjr { 2551146040Stjr postorder (elem, free_tree, NULL); 2552146040Stjr return NULL; 2553146040Stjr } 2554146040Stjr 2555146040Stjr /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */ 2556146040Stjr if (BE (start > 0, 0)) 2557146040Stjr { 2558146040Stjr tree = elem; 2559146040Stjr for (i = 2; i <= start; ++i) 2560146040Stjr { 2561146040Stjr elem = duplicate_tree (elem, dfa); 2562146040Stjr tree = create_tree (dfa, tree, elem, CONCAT); 2563146040Stjr if (BE (elem == NULL || tree == NULL, 0)) 2564146040Stjr goto parse_dup_op_espace; 2565146040Stjr } 2566146040Stjr 2567146040Stjr if (start == end) 2568146040Stjr return tree; 2569146040Stjr 2570146040Stjr /* Duplicate ELEM before it is marked optional. */ 2571146040Stjr elem = duplicate_tree (elem, dfa); 2572146040Stjr old_tree = tree; 2573146040Stjr } 2574146040Stjr else 2575146040Stjr old_tree = NULL; 2576146040Stjr 2577146040Stjr if (elem->token.type == SUBEXP) 2578146040Stjr postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx); 2579146040Stjr 2580146040Stjr tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT)); 2581146040Stjr if (BE (tree == NULL, 0)) 2582146040Stjr goto parse_dup_op_espace; 2583146040Stjr 2584146040Stjr /* This loop is actually executed only when end != -1, 2585146040Stjr to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have 2586146040Stjr already created the start+1-th copy. */ 2587146040Stjr for (i = start + 2; i <= end; ++i) 2588146040Stjr { 2589146040Stjr elem = duplicate_tree (elem, dfa); 2590146040Stjr tree = create_tree (dfa, tree, elem, CONCAT); 2591146040Stjr if (BE (elem == NULL || tree == NULL, 0)) 2592250724Sjkim goto parse_dup_op_espace; 2593146040Stjr 2594146040Stjr tree = create_tree (dfa, tree, NULL, OP_ALT); 2595146040Stjr if (BE (tree == NULL, 0)) 2596250724Sjkim goto parse_dup_op_espace; 2597146040Stjr } 2598146040Stjr 2599146040Stjr if (old_tree) 2600146040Stjr tree = create_tree (dfa, old_tree, tree, CONCAT); 2601146040Stjr 2602146040Stjr return tree; 2603146040Stjr 2604146040Stjr parse_dup_op_espace: 2605146040Stjr *err = REG_ESPACE; 2606146040Stjr return NULL; 2607146040Stjr} 2608146040Stjr 2609146040Stjr/* Size of the names for collating symbol/equivalence_class/character_class. 2610146040Stjr I'm not sure, but maybe enough. */ 2611146040Stjr#define BRACKET_NAME_BUF_SIZE 32 2612146040Stjr 2613146040Stjr#ifndef _LIBC 2614146040Stjr /* Local function for parse_bracket_exp only used in case of NOT _LIBC. 2615146040Stjr Build the range expression which starts from START_ELEM, and ends 2616146040Stjr at END_ELEM. The result are written to MBCSET and SBCSET. 2617146040Stjr RANGE_ALLOC is the allocated size of mbcset->range_starts, and 2618146040Stjr mbcset->range_ends, is a pointer argument sinse we may 2619146040Stjr update it. */ 2620146040Stjr 2621146040Stjrstatic reg_errcode_t 2622250724Sjkiminternal_function 2623146040Stjr# ifdef RE_ENABLE_I18N 2624250724Sjkimbuild_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc, 2625250724Sjkim bracket_elem_t *start_elem, bracket_elem_t *end_elem) 2626146040Stjr# else /* not RE_ENABLE_I18N */ 2627250724Sjkimbuild_range_exp (bitset_t sbcset, bracket_elem_t *start_elem, 2628250724Sjkim bracket_elem_t *end_elem) 2629146040Stjr# endif /* not RE_ENABLE_I18N */ 2630146040Stjr{ 2631146040Stjr unsigned int start_ch, end_ch; 2632146040Stjr /* Equivalence Classes and Character Classes can't be a range start/end. */ 2633146040Stjr if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS 2634146040Stjr || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS, 2635146040Stjr 0)) 2636146040Stjr return REG_ERANGE; 2637146040Stjr 2638146040Stjr /* We can handle no multi character collating elements without libc 2639146040Stjr support. */ 2640146040Stjr if (BE ((start_elem->type == COLL_SYM 2641146040Stjr && strlen ((char *) start_elem->opr.name) > 1) 2642146040Stjr || (end_elem->type == COLL_SYM 2643146040Stjr && strlen ((char *) end_elem->opr.name) > 1), 0)) 2644146040Stjr return REG_ECOLLATE; 2645146040Stjr 2646146040Stjr# ifdef RE_ENABLE_I18N 2647146040Stjr { 2648250724Sjkim wchar_t wc; 2649250724Sjkim wint_t start_wc; 2650250724Sjkim wint_t end_wc; 2651146040Stjr wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'}; 2652146040Stjr 2653146040Stjr start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch 2654146040Stjr : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0] 2655146040Stjr : 0)); 2656146040Stjr end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch 2657146040Stjr : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0] 2658146040Stjr : 0)); 2659146040Stjr start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM) 2660146040Stjr ? __btowc (start_ch) : start_elem->opr.wch); 2661146040Stjr end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM) 2662146040Stjr ? __btowc (end_ch) : end_elem->opr.wch); 2663146040Stjr if (start_wc == WEOF || end_wc == WEOF) 2664146040Stjr return REG_ECOLLATE; 2665146040Stjr cmp_buf[0] = start_wc; 2666146040Stjr cmp_buf[4] = end_wc; 2667146040Stjr if (wcscoll (cmp_buf, cmp_buf + 4) > 0) 2668146040Stjr return REG_ERANGE; 2669146040Stjr 2670146040Stjr /* Got valid collation sequence values, add them as a new entry. 2671146040Stjr However, for !_LIBC we have no collation elements: if the 2672146040Stjr character set is single byte, the single byte character set 2673146040Stjr that we build below suffices. parse_bracket_exp passes 2674146040Stjr no MBCSET if dfa->mb_cur_max == 1. */ 2675146040Stjr if (mbcset) 2676146040Stjr { 2677250724Sjkim /* Check the space of the arrays. */ 2678250724Sjkim if (BE (*range_alloc == mbcset->nranges, 0)) 2679250724Sjkim { 2680146040Stjr /* There is not enough space, need realloc. */ 2681146040Stjr wchar_t *new_array_start, *new_array_end; 2682146040Stjr int new_nranges; 2683146040Stjr 2684146040Stjr /* +1 in case of mbcset->nranges is 0. */ 2685146040Stjr new_nranges = 2 * mbcset->nranges + 1; 2686146040Stjr /* Use realloc since mbcset->range_starts and mbcset->range_ends 2687146040Stjr are NULL if *range_alloc == 0. */ 2688146040Stjr new_array_start = re_realloc (mbcset->range_starts, wchar_t, 2689250724Sjkim new_nranges); 2690146040Stjr new_array_end = re_realloc (mbcset->range_ends, wchar_t, 2691250724Sjkim new_nranges); 2692146040Stjr 2693146040Stjr if (BE (new_array_start == NULL || new_array_end == NULL, 0)) 2694146040Stjr return REG_ESPACE; 2695146040Stjr 2696146040Stjr mbcset->range_starts = new_array_start; 2697146040Stjr mbcset->range_ends = new_array_end; 2698146040Stjr *range_alloc = new_nranges; 2699250724Sjkim } 2700146040Stjr 2701250724Sjkim mbcset->range_starts[mbcset->nranges] = start_wc; 2702250724Sjkim mbcset->range_ends[mbcset->nranges++] = end_wc; 2703146040Stjr } 2704146040Stjr 2705146040Stjr /* Build the table for single byte characters. */ 2706146040Stjr for (wc = 0; wc < SBC_MAX; ++wc) 2707146040Stjr { 2708146040Stjr cmp_buf[2] = wc; 2709146040Stjr if (wcscoll (cmp_buf, cmp_buf + 2) <= 0 2710146040Stjr && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0) 2711146040Stjr bitset_set (sbcset, wc); 2712146040Stjr } 2713146040Stjr } 2714146040Stjr# else /* not RE_ENABLE_I18N */ 2715146040Stjr { 2716146040Stjr unsigned int ch; 2717146040Stjr start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch 2718146040Stjr : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0] 2719146040Stjr : 0)); 2720146040Stjr end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch 2721146040Stjr : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0] 2722146040Stjr : 0)); 2723146040Stjr if (start_ch > end_ch) 2724146040Stjr return REG_ERANGE; 2725146040Stjr /* Build the table for single byte characters. */ 2726146040Stjr for (ch = 0; ch < SBC_MAX; ++ch) 2727146040Stjr if (start_ch <= ch && ch <= end_ch) 2728146040Stjr bitset_set (sbcset, ch); 2729146040Stjr } 2730146040Stjr# endif /* not RE_ENABLE_I18N */ 2731146040Stjr return REG_NOERROR; 2732146040Stjr} 2733146040Stjr#endif /* not _LIBC */ 2734146040Stjr 2735146040Stjr#ifndef _LIBC 2736146040Stjr/* Helper function for parse_bracket_exp only used in case of NOT _LIBC.. 2737146040Stjr Build the collating element which is represented by NAME. 2738146040Stjr The result are written to MBCSET and SBCSET. 2739146040Stjr COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a 2740146040Stjr pointer argument since we may update it. */ 2741146040Stjr 2742146040Stjrstatic reg_errcode_t 2743250724Sjkiminternal_function 2744146040Stjr# ifdef RE_ENABLE_I18N 2745250724Sjkimbuild_collating_symbol (bitset_t sbcset, re_charset_t *mbcset, 2746250724Sjkim int *coll_sym_alloc, const unsigned char *name) 2747146040Stjr# else /* not RE_ENABLE_I18N */ 2748250724Sjkimbuild_collating_symbol (bitset_t sbcset, const unsigned char *name) 2749146040Stjr# endif /* not RE_ENABLE_I18N */ 2750146040Stjr{ 2751146040Stjr size_t name_len = strlen ((const char *) name); 2752146040Stjr if (BE (name_len != 1, 0)) 2753146040Stjr return REG_ECOLLATE; 2754146040Stjr else 2755146040Stjr { 2756146040Stjr bitset_set (sbcset, name[0]); 2757146040Stjr return REG_NOERROR; 2758146040Stjr } 2759146040Stjr} 2760146040Stjr#endif /* not _LIBC */ 2761146040Stjr 2762146040Stjr/* This function parse bracket expression like "[abc]", "[a-c]", 2763146040Stjr "[[.a-a.]]" etc. */ 2764146040Stjr 2765146040Stjrstatic bin_tree_t * 2766250724Sjkimparse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, 2767250724Sjkim reg_syntax_t syntax, reg_errcode_t *err) 2768146040Stjr{ 2769146040Stjr#ifdef _LIBC 2770146040Stjr const unsigned char *collseqmb; 2771146040Stjr const char *collseqwc; 2772146040Stjr uint32_t nrules; 2773146040Stjr int32_t table_size; 2774146040Stjr const int32_t *symb_table; 2775146040Stjr const unsigned char *extra; 2776146040Stjr 2777146040Stjr /* Local function for parse_bracket_exp used in _LIBC environement. 2778146040Stjr Seek the collating symbol entry correspondings to NAME. 2779251435Sjkim Return the index of the symbol in the SYMB_TABLE, 2780251435Sjkim or -1 if not found. */ 2781146040Stjr 2782146040Stjr auto inline int32_t 2783146040Stjr __attribute ((always_inline)) 2784251435Sjkim seek_collating_symbol_entry (const unsigned char *name, size_t name_len) 2785146040Stjr { 2786251435Sjkim int32_t elem; 2787250724Sjkim 2788251435Sjkim for (elem = 0; elem < table_size; elem++) 2789251435Sjkim if (symb_table[2 * elem] != 0) 2790251435Sjkim { 2791251435Sjkim int32_t idx = symb_table[2 * elem + 1]; 2792251435Sjkim /* Skip the name of collating element name. */ 2793251435Sjkim idx += 1 + extra[idx]; 2794251435Sjkim if (/* Compare the length of the name. */ 2795251435Sjkim name_len == extra[idx] 2796251435Sjkim /* Compare the name. */ 2797251435Sjkim && memcmp (name, &extra[idx + 1], name_len) == 0) 2798251435Sjkim /* Yep, this is the entry. */ 2799251435Sjkim return elem; 2800251435Sjkim } 2801251435Sjkim return -1; 2802146040Stjr } 2803146040Stjr 2804250724Sjkim /* Local function for parse_bracket_exp used in _LIBC environment. 2805146040Stjr Look up the collation sequence value of BR_ELEM. 2806146040Stjr Return the value if succeeded, UINT_MAX otherwise. */ 2807146040Stjr 2808146040Stjr auto inline unsigned int 2809146040Stjr __attribute ((always_inline)) 2810251435Sjkim lookup_collation_sequence_value (bracket_elem_t *br_elem) 2811146040Stjr { 2812146040Stjr if (br_elem->type == SB_CHAR) 2813146040Stjr { 2814146040Stjr /* 2815146040Stjr if (MB_CUR_MAX == 1) 2816146040Stjr */ 2817146040Stjr if (nrules == 0) 2818146040Stjr return collseqmb[br_elem->opr.ch]; 2819146040Stjr else 2820146040Stjr { 2821146040Stjr wint_t wc = __btowc (br_elem->opr.ch); 2822146040Stjr return __collseq_table_lookup (collseqwc, wc); 2823146040Stjr } 2824146040Stjr } 2825146040Stjr else if (br_elem->type == MB_CHAR) 2826146040Stjr { 2827250724Sjkim if (nrules != 0) 2828250724Sjkim return __collseq_table_lookup (collseqwc, br_elem->opr.wch); 2829146040Stjr } 2830146040Stjr else if (br_elem->type == COLL_SYM) 2831146040Stjr { 2832146040Stjr size_t sym_name_len = strlen ((char *) br_elem->opr.name); 2833146040Stjr if (nrules != 0) 2834146040Stjr { 2835146040Stjr int32_t elem, idx; 2836146040Stjr elem = seek_collating_symbol_entry (br_elem->opr.name, 2837146040Stjr sym_name_len); 2838251435Sjkim if (elem != -1) 2839146040Stjr { 2840146040Stjr /* We found the entry. */ 2841146040Stjr idx = symb_table[2 * elem + 1]; 2842146040Stjr /* Skip the name of collating element name. */ 2843146040Stjr idx += 1 + extra[idx]; 2844146040Stjr /* Skip the byte sequence of the collating element. */ 2845146040Stjr idx += 1 + extra[idx]; 2846146040Stjr /* Adjust for the alignment. */ 2847146040Stjr idx = (idx + 3) & ~3; 2848146040Stjr /* Skip the multibyte collation sequence value. */ 2849146040Stjr idx += sizeof (unsigned int); 2850146040Stjr /* Skip the wide char sequence of the collating element. */ 2851146040Stjr idx += sizeof (unsigned int) * 2852146040Stjr (1 + *(unsigned int *) (extra + idx)); 2853146040Stjr /* Return the collation sequence value. */ 2854146040Stjr return *(unsigned int *) (extra + idx); 2855146040Stjr } 2856251435Sjkim else if (sym_name_len == 1) 2857146040Stjr { 2858146040Stjr /* No valid character. Match it as a single byte 2859146040Stjr character. */ 2860146040Stjr return collseqmb[br_elem->opr.name[0]]; 2861146040Stjr } 2862146040Stjr } 2863146040Stjr else if (sym_name_len == 1) 2864146040Stjr return collseqmb[br_elem->opr.name[0]]; 2865146040Stjr } 2866146040Stjr return UINT_MAX; 2867146040Stjr } 2868146040Stjr 2869146040Stjr /* Local function for parse_bracket_exp used in _LIBC environement. 2870146040Stjr Build the range expression which starts from START_ELEM, and ends 2871146040Stjr at END_ELEM. The result are written to MBCSET and SBCSET. 2872146040Stjr RANGE_ALLOC is the allocated size of mbcset->range_starts, and 2873146040Stjr mbcset->range_ends, is a pointer argument sinse we may 2874146040Stjr update it. */ 2875146040Stjr 2876146040Stjr auto inline reg_errcode_t 2877146040Stjr __attribute ((always_inline)) 2878251435Sjkim build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc, 2879251435Sjkim bracket_elem_t *start_elem, bracket_elem_t *end_elem) 2880146040Stjr { 2881146040Stjr unsigned int ch; 2882146040Stjr uint32_t start_collseq; 2883146040Stjr uint32_t end_collseq; 2884146040Stjr 2885146040Stjr /* Equivalence Classes and Character Classes can't be a range 2886146040Stjr start/end. */ 2887146040Stjr if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS 2888146040Stjr || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS, 2889146040Stjr 0)) 2890146040Stjr return REG_ERANGE; 2891146040Stjr 2892146040Stjr start_collseq = lookup_collation_sequence_value (start_elem); 2893146040Stjr end_collseq = lookup_collation_sequence_value (end_elem); 2894146040Stjr /* Check start/end collation sequence values. */ 2895146040Stjr if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0)) 2896146040Stjr return REG_ECOLLATE; 2897146040Stjr if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0)) 2898146040Stjr return REG_ERANGE; 2899146040Stjr 2900146040Stjr /* Got valid collation sequence values, add them as a new entry. 2901146040Stjr However, if we have no collation elements, and the character set 2902146040Stjr is single byte, the single byte character set that we 2903146040Stjr build below suffices. */ 2904146040Stjr if (nrules > 0 || dfa->mb_cur_max > 1) 2905146040Stjr { 2906250724Sjkim /* Check the space of the arrays. */ 2907250724Sjkim if (BE (*range_alloc == mbcset->nranges, 0)) 2908146040Stjr { 2909146040Stjr /* There is not enough space, need realloc. */ 2910146040Stjr uint32_t *new_array_start; 2911146040Stjr uint32_t *new_array_end; 2912146040Stjr int new_nranges; 2913146040Stjr 2914146040Stjr /* +1 in case of mbcset->nranges is 0. */ 2915146040Stjr new_nranges = 2 * mbcset->nranges + 1; 2916146040Stjr new_array_start = re_realloc (mbcset->range_starts, uint32_t, 2917146040Stjr new_nranges); 2918146040Stjr new_array_end = re_realloc (mbcset->range_ends, uint32_t, 2919250724Sjkim new_nranges); 2920146040Stjr 2921146040Stjr if (BE (new_array_start == NULL || new_array_end == NULL, 0)) 2922250724Sjkim return REG_ESPACE; 2923146040Stjr 2924146040Stjr mbcset->range_starts = new_array_start; 2925146040Stjr mbcset->range_ends = new_array_end; 2926146040Stjr *range_alloc = new_nranges; 2927146040Stjr } 2928146040Stjr 2929250724Sjkim mbcset->range_starts[mbcset->nranges] = start_collseq; 2930250724Sjkim mbcset->range_ends[mbcset->nranges++] = end_collseq; 2931146040Stjr } 2932146040Stjr 2933146040Stjr /* Build the table for single byte characters. */ 2934146040Stjr for (ch = 0; ch < SBC_MAX; ch++) 2935146040Stjr { 2936146040Stjr uint32_t ch_collseq; 2937146040Stjr /* 2938146040Stjr if (MB_CUR_MAX == 1) 2939146040Stjr */ 2940146040Stjr if (nrules == 0) 2941146040Stjr ch_collseq = collseqmb[ch]; 2942146040Stjr else 2943146040Stjr ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch)); 2944146040Stjr if (start_collseq <= ch_collseq && ch_collseq <= end_collseq) 2945146040Stjr bitset_set (sbcset, ch); 2946146040Stjr } 2947146040Stjr return REG_NOERROR; 2948146040Stjr } 2949146040Stjr 2950146040Stjr /* Local function for parse_bracket_exp used in _LIBC environement. 2951146040Stjr Build the collating element which is represented by NAME. 2952146040Stjr The result are written to MBCSET and SBCSET. 2953146040Stjr COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a 2954146040Stjr pointer argument sinse we may update it. */ 2955146040Stjr 2956146040Stjr auto inline reg_errcode_t 2957146040Stjr __attribute ((always_inline)) 2958251435Sjkim build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset, 2959251435Sjkim int *coll_sym_alloc, const unsigned char *name) 2960146040Stjr { 2961146040Stjr int32_t elem, idx; 2962146040Stjr size_t name_len = strlen ((const char *) name); 2963146040Stjr if (nrules != 0) 2964146040Stjr { 2965146040Stjr elem = seek_collating_symbol_entry (name, name_len); 2966251435Sjkim if (elem != -1) 2967146040Stjr { 2968146040Stjr /* We found the entry. */ 2969146040Stjr idx = symb_table[2 * elem + 1]; 2970146040Stjr /* Skip the name of collating element name. */ 2971146040Stjr idx += 1 + extra[idx]; 2972146040Stjr } 2973251435Sjkim else if (name_len == 1) 2974146040Stjr { 2975146040Stjr /* No valid character, treat it as a normal 2976146040Stjr character. */ 2977146040Stjr bitset_set (sbcset, name[0]); 2978146040Stjr return REG_NOERROR; 2979146040Stjr } 2980146040Stjr else 2981146040Stjr return REG_ECOLLATE; 2982146040Stjr 2983146040Stjr /* Got valid collation sequence, add it as a new entry. */ 2984146040Stjr /* Check the space of the arrays. */ 2985146040Stjr if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0)) 2986146040Stjr { 2987146040Stjr /* Not enough, realloc it. */ 2988146040Stjr /* +1 in case of mbcset->ncoll_syms is 0. */ 2989146040Stjr int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1; 2990146040Stjr /* Use realloc since mbcset->coll_syms is NULL 2991146040Stjr if *alloc == 0. */ 2992146040Stjr int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t, 2993146040Stjr new_coll_sym_alloc); 2994146040Stjr if (BE (new_coll_syms == NULL, 0)) 2995146040Stjr return REG_ESPACE; 2996146040Stjr mbcset->coll_syms = new_coll_syms; 2997146040Stjr *coll_sym_alloc = new_coll_sym_alloc; 2998146040Stjr } 2999146040Stjr mbcset->coll_syms[mbcset->ncoll_syms++] = idx; 3000146040Stjr return REG_NOERROR; 3001146040Stjr } 3002146040Stjr else 3003146040Stjr { 3004146040Stjr if (BE (name_len != 1, 0)) 3005146040Stjr return REG_ECOLLATE; 3006146040Stjr else 3007146040Stjr { 3008146040Stjr bitset_set (sbcset, name[0]); 3009146040Stjr return REG_NOERROR; 3010146040Stjr } 3011146040Stjr } 3012146040Stjr } 3013146040Stjr#endif 3014146040Stjr 3015146040Stjr re_token_t br_token; 3016146040Stjr re_bitset_ptr_t sbcset; 3017146040Stjr#ifdef RE_ENABLE_I18N 3018146040Stjr re_charset_t *mbcset; 3019146040Stjr int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0; 3020146040Stjr int equiv_class_alloc = 0, char_class_alloc = 0; 3021146040Stjr#endif /* not RE_ENABLE_I18N */ 3022146040Stjr int non_match = 0; 3023146040Stjr bin_tree_t *work_tree; 3024146040Stjr int token_len; 3025146040Stjr int first_round = 1; 3026146040Stjr#ifdef _LIBC 3027146040Stjr collseqmb = (const unsigned char *) 3028146040Stjr _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB); 3029146040Stjr nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3030146040Stjr if (nrules) 3031146040Stjr { 3032146040Stjr /* 3033146040Stjr if (MB_CUR_MAX > 1) 3034146040Stjr */ 3035250724Sjkim collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC); 3036146040Stjr table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB); 3037146040Stjr symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE, 3038146040Stjr _NL_COLLATE_SYMB_TABLEMB); 3039146040Stjr extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 3040146040Stjr _NL_COLLATE_SYMB_EXTRAMB); 3041146040Stjr } 3042146040Stjr#endif 3043250724Sjkim sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); 3044146040Stjr#ifdef RE_ENABLE_I18N 3045146040Stjr mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1); 3046146040Stjr#endif /* RE_ENABLE_I18N */ 3047146040Stjr#ifdef RE_ENABLE_I18N 3048146040Stjr if (BE (sbcset == NULL || mbcset == NULL, 0)) 3049146040Stjr#else 3050146040Stjr if (BE (sbcset == NULL, 0)) 3051146040Stjr#endif /* RE_ENABLE_I18N */ 3052146040Stjr { 3053250724Sjkim re_free (sbcset); 3054250724Sjkim#ifdef RE_ENABLE_I18N 3055250724Sjkim re_free (mbcset); 3056250724Sjkim#endif 3057146040Stjr *err = REG_ESPACE; 3058146040Stjr return NULL; 3059146040Stjr } 3060146040Stjr 3061146040Stjr token_len = peek_token_bracket (token, regexp, syntax); 3062146040Stjr if (BE (token->type == END_OF_RE, 0)) 3063146040Stjr { 3064146040Stjr *err = REG_BADPAT; 3065146040Stjr goto parse_bracket_exp_free_return; 3066146040Stjr } 3067146040Stjr if (token->type == OP_NON_MATCH_LIST) 3068146040Stjr { 3069146040Stjr#ifdef RE_ENABLE_I18N 3070146040Stjr mbcset->non_match = 1; 3071146040Stjr#endif /* not RE_ENABLE_I18N */ 3072146040Stjr non_match = 1; 3073146040Stjr if (syntax & RE_HAT_LISTS_NOT_NEWLINE) 3074250724Sjkim bitset_set (sbcset, '\n'); 3075146040Stjr re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 3076146040Stjr token_len = peek_token_bracket (token, regexp, syntax); 3077146040Stjr if (BE (token->type == END_OF_RE, 0)) 3078146040Stjr { 3079146040Stjr *err = REG_BADPAT; 3080146040Stjr goto parse_bracket_exp_free_return; 3081146040Stjr } 3082146040Stjr } 3083146040Stjr 3084146040Stjr /* We treat the first ']' as a normal character. */ 3085146040Stjr if (token->type == OP_CLOSE_BRACKET) 3086146040Stjr token->type = CHARACTER; 3087146040Stjr 3088146040Stjr while (1) 3089146040Stjr { 3090146040Stjr bracket_elem_t start_elem, end_elem; 3091146040Stjr unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE]; 3092146040Stjr unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE]; 3093146040Stjr reg_errcode_t ret; 3094146040Stjr int token_len2 = 0, is_range_exp = 0; 3095146040Stjr re_token_t token2; 3096146040Stjr 3097146040Stjr start_elem.opr.name = start_name_buf; 3098146040Stjr ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa, 3099146040Stjr syntax, first_round); 3100146040Stjr if (BE (ret != REG_NOERROR, 0)) 3101146040Stjr { 3102146040Stjr *err = ret; 3103146040Stjr goto parse_bracket_exp_free_return; 3104146040Stjr } 3105146040Stjr first_round = 0; 3106146040Stjr 3107146040Stjr /* Get information about the next token. We need it in any case. */ 3108146040Stjr token_len = peek_token_bracket (token, regexp, syntax); 3109146040Stjr 3110146040Stjr /* Do not check for ranges if we know they are not allowed. */ 3111146040Stjr if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS) 3112146040Stjr { 3113146040Stjr if (BE (token->type == END_OF_RE, 0)) 3114146040Stjr { 3115146040Stjr *err = REG_EBRACK; 3116146040Stjr goto parse_bracket_exp_free_return; 3117146040Stjr } 3118146040Stjr if (token->type == OP_CHARSET_RANGE) 3119146040Stjr { 3120146040Stjr re_string_skip_bytes (regexp, token_len); /* Skip '-'. */ 3121146040Stjr token_len2 = peek_token_bracket (&token2, regexp, syntax); 3122146040Stjr if (BE (token2.type == END_OF_RE, 0)) 3123146040Stjr { 3124146040Stjr *err = REG_EBRACK; 3125146040Stjr goto parse_bracket_exp_free_return; 3126146040Stjr } 3127146040Stjr if (token2.type == OP_CLOSE_BRACKET) 3128146040Stjr { 3129146040Stjr /* We treat the last '-' as a normal character. */ 3130146040Stjr re_string_skip_bytes (regexp, -token_len); 3131146040Stjr token->type = CHARACTER; 3132146040Stjr } 3133146040Stjr else 3134146040Stjr is_range_exp = 1; 3135146040Stjr } 3136146040Stjr } 3137146040Stjr 3138146040Stjr if (is_range_exp == 1) 3139146040Stjr { 3140146040Stjr end_elem.opr.name = end_name_buf; 3141146040Stjr ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2, 3142146040Stjr dfa, syntax, 1); 3143146040Stjr if (BE (ret != REG_NOERROR, 0)) 3144146040Stjr { 3145146040Stjr *err = ret; 3146146040Stjr goto parse_bracket_exp_free_return; 3147146040Stjr } 3148146040Stjr 3149146040Stjr token_len = peek_token_bracket (token, regexp, syntax); 3150146040Stjr 3151146040Stjr#ifdef _LIBC 3152146040Stjr *err = build_range_exp (sbcset, mbcset, &range_alloc, 3153146040Stjr &start_elem, &end_elem); 3154146040Stjr#else 3155146040Stjr# ifdef RE_ENABLE_I18N 3156146040Stjr *err = build_range_exp (sbcset, 3157146040Stjr dfa->mb_cur_max > 1 ? mbcset : NULL, 3158146040Stjr &range_alloc, &start_elem, &end_elem); 3159146040Stjr# else 3160146040Stjr *err = build_range_exp (sbcset, &start_elem, &end_elem); 3161146040Stjr# endif 3162146040Stjr#endif /* RE_ENABLE_I18N */ 3163146040Stjr if (BE (*err != REG_NOERROR, 0)) 3164146040Stjr goto parse_bracket_exp_free_return; 3165146040Stjr } 3166146040Stjr else 3167146040Stjr { 3168146040Stjr switch (start_elem.type) 3169146040Stjr { 3170146040Stjr case SB_CHAR: 3171146040Stjr bitset_set (sbcset, start_elem.opr.ch); 3172146040Stjr break; 3173146040Stjr#ifdef RE_ENABLE_I18N 3174146040Stjr case MB_CHAR: 3175146040Stjr /* Check whether the array has enough space. */ 3176146040Stjr if (BE (mbchar_alloc == mbcset->nmbchars, 0)) 3177146040Stjr { 3178146040Stjr wchar_t *new_mbchars; 3179146040Stjr /* Not enough, realloc it. */ 3180146040Stjr /* +1 in case of mbcset->nmbchars is 0. */ 3181146040Stjr mbchar_alloc = 2 * mbcset->nmbchars + 1; 3182146040Stjr /* Use realloc since array is NULL if *alloc == 0. */ 3183146040Stjr new_mbchars = re_realloc (mbcset->mbchars, wchar_t, 3184146040Stjr mbchar_alloc); 3185146040Stjr if (BE (new_mbchars == NULL, 0)) 3186146040Stjr goto parse_bracket_exp_espace; 3187146040Stjr mbcset->mbchars = new_mbchars; 3188146040Stjr } 3189146040Stjr mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch; 3190146040Stjr break; 3191146040Stjr#endif /* RE_ENABLE_I18N */ 3192146040Stjr case EQUIV_CLASS: 3193146040Stjr *err = build_equiv_class (sbcset, 3194146040Stjr#ifdef RE_ENABLE_I18N 3195146040Stjr mbcset, &equiv_class_alloc, 3196146040Stjr#endif /* RE_ENABLE_I18N */ 3197146040Stjr start_elem.opr.name); 3198146040Stjr if (BE (*err != REG_NOERROR, 0)) 3199146040Stjr goto parse_bracket_exp_free_return; 3200146040Stjr break; 3201146040Stjr case COLL_SYM: 3202146040Stjr *err = build_collating_symbol (sbcset, 3203146040Stjr#ifdef RE_ENABLE_I18N 3204146040Stjr mbcset, &coll_sym_alloc, 3205146040Stjr#endif /* RE_ENABLE_I18N */ 3206146040Stjr start_elem.opr.name); 3207146040Stjr if (BE (*err != REG_NOERROR, 0)) 3208146040Stjr goto parse_bracket_exp_free_return; 3209146040Stjr break; 3210146040Stjr case CHAR_CLASS: 3211146040Stjr *err = build_charclass (regexp->trans, sbcset, 3212146040Stjr#ifdef RE_ENABLE_I18N 3213146040Stjr mbcset, &char_class_alloc, 3214146040Stjr#endif /* RE_ENABLE_I18N */ 3215146040Stjr start_elem.opr.name, syntax); 3216146040Stjr if (BE (*err != REG_NOERROR, 0)) 3217146040Stjr goto parse_bracket_exp_free_return; 3218146040Stjr break; 3219146040Stjr default: 3220146040Stjr assert (0); 3221146040Stjr break; 3222146040Stjr } 3223146040Stjr } 3224146040Stjr if (BE (token->type == END_OF_RE, 0)) 3225146040Stjr { 3226146040Stjr *err = REG_EBRACK; 3227146040Stjr goto parse_bracket_exp_free_return; 3228146040Stjr } 3229146040Stjr if (token->type == OP_CLOSE_BRACKET) 3230146040Stjr break; 3231146040Stjr } 3232146040Stjr 3233146040Stjr re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 3234146040Stjr 3235146040Stjr /* If it is non-matching list. */ 3236146040Stjr if (non_match) 3237146040Stjr bitset_not (sbcset); 3238146040Stjr 3239146040Stjr#ifdef RE_ENABLE_I18N 3240146040Stjr /* Ensure only single byte characters are set. */ 3241146040Stjr if (dfa->mb_cur_max > 1) 3242146040Stjr bitset_mask (sbcset, dfa->sb_char); 3243146040Stjr 3244146040Stjr if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes 3245146040Stjr || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes 3246146040Stjr || mbcset->non_match))) 3247146040Stjr { 3248146040Stjr bin_tree_t *mbc_tree; 3249146040Stjr int sbc_idx; 3250146040Stjr /* Build a tree for complex bracket. */ 3251146040Stjr dfa->has_mb_node = 1; 3252146040Stjr br_token.type = COMPLEX_BRACKET; 3253146040Stjr br_token.opr.mbcset = mbcset; 3254146040Stjr mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3255146040Stjr if (BE (mbc_tree == NULL, 0)) 3256146040Stjr goto parse_bracket_exp_espace; 3257250724Sjkim for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx) 3258146040Stjr if (sbcset[sbc_idx]) 3259146040Stjr break; 3260146040Stjr /* If there are no bits set in sbcset, there is no point 3261146040Stjr of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */ 3262250724Sjkim if (sbc_idx < BITSET_WORDS) 3263146040Stjr { 3264250724Sjkim /* Build a tree for simple bracket. */ 3265250724Sjkim br_token.type = SIMPLE_BRACKET; 3266250724Sjkim br_token.opr.sbcset = sbcset; 3267250724Sjkim work_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3268250724Sjkim if (BE (work_tree == NULL, 0)) 3269250724Sjkim goto parse_bracket_exp_espace; 3270146040Stjr 3271250724Sjkim /* Then join them by ALT node. */ 3272250724Sjkim work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT); 3273250724Sjkim if (BE (work_tree == NULL, 0)) 3274250724Sjkim goto parse_bracket_exp_espace; 3275146040Stjr } 3276146040Stjr else 3277146040Stjr { 3278146040Stjr re_free (sbcset); 3279146040Stjr work_tree = mbc_tree; 3280146040Stjr } 3281146040Stjr } 3282146040Stjr else 3283146040Stjr#endif /* not RE_ENABLE_I18N */ 3284146040Stjr { 3285146040Stjr#ifdef RE_ENABLE_I18N 3286146040Stjr free_charset (mbcset); 3287146040Stjr#endif 3288146040Stjr /* Build a tree for simple bracket. */ 3289146040Stjr br_token.type = SIMPLE_BRACKET; 3290146040Stjr br_token.opr.sbcset = sbcset; 3291146040Stjr work_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3292146040Stjr if (BE (work_tree == NULL, 0)) 3293250724Sjkim goto parse_bracket_exp_espace; 3294146040Stjr } 3295146040Stjr return work_tree; 3296146040Stjr 3297146040Stjr parse_bracket_exp_espace: 3298146040Stjr *err = REG_ESPACE; 3299146040Stjr parse_bracket_exp_free_return: 3300146040Stjr re_free (sbcset); 3301146040Stjr#ifdef RE_ENABLE_I18N 3302146040Stjr free_charset (mbcset); 3303146040Stjr#endif /* RE_ENABLE_I18N */ 3304146040Stjr return NULL; 3305146040Stjr} 3306146040Stjr 3307146040Stjr/* Parse an element in the bracket expression. */ 3308146040Stjr 3309146040Stjrstatic reg_errcode_t 3310250724Sjkimparse_bracket_element (bracket_elem_t *elem, re_string_t *regexp, 3311250724Sjkim re_token_t *token, int token_len, re_dfa_t *dfa, 3312250724Sjkim reg_syntax_t syntax, int accept_hyphen) 3313146040Stjr{ 3314146040Stjr#ifdef RE_ENABLE_I18N 3315146040Stjr int cur_char_size; 3316146040Stjr cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp)); 3317146040Stjr if (cur_char_size > 1) 3318146040Stjr { 3319146040Stjr elem->type = MB_CHAR; 3320146040Stjr elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp)); 3321146040Stjr re_string_skip_bytes (regexp, cur_char_size); 3322146040Stjr return REG_NOERROR; 3323146040Stjr } 3324146040Stjr#endif /* RE_ENABLE_I18N */ 3325146040Stjr re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 3326146040Stjr if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS 3327146040Stjr || token->type == OP_OPEN_EQUIV_CLASS) 3328146040Stjr return parse_bracket_symbol (elem, regexp, token); 3329146040Stjr if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen) 3330146040Stjr { 3331146040Stjr /* A '-' must only appear as anything but a range indicator before 3332146040Stjr the closing bracket. Everything else is an error. */ 3333146040Stjr re_token_t token2; 3334146040Stjr (void) peek_token_bracket (&token2, regexp, syntax); 3335146040Stjr if (token2.type != OP_CLOSE_BRACKET) 3336146040Stjr /* The actual error value is not standardized since this whole 3337146040Stjr case is undefined. But ERANGE makes good sense. */ 3338146040Stjr return REG_ERANGE; 3339146040Stjr } 3340146040Stjr elem->type = SB_CHAR; 3341146040Stjr elem->opr.ch = token->opr.c; 3342146040Stjr return REG_NOERROR; 3343146040Stjr} 3344146040Stjr 3345146040Stjr/* Parse a bracket symbol in the bracket expression. Bracket symbols are 3346146040Stjr such as [:<character_class>:], [.<collating_element>.], and 3347146040Stjr [=<equivalent_class>=]. */ 3348146040Stjr 3349146040Stjrstatic reg_errcode_t 3350250724Sjkimparse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp, 3351250724Sjkim re_token_t *token) 3352146040Stjr{ 3353146040Stjr unsigned char ch, delim = token->opr.c; 3354146040Stjr int i = 0; 3355146040Stjr if (re_string_eoi(regexp)) 3356146040Stjr return REG_EBRACK; 3357146040Stjr for (;; ++i) 3358146040Stjr { 3359146040Stjr if (i >= BRACKET_NAME_BUF_SIZE) 3360146040Stjr return REG_EBRACK; 3361146040Stjr if (token->type == OP_OPEN_CHAR_CLASS) 3362146040Stjr ch = re_string_fetch_byte_case (regexp); 3363146040Stjr else 3364146040Stjr ch = re_string_fetch_byte (regexp); 3365146040Stjr if (re_string_eoi(regexp)) 3366146040Stjr return REG_EBRACK; 3367146040Stjr if (ch == delim && re_string_peek_byte (regexp, 0) == ']') 3368146040Stjr break; 3369146040Stjr elem->opr.name[i] = ch; 3370146040Stjr } 3371146040Stjr re_string_skip_bytes (regexp, 1); 3372146040Stjr elem->opr.name[i] = '\0'; 3373146040Stjr switch (token->type) 3374146040Stjr { 3375146040Stjr case OP_OPEN_COLL_ELEM: 3376146040Stjr elem->type = COLL_SYM; 3377146040Stjr break; 3378146040Stjr case OP_OPEN_EQUIV_CLASS: 3379146040Stjr elem->type = EQUIV_CLASS; 3380146040Stjr break; 3381146040Stjr case OP_OPEN_CHAR_CLASS: 3382146040Stjr elem->type = CHAR_CLASS; 3383146040Stjr break; 3384146040Stjr default: 3385146040Stjr break; 3386146040Stjr } 3387146040Stjr return REG_NOERROR; 3388146040Stjr} 3389146040Stjr 3390146040Stjr /* Helper function for parse_bracket_exp. 3391146040Stjr Build the equivalence class which is represented by NAME. 3392146040Stjr The result are written to MBCSET and SBCSET. 3393146040Stjr EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes, 3394146040Stjr is a pointer argument sinse we may update it. */ 3395146040Stjr 3396146040Stjrstatic reg_errcode_t 3397146040Stjr#ifdef RE_ENABLE_I18N 3398250724Sjkimbuild_equiv_class (bitset_t sbcset, re_charset_t *mbcset, 3399250724Sjkim int *equiv_class_alloc, const unsigned char *name) 3400146040Stjr#else /* not RE_ENABLE_I18N */ 3401250724Sjkimbuild_equiv_class (bitset_t sbcset, const unsigned char *name) 3402146040Stjr#endif /* not RE_ENABLE_I18N */ 3403146040Stjr{ 3404250724Sjkim#ifdef _LIBC 3405146040Stjr uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3406146040Stjr if (nrules != 0) 3407146040Stjr { 3408146040Stjr const int32_t *table, *indirect; 3409146040Stjr const unsigned char *weights, *extra, *cp; 3410146040Stjr unsigned char char_buf[2]; 3411146040Stjr int32_t idx1, idx2; 3412146040Stjr unsigned int ch; 3413146040Stjr size_t len; 3414146040Stjr /* This #include defines a local function! */ 3415146040Stjr# include <locale/weight.h> 3416146040Stjr /* Calculate the index for equivalence class. */ 3417146040Stjr cp = name; 3418146040Stjr table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); 3419146040Stjr weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 3420146040Stjr _NL_COLLATE_WEIGHTMB); 3421146040Stjr extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 3422146040Stjr _NL_COLLATE_EXTRAMB); 3423146040Stjr indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE, 3424146040Stjr _NL_COLLATE_INDIRECTMB); 3425250724Sjkim idx1 = findidx (&cp, -1); 3426250724Sjkim if (BE (idx1 == 0 || *cp != '\0', 0)) 3427146040Stjr /* This isn't a valid character. */ 3428146040Stjr return REG_ECOLLATE; 3429146040Stjr 3430146040Stjr /* Build single byte matcing table for this equivalence class. */ 3431250724Sjkim len = weights[idx1 & 0xffffff]; 3432146040Stjr for (ch = 0; ch < SBC_MAX; ++ch) 3433146040Stjr { 3434146040Stjr char_buf[0] = ch; 3435146040Stjr cp = char_buf; 3436250724Sjkim idx2 = findidx (&cp, 1); 3437146040Stjr/* 3438146040Stjr idx2 = table[ch]; 3439146040Stjr*/ 3440146040Stjr if (idx2 == 0) 3441146040Stjr /* This isn't a valid character. */ 3442146040Stjr continue; 3443250724Sjkim /* Compare only if the length matches and the collation rule 3444250724Sjkim index is the same. */ 3445250724Sjkim if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)) 3446146040Stjr { 3447146040Stjr int cnt = 0; 3448250724Sjkim 3449146040Stjr while (cnt <= len && 3450250724Sjkim weights[(idx1 & 0xffffff) + 1 + cnt] 3451250724Sjkim == weights[(idx2 & 0xffffff) + 1 + cnt]) 3452146040Stjr ++cnt; 3453146040Stjr 3454146040Stjr if (cnt > len) 3455146040Stjr bitset_set (sbcset, ch); 3456146040Stjr } 3457146040Stjr } 3458146040Stjr /* Check whether the array has enough space. */ 3459146040Stjr if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0)) 3460146040Stjr { 3461146040Stjr /* Not enough, realloc it. */ 3462146040Stjr /* +1 in case of mbcset->nequiv_classes is 0. */ 3463146040Stjr int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1; 3464146040Stjr /* Use realloc since the array is NULL if *alloc == 0. */ 3465146040Stjr int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes, 3466146040Stjr int32_t, 3467146040Stjr new_equiv_class_alloc); 3468146040Stjr if (BE (new_equiv_classes == NULL, 0)) 3469146040Stjr return REG_ESPACE; 3470146040Stjr mbcset->equiv_classes = new_equiv_classes; 3471146040Stjr *equiv_class_alloc = new_equiv_class_alloc; 3472146040Stjr } 3473146040Stjr mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1; 3474146040Stjr } 3475146040Stjr else 3476146040Stjr#endif /* _LIBC */ 3477146040Stjr { 3478146040Stjr if (BE (strlen ((const char *) name) != 1, 0)) 3479146040Stjr return REG_ECOLLATE; 3480146040Stjr bitset_set (sbcset, *name); 3481146040Stjr } 3482146040Stjr return REG_NOERROR; 3483146040Stjr} 3484146040Stjr 3485146040Stjr /* Helper function for parse_bracket_exp. 3486146040Stjr Build the character class which is represented by NAME. 3487146040Stjr The result are written to MBCSET and SBCSET. 3488146040Stjr CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes, 3489146040Stjr is a pointer argument sinse we may update it. */ 3490146040Stjr 3491146040Stjrstatic reg_errcode_t 3492146040Stjr#ifdef RE_ENABLE_I18N 3493250724Sjkimbuild_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, 3494250724Sjkim re_charset_t *mbcset, int *char_class_alloc, 3495250724Sjkim const unsigned char *class_name, reg_syntax_t syntax) 3496146040Stjr#else /* not RE_ENABLE_I18N */ 3497250724Sjkimbuild_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, 3498250724Sjkim const unsigned char *class_name, reg_syntax_t syntax) 3499146040Stjr#endif /* not RE_ENABLE_I18N */ 3500146040Stjr{ 3501146040Stjr int i; 3502146040Stjr const char *name = (const char *) class_name; 3503146040Stjr 3504146040Stjr /* In case of REG_ICASE "upper" and "lower" match the both of 3505146040Stjr upper and lower cases. */ 3506146040Stjr if ((syntax & RE_ICASE) 3507146040Stjr && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0)) 3508146040Stjr name = "alpha"; 3509146040Stjr 3510146040Stjr#ifdef RE_ENABLE_I18N 3511146040Stjr /* Check the space of the arrays. */ 3512146040Stjr if (BE (*char_class_alloc == mbcset->nchar_classes, 0)) 3513146040Stjr { 3514146040Stjr /* Not enough, realloc it. */ 3515146040Stjr /* +1 in case of mbcset->nchar_classes is 0. */ 3516146040Stjr int new_char_class_alloc = 2 * mbcset->nchar_classes + 1; 3517146040Stjr /* Use realloc since array is NULL if *alloc == 0. */ 3518146040Stjr wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t, 3519146040Stjr new_char_class_alloc); 3520146040Stjr if (BE (new_char_classes == NULL, 0)) 3521146040Stjr return REG_ESPACE; 3522146040Stjr mbcset->char_classes = new_char_classes; 3523146040Stjr *char_class_alloc = new_char_class_alloc; 3524146040Stjr } 3525146040Stjr mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name); 3526146040Stjr#endif /* RE_ENABLE_I18N */ 3527146040Stjr 3528146040Stjr#define BUILD_CHARCLASS_LOOP(ctype_func) \ 3529250724Sjkim do { \ 3530250724Sjkim if (BE (trans != NULL, 0)) \ 3531146040Stjr { \ 3532250724Sjkim for (i = 0; i < SBC_MAX; ++i) \ 3533250724Sjkim if (ctype_func (i)) \ 3534250724Sjkim bitset_set (sbcset, trans[i]); \ 3535250724Sjkim } \ 3536250724Sjkim else \ 3537250724Sjkim { \ 3538250724Sjkim for (i = 0; i < SBC_MAX; ++i) \ 3539250724Sjkim if (ctype_func (i)) \ 3540250724Sjkim bitset_set (sbcset, i); \ 3541250724Sjkim } \ 3542250724Sjkim } while (0) 3543146040Stjr 3544146040Stjr if (strcmp (name, "alnum") == 0) 3545250724Sjkim BUILD_CHARCLASS_LOOP (isalnum); 3546146040Stjr else if (strcmp (name, "cntrl") == 0) 3547250724Sjkim BUILD_CHARCLASS_LOOP (iscntrl); 3548146040Stjr else if (strcmp (name, "lower") == 0) 3549250724Sjkim BUILD_CHARCLASS_LOOP (islower); 3550146040Stjr else if (strcmp (name, "space") == 0) 3551250724Sjkim BUILD_CHARCLASS_LOOP (isspace); 3552146040Stjr else if (strcmp (name, "alpha") == 0) 3553250724Sjkim BUILD_CHARCLASS_LOOP (isalpha); 3554146040Stjr else if (strcmp (name, "digit") == 0) 3555250724Sjkim BUILD_CHARCLASS_LOOP (isdigit); 3556146040Stjr else if (strcmp (name, "print") == 0) 3557250724Sjkim BUILD_CHARCLASS_LOOP (isprint); 3558146040Stjr else if (strcmp (name, "upper") == 0) 3559250724Sjkim BUILD_CHARCLASS_LOOP (isupper); 3560146040Stjr else if (strcmp (name, "blank") == 0) 3561250724Sjkim BUILD_CHARCLASS_LOOP (isblank); 3562146040Stjr else if (strcmp (name, "graph") == 0) 3563250724Sjkim BUILD_CHARCLASS_LOOP (isgraph); 3564146040Stjr else if (strcmp (name, "punct") == 0) 3565250724Sjkim BUILD_CHARCLASS_LOOP (ispunct); 3566146040Stjr else if (strcmp (name, "xdigit") == 0) 3567250724Sjkim BUILD_CHARCLASS_LOOP (isxdigit); 3568146040Stjr else 3569146040Stjr return REG_ECTYPE; 3570146040Stjr 3571146040Stjr return REG_NOERROR; 3572146040Stjr} 3573146040Stjr 3574146040Stjrstatic bin_tree_t * 3575250724Sjkimbuild_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans, 3576250724Sjkim const unsigned char *class_name, 3577250724Sjkim const unsigned char *extra, int non_match, 3578250724Sjkim reg_errcode_t *err) 3579146040Stjr{ 3580146040Stjr re_bitset_ptr_t sbcset; 3581146040Stjr#ifdef RE_ENABLE_I18N 3582146040Stjr re_charset_t *mbcset; 3583146040Stjr int alloc = 0; 3584146040Stjr#endif /* not RE_ENABLE_I18N */ 3585146040Stjr reg_errcode_t ret; 3586146040Stjr re_token_t br_token; 3587146040Stjr bin_tree_t *tree; 3588146040Stjr 3589250724Sjkim sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); 3590146040Stjr#ifdef RE_ENABLE_I18N 3591146040Stjr mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1); 3592146040Stjr#endif /* RE_ENABLE_I18N */ 3593146040Stjr 3594146040Stjr#ifdef RE_ENABLE_I18N 3595146040Stjr if (BE (sbcset == NULL || mbcset == NULL, 0)) 3596146040Stjr#else /* not RE_ENABLE_I18N */ 3597146040Stjr if (BE (sbcset == NULL, 0)) 3598146040Stjr#endif /* not RE_ENABLE_I18N */ 3599146040Stjr { 3600146040Stjr *err = REG_ESPACE; 3601146040Stjr return NULL; 3602146040Stjr } 3603146040Stjr 3604146040Stjr if (non_match) 3605146040Stjr { 3606146040Stjr#ifdef RE_ENABLE_I18N 3607146040Stjr mbcset->non_match = 1; 3608146040Stjr#endif /* not RE_ENABLE_I18N */ 3609146040Stjr } 3610146040Stjr 3611146040Stjr /* We don't care the syntax in this case. */ 3612146040Stjr ret = build_charclass (trans, sbcset, 3613146040Stjr#ifdef RE_ENABLE_I18N 3614146040Stjr mbcset, &alloc, 3615146040Stjr#endif /* RE_ENABLE_I18N */ 3616146040Stjr class_name, 0); 3617146040Stjr 3618146040Stjr if (BE (ret != REG_NOERROR, 0)) 3619146040Stjr { 3620146040Stjr re_free (sbcset); 3621146040Stjr#ifdef RE_ENABLE_I18N 3622146040Stjr free_charset (mbcset); 3623146040Stjr#endif /* RE_ENABLE_I18N */ 3624146040Stjr *err = ret; 3625146040Stjr return NULL; 3626146040Stjr } 3627146040Stjr /* \w match '_' also. */ 3628146040Stjr for (; *extra; extra++) 3629146040Stjr bitset_set (sbcset, *extra); 3630146040Stjr 3631146040Stjr /* If it is non-matching list. */ 3632146040Stjr if (non_match) 3633146040Stjr bitset_not (sbcset); 3634146040Stjr 3635146040Stjr#ifdef RE_ENABLE_I18N 3636146040Stjr /* Ensure only single byte characters are set. */ 3637146040Stjr if (dfa->mb_cur_max > 1) 3638146040Stjr bitset_mask (sbcset, dfa->sb_char); 3639146040Stjr#endif 3640146040Stjr 3641146040Stjr /* Build a tree for simple bracket. */ 3642146040Stjr br_token.type = SIMPLE_BRACKET; 3643146040Stjr br_token.opr.sbcset = sbcset; 3644146040Stjr tree = create_token_tree (dfa, NULL, NULL, &br_token); 3645146040Stjr if (BE (tree == NULL, 0)) 3646146040Stjr goto build_word_op_espace; 3647146040Stjr 3648146040Stjr#ifdef RE_ENABLE_I18N 3649146040Stjr if (dfa->mb_cur_max > 1) 3650146040Stjr { 3651146040Stjr bin_tree_t *mbc_tree; 3652146040Stjr /* Build a tree for complex bracket. */ 3653146040Stjr br_token.type = COMPLEX_BRACKET; 3654146040Stjr br_token.opr.mbcset = mbcset; 3655146040Stjr dfa->has_mb_node = 1; 3656146040Stjr mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3657146040Stjr if (BE (mbc_tree == NULL, 0)) 3658146040Stjr goto build_word_op_espace; 3659146040Stjr /* Then join them by ALT node. */ 3660146040Stjr tree = create_tree (dfa, tree, mbc_tree, OP_ALT); 3661146040Stjr if (BE (mbc_tree != NULL, 1)) 3662146040Stjr return tree; 3663146040Stjr } 3664146040Stjr else 3665146040Stjr { 3666146040Stjr free_charset (mbcset); 3667146040Stjr return tree; 3668146040Stjr } 3669146040Stjr#else /* not RE_ENABLE_I18N */ 3670146040Stjr return tree; 3671146040Stjr#endif /* not RE_ENABLE_I18N */ 3672146040Stjr 3673146040Stjr build_word_op_espace: 3674146040Stjr re_free (sbcset); 3675146040Stjr#ifdef RE_ENABLE_I18N 3676146040Stjr free_charset (mbcset); 3677146040Stjr#endif /* RE_ENABLE_I18N */ 3678146040Stjr *err = REG_ESPACE; 3679146040Stjr return NULL; 3680146040Stjr} 3681146040Stjr 3682146040Stjr/* This is intended for the expressions like "a{1,3}". 3683146040Stjr Fetch a number from `input', and return the number. 3684146040Stjr Return -1, if the number field is empty like "{,1}". 3685146040Stjr Return -2, If an error is occured. */ 3686146040Stjr 3687146040Stjrstatic int 3688250724Sjkimfetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax) 3689146040Stjr{ 3690146040Stjr int num = -1; 3691146040Stjr unsigned char c; 3692146040Stjr while (1) 3693146040Stjr { 3694146040Stjr fetch_token (token, input, syntax); 3695146040Stjr c = token->opr.c; 3696146040Stjr if (BE (token->type == END_OF_RE, 0)) 3697146040Stjr return -2; 3698146040Stjr if (token->type == OP_CLOSE_DUP_NUM || c == ',') 3699146040Stjr break; 3700146040Stjr num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2) 3701146040Stjr ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0')); 3702146040Stjr num = (num > RE_DUP_MAX) ? -2 : num; 3703146040Stjr } 3704146040Stjr return num; 3705146040Stjr} 3706146040Stjr 3707146040Stjr#ifdef RE_ENABLE_I18N 3708146040Stjrstatic void 3709146040Stjrfree_charset (re_charset_t *cset) 3710146040Stjr{ 3711146040Stjr re_free (cset->mbchars); 3712146040Stjr# ifdef _LIBC 3713146040Stjr re_free (cset->coll_syms); 3714146040Stjr re_free (cset->equiv_classes); 3715146040Stjr re_free (cset->range_starts); 3716146040Stjr re_free (cset->range_ends); 3717146040Stjr# endif 3718146040Stjr re_free (cset->char_classes); 3719146040Stjr re_free (cset); 3720146040Stjr} 3721146040Stjr#endif /* RE_ENABLE_I18N */ 3722146040Stjr 3723146040Stjr/* Functions for binary tree operation. */ 3724146040Stjr 3725146040Stjr/* Create a tree node. */ 3726146040Stjr 3727146040Stjrstatic bin_tree_t * 3728250724Sjkimcreate_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right, 3729250724Sjkim re_token_type_t type) 3730146040Stjr{ 3731146040Stjr re_token_t t; 3732146040Stjr t.type = type; 3733146040Stjr return create_token_tree (dfa, left, right, &t); 3734146040Stjr} 3735146040Stjr 3736146040Stjrstatic bin_tree_t * 3737250724Sjkimcreate_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right, 3738250724Sjkim const re_token_t *token) 3739146040Stjr{ 3740146040Stjr bin_tree_t *tree; 3741146040Stjr if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0)) 3742146040Stjr { 3743146040Stjr bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1); 3744146040Stjr 3745146040Stjr if (storage == NULL) 3746146040Stjr return NULL; 3747146040Stjr storage->next = dfa->str_tree_storage; 3748146040Stjr dfa->str_tree_storage = storage; 3749146040Stjr dfa->str_tree_storage_idx = 0; 3750146040Stjr } 3751146040Stjr tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++]; 3752146040Stjr 3753146040Stjr tree->parent = NULL; 3754146040Stjr tree->left = left; 3755146040Stjr tree->right = right; 3756146040Stjr tree->token = *token; 3757146040Stjr tree->token.duplicated = 0; 3758146040Stjr tree->token.opt_subexp = 0; 3759146040Stjr tree->first = NULL; 3760146040Stjr tree->next = NULL; 3761146040Stjr tree->node_idx = -1; 3762146040Stjr 3763146040Stjr if (left != NULL) 3764146040Stjr left->parent = tree; 3765146040Stjr if (right != NULL) 3766146040Stjr right->parent = tree; 3767146040Stjr return tree; 3768146040Stjr} 3769146040Stjr 3770146040Stjr/* Mark the tree SRC as an optional subexpression. 3771146040Stjr To be called from preorder or postorder. */ 3772146040Stjr 3773146040Stjrstatic reg_errcode_t 3774250724Sjkimmark_opt_subexp (void *extra, bin_tree_t *node) 3775146040Stjr{ 3776146040Stjr int idx = (int) (long) extra; 3777146040Stjr if (node->token.type == SUBEXP && node->token.opr.idx == idx) 3778146040Stjr node->token.opt_subexp = 1; 3779146040Stjr 3780146040Stjr return REG_NOERROR; 3781146040Stjr} 3782146040Stjr 3783146040Stjr/* Free the allocated memory inside NODE. */ 3784146040Stjr 3785146040Stjrstatic void 3786146040Stjrfree_token (re_token_t *node) 3787146040Stjr{ 3788146040Stjr#ifdef RE_ENABLE_I18N 3789146040Stjr if (node->type == COMPLEX_BRACKET && node->duplicated == 0) 3790146040Stjr free_charset (node->opr.mbcset); 3791146040Stjr else 3792146040Stjr#endif /* RE_ENABLE_I18N */ 3793146040Stjr if (node->type == SIMPLE_BRACKET && node->duplicated == 0) 3794146040Stjr re_free (node->opr.sbcset); 3795146040Stjr} 3796146040Stjr 3797146040Stjr/* Worker function for tree walking. Free the allocated memory inside NODE 3798146040Stjr and its children. */ 3799146040Stjr 3800146040Stjrstatic reg_errcode_t 3801146040Stjrfree_tree (void *extra, bin_tree_t *node) 3802146040Stjr{ 3803146040Stjr free_token (&node->token); 3804146040Stjr return REG_NOERROR; 3805146040Stjr} 3806146040Stjr 3807146040Stjr 3808146040Stjr/* Duplicate the node SRC, and return new node. This is a preorder 3809146040Stjr visit similar to the one implemented by the generic visitor, but 3810146040Stjr we need more infrastructure to maintain two parallel trees --- so, 3811146040Stjr it's easier to duplicate. */ 3812146040Stjr 3813146040Stjrstatic bin_tree_t * 3814250724Sjkimduplicate_tree (const bin_tree_t *root, re_dfa_t *dfa) 3815146040Stjr{ 3816146040Stjr const bin_tree_t *node; 3817146040Stjr bin_tree_t *dup_root; 3818146040Stjr bin_tree_t **p_new = &dup_root, *dup_node = root->parent; 3819146040Stjr 3820146040Stjr for (node = root; ; ) 3821146040Stjr { 3822146040Stjr /* Create a new tree and link it back to the current parent. */ 3823146040Stjr *p_new = create_token_tree (dfa, NULL, NULL, &node->token); 3824146040Stjr if (*p_new == NULL) 3825146040Stjr return NULL; 3826146040Stjr (*p_new)->parent = dup_node; 3827146040Stjr (*p_new)->token.duplicated = 1; 3828146040Stjr dup_node = *p_new; 3829146040Stjr 3830146040Stjr /* Go to the left node, or up and to the right. */ 3831146040Stjr if (node->left) 3832146040Stjr { 3833146040Stjr node = node->left; 3834146040Stjr p_new = &dup_node->left; 3835146040Stjr } 3836146040Stjr else 3837146040Stjr { 3838146040Stjr const bin_tree_t *prev = NULL; 3839146040Stjr while (node->right == prev || node->right == NULL) 3840146040Stjr { 3841146040Stjr prev = node; 3842146040Stjr node = node->parent; 3843146040Stjr dup_node = dup_node->parent; 3844146040Stjr if (!node) 3845250724Sjkim return dup_root; 3846146040Stjr } 3847146040Stjr node = node->right; 3848146040Stjr p_new = &dup_node->right; 3849146040Stjr } 3850146040Stjr } 3851146040Stjr} 3852