1146040Stjr/* Extended regular expression matching and search library.
2250724Sjkim   Copyright (C) 2002-2007,2009,2010,2011,2012 Free Software Foundation, Inc.
3146040Stjr   This file is part of the GNU C Library.
4146040Stjr   Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5146040Stjr
6146040Stjr   The GNU C Library is free software; you can redistribute it and/or
7146040Stjr   modify it under the terms of the GNU Lesser General Public
8146040Stjr   License as published by the Free Software Foundation; either
9146040Stjr   version 2.1 of the License, or (at your option) any later version.
10146040Stjr
11146040Stjr   The GNU C Library is distributed in the hope that it will be useful,
12146040Stjr   but WITHOUT ANY WARRANTY; without even the implied warranty of
13146040Stjr   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14146040Stjr   Lesser General Public License for more details.
15146040Stjr
16146040Stjr   You should have received a copy of the GNU Lesser General Public
17250724Sjkim   License along with the GNU C Library; if not, see
18250724Sjkim   <http://www.gnu.org/licenses/>.  */
19146040Stjr
20146040Stjrstatic reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
21250724Sjkim					  size_t length, reg_syntax_t syntax);
22146040Stjrstatic void re_compile_fastmap_iter (regex_t *bufp,
23146040Stjr				     const re_dfastate_t *init_state,
24146040Stjr				     char *fastmap);
25250724Sjkimstatic reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
26146040Stjr#ifdef RE_ENABLE_I18N
27146040Stjrstatic void free_charset (re_charset_t *cset);
28146040Stjr#endif /* RE_ENABLE_I18N */
29146040Stjrstatic void free_workarea_compile (regex_t *preg);
30146040Stjrstatic reg_errcode_t create_initial_state (re_dfa_t *dfa);
31146040Stjr#ifdef RE_ENABLE_I18N
32146040Stjrstatic void optimize_utf8 (re_dfa_t *dfa);
33146040Stjr#endif
34146040Stjrstatic reg_errcode_t analyze (regex_t *preg);
35146040Stjrstatic reg_errcode_t preorder (bin_tree_t *root,
36146040Stjr			       reg_errcode_t (fn (void *, bin_tree_t *)),
37146040Stjr			       void *extra);
38146040Stjrstatic reg_errcode_t postorder (bin_tree_t *root,
39146040Stjr				reg_errcode_t (fn (void *, bin_tree_t *)),
40146040Stjr				void *extra);
41146040Stjrstatic reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
42146040Stjrstatic reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
43146040Stjrstatic bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
44146040Stjr				 bin_tree_t *node);
45146040Stjrstatic reg_errcode_t calc_first (void *extra, bin_tree_t *node);
46146040Stjrstatic reg_errcode_t calc_next (void *extra, bin_tree_t *node);
47146040Stjrstatic reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
48250724Sjkimstatic int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
49250724Sjkimstatic int search_duplicated_node (const re_dfa_t *dfa, int org_node,
50146040Stjr				   unsigned int constraint);
51146040Stjrstatic reg_errcode_t calc_eclosure (re_dfa_t *dfa);
52146040Stjrstatic reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
53146040Stjr					 int node, int root);
54146040Stjrstatic reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
55146040Stjrstatic int fetch_number (re_string_t *input, re_token_t *token,
56146040Stjr			 reg_syntax_t syntax);
57146040Stjrstatic int peek_token (re_token_t *token, re_string_t *input,
58250724Sjkim			reg_syntax_t syntax) internal_function;
59146040Stjrstatic bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
60146040Stjr			  reg_syntax_t syntax, reg_errcode_t *err);
61146040Stjrstatic bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
62146040Stjr				  re_token_t *token, reg_syntax_t syntax,
63146040Stjr				  int nest, reg_errcode_t *err);
64146040Stjrstatic bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
65146040Stjr				 re_token_t *token, reg_syntax_t syntax,
66146040Stjr				 int nest, reg_errcode_t *err);
67146040Stjrstatic bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
68146040Stjr				     re_token_t *token, reg_syntax_t syntax,
69146040Stjr				     int nest, reg_errcode_t *err);
70146040Stjrstatic bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
71146040Stjr				  re_token_t *token, reg_syntax_t syntax,
72146040Stjr				  int nest, reg_errcode_t *err);
73146040Stjrstatic bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
74146040Stjr				 re_dfa_t *dfa, re_token_t *token,
75146040Stjr				 reg_syntax_t syntax, reg_errcode_t *err);
76146040Stjrstatic bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
77146040Stjr				      re_token_t *token, reg_syntax_t syntax,
78146040Stjr				      reg_errcode_t *err);
79146040Stjrstatic reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
80146040Stjr					    re_string_t *regexp,
81146040Stjr					    re_token_t *token, int token_len,
82146040Stjr					    re_dfa_t *dfa,
83146040Stjr					    reg_syntax_t syntax,
84146040Stjr					    int accept_hyphen);
85146040Stjrstatic reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
86146040Stjr					  re_string_t *regexp,
87146040Stjr					  re_token_t *token);
88146040Stjr#ifdef RE_ENABLE_I18N
89250724Sjkimstatic reg_errcode_t build_equiv_class (bitset_t sbcset,
90146040Stjr					re_charset_t *mbcset,
91146040Stjr					int *equiv_class_alloc,
92146040Stjr					const unsigned char *name);
93250724Sjkimstatic reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
94250724Sjkim				      bitset_t sbcset,
95146040Stjr				      re_charset_t *mbcset,
96146040Stjr				      int *char_class_alloc,
97146040Stjr				      const unsigned char *class_name,
98146040Stjr				      reg_syntax_t syntax);
99146040Stjr#else  /* not RE_ENABLE_I18N */
100250724Sjkimstatic reg_errcode_t build_equiv_class (bitset_t sbcset,
101146040Stjr					const unsigned char *name);
102250724Sjkimstatic reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
103250724Sjkim				      bitset_t sbcset,
104146040Stjr				      const unsigned char *class_name,
105146040Stjr				      reg_syntax_t syntax);
106146040Stjr#endif /* not RE_ENABLE_I18N */
107146040Stjrstatic bin_tree_t *build_charclass_op (re_dfa_t *dfa,
108250724Sjkim				       RE_TRANSLATE_TYPE trans,
109146040Stjr				       const unsigned char *class_name,
110146040Stjr				       const unsigned char *extra,
111146040Stjr				       int non_match, reg_errcode_t *err);
112146040Stjrstatic bin_tree_t *create_tree (re_dfa_t *dfa,
113146040Stjr				bin_tree_t *left, bin_tree_t *right,
114146040Stjr				re_token_type_t type);
115146040Stjrstatic bin_tree_t *create_token_tree (re_dfa_t *dfa,
116146040Stjr				      bin_tree_t *left, bin_tree_t *right,
117146040Stjr				      const re_token_t *token);
118146040Stjrstatic bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
119146040Stjrstatic void free_token (re_token_t *node);
120146040Stjrstatic reg_errcode_t free_tree (void *extra, bin_tree_t *node);
121146040Stjrstatic reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
122146040Stjr
123146040Stjr/* This table gives an error message for each of the error codes listed
124146040Stjr   in regex.h.  Obviously the order here has to be same as there.
125146040Stjr   POSIX doesn't require that we do anything for REG_NOERROR,
126146040Stjr   but why not be nice?  */
127146040Stjr
128146040Stjrconst char __re_error_msgid[] attribute_hidden =
129146040Stjr  {
130146040Stjr#define REG_NOERROR_IDX	0
131146040Stjr    gettext_noop ("Success")	/* REG_NOERROR */
132146040Stjr    "\0"
133146040Stjr#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
134146040Stjr    gettext_noop ("No match")	/* REG_NOMATCH */
135146040Stjr    "\0"
136146040Stjr#define REG_BADPAT_IDX	(REG_NOMATCH_IDX + sizeof "No match")
137146040Stjr    gettext_noop ("Invalid regular expression") /* REG_BADPAT */
138146040Stjr    "\0"
139146040Stjr#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
140146040Stjr    gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
141146040Stjr    "\0"
142146040Stjr#define REG_ECTYPE_IDX	(REG_ECOLLATE_IDX + sizeof "Invalid collation character")
143146040Stjr    gettext_noop ("Invalid character class name") /* REG_ECTYPE */
144146040Stjr    "\0"
145146040Stjr#define REG_EESCAPE_IDX	(REG_ECTYPE_IDX + sizeof "Invalid character class name")
146146040Stjr    gettext_noop ("Trailing backslash") /* REG_EESCAPE */
147146040Stjr    "\0"
148146040Stjr#define REG_ESUBREG_IDX	(REG_EESCAPE_IDX + sizeof "Trailing backslash")
149146040Stjr    gettext_noop ("Invalid back reference") /* REG_ESUBREG */
150146040Stjr    "\0"
151146040Stjr#define REG_EBRACK_IDX	(REG_ESUBREG_IDX + sizeof "Invalid back reference")
152146040Stjr    gettext_noop ("Unmatched [ or [^")	/* REG_EBRACK */
153146040Stjr    "\0"
154146040Stjr#define REG_EPAREN_IDX	(REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
155146040Stjr    gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
156146040Stjr    "\0"
157146040Stjr#define REG_EBRACE_IDX	(REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
158146040Stjr    gettext_noop ("Unmatched \\{") /* REG_EBRACE */
159146040Stjr    "\0"
160146040Stjr#define REG_BADBR_IDX	(REG_EBRACE_IDX + sizeof "Unmatched \\{")
161146040Stjr    gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
162146040Stjr    "\0"
163146040Stjr#define REG_ERANGE_IDX	(REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
164146040Stjr    gettext_noop ("Invalid range end")	/* REG_ERANGE */
165146040Stjr    "\0"
166146040Stjr#define REG_ESPACE_IDX	(REG_ERANGE_IDX + sizeof "Invalid range end")
167146040Stjr    gettext_noop ("Memory exhausted") /* REG_ESPACE */
168146040Stjr    "\0"
169146040Stjr#define REG_BADRPT_IDX	(REG_ESPACE_IDX + sizeof "Memory exhausted")
170146040Stjr    gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
171146040Stjr    "\0"
172146040Stjr#define REG_EEND_IDX	(REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
173146040Stjr    gettext_noop ("Premature end of regular expression") /* REG_EEND */
174146040Stjr    "\0"
175146040Stjr#define REG_ESIZE_IDX	(REG_EEND_IDX + sizeof "Premature end of regular expression")
176146040Stjr    gettext_noop ("Regular expression too big") /* REG_ESIZE */
177146040Stjr    "\0"
178146040Stjr#define REG_ERPAREN_IDX	(REG_ESIZE_IDX + sizeof "Regular expression too big")
179146040Stjr    gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
180146040Stjr  };
181146040Stjr
182146040Stjrconst size_t __re_error_msgid_idx[] attribute_hidden =
183146040Stjr  {
184146040Stjr    REG_NOERROR_IDX,
185146040Stjr    REG_NOMATCH_IDX,
186146040Stjr    REG_BADPAT_IDX,
187146040Stjr    REG_ECOLLATE_IDX,
188146040Stjr    REG_ECTYPE_IDX,
189146040Stjr    REG_EESCAPE_IDX,
190146040Stjr    REG_ESUBREG_IDX,
191146040Stjr    REG_EBRACK_IDX,
192146040Stjr    REG_EPAREN_IDX,
193146040Stjr    REG_EBRACE_IDX,
194146040Stjr    REG_BADBR_IDX,
195146040Stjr    REG_ERANGE_IDX,
196146040Stjr    REG_ESPACE_IDX,
197146040Stjr    REG_BADRPT_IDX,
198146040Stjr    REG_EEND_IDX,
199146040Stjr    REG_ESIZE_IDX,
200146040Stjr    REG_ERPAREN_IDX
201146040Stjr  };
202146040Stjr
203146040Stjr/* Entry points for GNU code.  */
204146040Stjr
205146040Stjr/* re_compile_pattern is the GNU regular expression compiler: it
206146040Stjr   compiles PATTERN (of length LENGTH) and puts the result in BUFP.
207146040Stjr   Returns 0 if the pattern was valid, otherwise an error string.
208146040Stjr
209146040Stjr   Assumes the `allocated' (and perhaps `buffer') and `translate' fields
210146040Stjr   are set in BUFP on entry.  */
211146040Stjr
212146040Stjrconst char *
213146040Stjrre_compile_pattern (pattern, length, bufp)
214146040Stjr    const char *pattern;
215146040Stjr    size_t length;
216146040Stjr    struct re_pattern_buffer *bufp;
217146040Stjr{
218146040Stjr  reg_errcode_t ret;
219146040Stjr
220146040Stjr  /* And GNU code determines whether or not to get register information
221146040Stjr     by passing null for the REGS argument to re_match, etc., not by
222146040Stjr     setting no_sub, unless RE_NO_SUB is set.  */
223146040Stjr  bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
224146040Stjr
225146040Stjr  /* Match anchors at newline.  */
226146040Stjr  bufp->newline_anchor = 1;
227146040Stjr
228146040Stjr  ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
229146040Stjr
230146040Stjr  if (!ret)
231146040Stjr    return NULL;
232146040Stjr  return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
233146040Stjr}
234146040Stjr#ifdef _LIBC
235146040Stjrweak_alias (__re_compile_pattern, re_compile_pattern)
236146040Stjr#endif
237146040Stjr
238146040Stjr/* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
239146040Stjr   also be assigned to arbitrarily: each pattern buffer stores its own
240146040Stjr   syntax, so it can be changed between regex compilations.  */
241146040Stjr/* This has no initializer because initialized variables in Emacs
242146040Stjr   become read-only after dumping.  */
243146040Stjrreg_syntax_t re_syntax_options;
244146040Stjr
245146040Stjr
246146040Stjr/* Specify the precise syntax of regexps for compilation.  This provides
247146040Stjr   for compatibility for various utilities which historically have
248146040Stjr   different, incompatible syntaxes.
249146040Stjr
250146040Stjr   The argument SYNTAX is a bit mask comprised of the various bits
251146040Stjr   defined in regex.h.  We return the old syntax.  */
252146040Stjr
253146040Stjrreg_syntax_t
254146040Stjrre_set_syntax (syntax)
255146040Stjr    reg_syntax_t syntax;
256146040Stjr{
257146040Stjr  reg_syntax_t ret = re_syntax_options;
258146040Stjr
259146040Stjr  re_syntax_options = syntax;
260146040Stjr  return ret;
261146040Stjr}
262146040Stjr#ifdef _LIBC
263146040Stjrweak_alias (__re_set_syntax, re_set_syntax)
264146040Stjr#endif
265146040Stjr
266146040Stjrint
267146040Stjrre_compile_fastmap (bufp)
268146040Stjr    struct re_pattern_buffer *bufp;
269146040Stjr{
270146040Stjr  re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
271146040Stjr  char *fastmap = bufp->fastmap;
272146040Stjr
273146040Stjr  memset (fastmap, '\0', sizeof (char) * SBC_MAX);
274146040Stjr  re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
275146040Stjr  if (dfa->init_state != dfa->init_state_word)
276146040Stjr    re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
277146040Stjr  if (dfa->init_state != dfa->init_state_nl)
278146040Stjr    re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
279146040Stjr  if (dfa->init_state != dfa->init_state_begbuf)
280146040Stjr    re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
281146040Stjr  bufp->fastmap_accurate = 1;
282146040Stjr  return 0;
283146040Stjr}
284146040Stjr#ifdef _LIBC
285146040Stjrweak_alias (__re_compile_fastmap, re_compile_fastmap)
286146040Stjr#endif
287146040Stjr
288146040Stjrstatic inline void
289146040Stjr__attribute ((always_inline))
290146040Stjrre_set_fastmap (char *fastmap, int icase, int ch)
291146040Stjr{
292146040Stjr  fastmap[ch] = 1;
293146040Stjr  if (icase)
294146040Stjr    fastmap[tolower (ch)] = 1;
295146040Stjr}
296146040Stjr
297146040Stjr/* Helper function for re_compile_fastmap.
298146040Stjr   Compile fastmap for the initial_state INIT_STATE.  */
299146040Stjr
300146040Stjrstatic void
301250724Sjkimre_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
302250724Sjkim			 char *fastmap)
303146040Stjr{
304146040Stjr  re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
305146040Stjr  int node_cnt;
306146040Stjr  int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
307146040Stjr  for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
308146040Stjr    {
309146040Stjr      int node = init_state->nodes.elems[node_cnt];
310146040Stjr      re_token_type_t type = dfa->nodes[node].type;
311146040Stjr
312146040Stjr      if (type == CHARACTER)
313146040Stjr	{
314146040Stjr	  re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
315146040Stjr#ifdef RE_ENABLE_I18N
316146040Stjr	  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
317146040Stjr	    {
318146040Stjr	      unsigned char *buf = alloca (dfa->mb_cur_max), *p;
319146040Stjr	      wchar_t wc;
320146040Stjr	      mbstate_t state;
321146040Stjr
322146040Stjr	      p = buf;
323146040Stjr	      *p++ = dfa->nodes[node].opr.c;
324146040Stjr	      while (++node < dfa->nodes_len
325146040Stjr		     &&	dfa->nodes[node].type == CHARACTER
326146040Stjr		     && dfa->nodes[node].mb_partial)
327146040Stjr		*p++ = dfa->nodes[node].opr.c;
328250724Sjkim	      memset (&state, '\0', sizeof (state));
329250724Sjkim	      if (__mbrtowc (&wc, (const char *) buf, p - buf,
330250724Sjkim			     &state) == p - buf
331146040Stjr		  && (__wcrtomb ((char *) buf, towlower (wc), &state)
332146040Stjr		      != (size_t) -1))
333146040Stjr		re_set_fastmap (fastmap, 0, buf[0]);
334146040Stjr	    }
335146040Stjr#endif
336146040Stjr	}
337146040Stjr      else if (type == SIMPLE_BRACKET)
338146040Stjr	{
339250724Sjkim	  int i, ch;
340250724Sjkim	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
341250724Sjkim	    {
342250724Sjkim	      int j;
343250724Sjkim	      bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
344250724Sjkim	      for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
345250724Sjkim		if (w & ((bitset_word_t) 1 << j))
346250724Sjkim		  re_set_fastmap (fastmap, icase, ch);
347250724Sjkim	    }
348146040Stjr	}
349146040Stjr#ifdef RE_ENABLE_I18N
350146040Stjr      else if (type == COMPLEX_BRACKET)
351146040Stjr	{
352250724Sjkim	  re_charset_t *cset = dfa->nodes[node].opr.mbcset;
353146040Stjr	  int i;
354250724Sjkim
355146040Stjr# ifdef _LIBC
356250724Sjkim	  /* See if we have to try all bytes which start multiple collation
357250724Sjkim	     elements.
358250724Sjkim	     e.g. In da_DK, we want to catch 'a' since "aa" is a valid
359250724Sjkim		  collation element, and don't catch 'b' since 'b' is
360250724Sjkim		  the only collation element which starts from 'b' (and
361250724Sjkim		  it is caught by SIMPLE_BRACKET).  */
362250724Sjkim	      if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
363250724Sjkim		  && (cset->ncoll_syms || cset->nranges))
364146040Stjr		{
365146040Stjr		  const int32_t *table = (const int32_t *)
366146040Stjr		    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
367250724Sjkim		  for (i = 0; i < SBC_MAX; ++i)
368250724Sjkim		    if (table[i] < 0)
369250724Sjkim		      re_set_fastmap (fastmap, icase, i);
370146040Stjr		}
371250724Sjkim# endif /* _LIBC */
372250724Sjkim
373250724Sjkim	  /* See if we have to start the match at all multibyte characters,
374250724Sjkim	     i.e. where we would not find an invalid sequence.  This only
375250724Sjkim	     applies to multibyte character sets; for single byte character
376250724Sjkim	     sets, the SIMPLE_BRACKET again suffices.  */
377250724Sjkim	  if (dfa->mb_cur_max > 1
378250724Sjkim	      && (cset->nchar_classes || cset->non_match || cset->nranges
379250724Sjkim# ifdef _LIBC
380250724Sjkim		  || cset->nequiv_classes
381250724Sjkim# endif /* _LIBC */
382250724Sjkim		 ))
383250724Sjkim	    {
384250724Sjkim	      unsigned char c = 0;
385250724Sjkim	      do
386250724Sjkim		{
387250724Sjkim		  mbstate_t mbs;
388250724Sjkim		  memset (&mbs, 0, sizeof (mbs));
389250724Sjkim		  if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
390250724Sjkim		    re_set_fastmap (fastmap, false, (int) c);
391250724Sjkim		}
392250724Sjkim	      while (++c != 0);
393146040Stjr	    }
394250724Sjkim
395250724Sjkim	  else
396146040Stjr	    {
397250724Sjkim	      /* ... Else catch all bytes which can start the mbchars.  */
398250724Sjkim	      for (i = 0; i < cset->nmbchars; ++i)
399146040Stjr		{
400250724Sjkim		  char buf[256];
401250724Sjkim		  mbstate_t state;
402250724Sjkim		  memset (&state, '\0', sizeof (state));
403250724Sjkim		  if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
404250724Sjkim		    re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
405250724Sjkim		  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
406250724Sjkim		    {
407250724Sjkim		      if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
408250724Sjkim			  != (size_t) -1)
409250724Sjkim			re_set_fastmap (fastmap, false, *(unsigned char *) buf);
410250724Sjkim		    }
411146040Stjr		}
412146040Stjr	    }
413146040Stjr	}
414146040Stjr#endif /* RE_ENABLE_I18N */
415146040Stjr      else if (type == OP_PERIOD
416146040Stjr#ifdef RE_ENABLE_I18N
417146040Stjr	       || type == OP_UTF8_PERIOD
418146040Stjr#endif /* RE_ENABLE_I18N */
419146040Stjr	       || type == END_OF_RE)
420146040Stjr	{
421146040Stjr	  memset (fastmap, '\1', sizeof (char) * SBC_MAX);
422146040Stjr	  if (type == END_OF_RE)
423146040Stjr	    bufp->can_be_null = 1;
424146040Stjr	  return;
425146040Stjr	}
426146040Stjr    }
427146040Stjr}
428146040Stjr
429146040Stjr/* Entry point for POSIX code.  */
430146040Stjr/* regcomp takes a regular expression as a string and compiles it.
431146040Stjr
432146040Stjr   PREG is a regex_t *.  We do not expect any fields to be initialized,
433146040Stjr   since POSIX says we shouldn't.  Thus, we set
434146040Stjr
435146040Stjr     `buffer' to the compiled pattern;
436146040Stjr     `used' to the length of the compiled pattern;
437146040Stjr     `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
438146040Stjr       REG_EXTENDED bit in CFLAGS is set; otherwise, to
439146040Stjr       RE_SYNTAX_POSIX_BASIC;
440146040Stjr     `newline_anchor' to REG_NEWLINE being set in CFLAGS;
441146040Stjr     `fastmap' to an allocated space for the fastmap;
442146040Stjr     `fastmap_accurate' to zero;
443146040Stjr     `re_nsub' to the number of subexpressions in PATTERN.
444146040Stjr
445146040Stjr   PATTERN is the address of the pattern string.
446146040Stjr
447146040Stjr   CFLAGS is a series of bits which affect compilation.
448146040Stjr
449146040Stjr     If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
450146040Stjr     use POSIX basic syntax.
451146040Stjr
452146040Stjr     If REG_NEWLINE is set, then . and [^...] don't match newline.
453146040Stjr     Also, regexec will try a match beginning after every newline.
454146040Stjr
455146040Stjr     If REG_ICASE is set, then we considers upper- and lowercase
456146040Stjr     versions of letters to be equivalent when matching.
457146040Stjr
458146040Stjr     If REG_NOSUB is set, then when PREG is passed to regexec, that
459146040Stjr     routine will report only success or failure, and nothing about the
460146040Stjr     registers.
461146040Stjr
462146040Stjr   It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
463146040Stjr   the return codes and their meanings.)  */
464146040Stjr
465146040Stjrint
466146040Stjrregcomp (preg, pattern, cflags)
467146040Stjr    regex_t *__restrict preg;
468146040Stjr    const char *__restrict pattern;
469146040Stjr    int cflags;
470146040Stjr{
471146040Stjr  reg_errcode_t ret;
472146040Stjr  reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
473146040Stjr			 : RE_SYNTAX_POSIX_BASIC);
474146040Stjr
475146040Stjr  preg->buffer = NULL;
476146040Stjr  preg->allocated = 0;
477146040Stjr  preg->used = 0;
478146040Stjr
479146040Stjr  /* Try to allocate space for the fastmap.  */
480146040Stjr  preg->fastmap = re_malloc (char, SBC_MAX);
481146040Stjr  if (BE (preg->fastmap == NULL, 0))
482146040Stjr    return REG_ESPACE;
483146040Stjr
484146040Stjr  syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
485146040Stjr
486146040Stjr  /* If REG_NEWLINE is set, newlines are treated differently.  */
487146040Stjr  if (cflags & REG_NEWLINE)
488146040Stjr    { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
489146040Stjr      syntax &= ~RE_DOT_NEWLINE;
490146040Stjr      syntax |= RE_HAT_LISTS_NOT_NEWLINE;
491146040Stjr      /* It also changes the matching behavior.  */
492146040Stjr      preg->newline_anchor = 1;
493146040Stjr    }
494146040Stjr  else
495146040Stjr    preg->newline_anchor = 0;
496146040Stjr  preg->no_sub = !!(cflags & REG_NOSUB);
497146040Stjr  preg->translate = NULL;
498146040Stjr
499146040Stjr  ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
500146040Stjr
501146040Stjr  /* POSIX doesn't distinguish between an unmatched open-group and an
502146040Stjr     unmatched close-group: both are REG_EPAREN.  */
503146040Stjr  if (ret == REG_ERPAREN)
504146040Stjr    ret = REG_EPAREN;
505146040Stjr
506146040Stjr  /* We have already checked preg->fastmap != NULL.  */
507146040Stjr  if (BE (ret == REG_NOERROR, 1))
508146040Stjr    /* Compute the fastmap now, since regexec cannot modify the pattern
509146040Stjr       buffer.  This function never fails in this implementation.  */
510146040Stjr    (void) re_compile_fastmap (preg);
511146040Stjr  else
512146040Stjr    {
513146040Stjr      /* Some error occurred while compiling the expression.  */
514146040Stjr      re_free (preg->fastmap);
515146040Stjr      preg->fastmap = NULL;
516146040Stjr    }
517146040Stjr
518146040Stjr  return (int) ret;
519146040Stjr}
520146040Stjr#ifdef _LIBC
521146040Stjrweak_alias (__regcomp, regcomp)
522146040Stjr#endif
523146040Stjr
524146040Stjr/* Returns a message corresponding to an error code, ERRCODE, returned
525146040Stjr   from either regcomp or regexec.   We don't use PREG here.  */
526146040Stjr
527146040Stjrsize_t
528146040Stjrregerror (errcode, preg, errbuf, errbuf_size)
529146040Stjr    int errcode;
530250724Sjkim    const regex_t *__restrict preg;
531250724Sjkim    char *__restrict errbuf;
532146040Stjr    size_t errbuf_size;
533146040Stjr{
534146040Stjr  const char *msg;
535146040Stjr  size_t msg_size;
536146040Stjr
537146040Stjr  if (BE (errcode < 0
538146040Stjr	  || errcode >= (int) (sizeof (__re_error_msgid_idx)
539146040Stjr			       / sizeof (__re_error_msgid_idx[0])), 0))
540146040Stjr    /* Only error codes returned by the rest of the code should be passed
541146040Stjr       to this routine.  If we are given anything else, or if other regex
542146040Stjr       code generates an invalid error code, then the program has a bug.
543146040Stjr       Dump core so we can fix it.  */
544146040Stjr    abort ();
545146040Stjr
546146040Stjr  msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
547146040Stjr
548146040Stjr  msg_size = strlen (msg) + 1; /* Includes the null.  */
549146040Stjr
550146040Stjr  if (BE (errbuf_size != 0, 1))
551146040Stjr    {
552146040Stjr      if (BE (msg_size > errbuf_size, 0))
553146040Stjr	{
554146040Stjr#if defined HAVE_MEMPCPY || defined _LIBC
555146040Stjr	  *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
556146040Stjr#else
557146040Stjr	  memcpy (errbuf, msg, errbuf_size - 1);
558146040Stjr	  errbuf[errbuf_size - 1] = 0;
559146040Stjr#endif
560146040Stjr	}
561146040Stjr      else
562146040Stjr	memcpy (errbuf, msg, msg_size);
563146040Stjr    }
564146040Stjr
565146040Stjr  return msg_size;
566146040Stjr}
567146040Stjr#ifdef _LIBC
568146040Stjrweak_alias (__regerror, regerror)
569146040Stjr#endif
570146040Stjr
571146040Stjr
572146040Stjr#ifdef RE_ENABLE_I18N
573146040Stjr/* This static array is used for the map to single-byte characters when
574146040Stjr   UTF-8 is used.  Otherwise we would allocate memory just to initialize
575146040Stjr   it the same all the time.  UTF-8 is the preferred encoding so this is
576146040Stjr   a worthwhile optimization.  */
577250724Sjkimstatic const bitset_t utf8_sb_map =
578146040Stjr{
579146040Stjr  /* Set the first 128 bits.  */
580250724Sjkim  [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
581146040Stjr};
582146040Stjr#endif
583146040Stjr
584146040Stjr
585146040Stjrstatic void
586146040Stjrfree_dfa_content (re_dfa_t *dfa)
587146040Stjr{
588146040Stjr  int i, j;
589146040Stjr
590146040Stjr  if (dfa->nodes)
591146040Stjr    for (i = 0; i < dfa->nodes_len; ++i)
592146040Stjr      free_token (dfa->nodes + i);
593146040Stjr  re_free (dfa->nexts);
594146040Stjr  for (i = 0; i < dfa->nodes_len; ++i)
595146040Stjr    {
596146040Stjr      if (dfa->eclosures != NULL)
597146040Stjr	re_node_set_free (dfa->eclosures + i);
598146040Stjr      if (dfa->inveclosures != NULL)
599146040Stjr	re_node_set_free (dfa->inveclosures + i);
600146040Stjr      if (dfa->edests != NULL)
601146040Stjr	re_node_set_free (dfa->edests + i);
602146040Stjr    }
603146040Stjr  re_free (dfa->edests);
604146040Stjr  re_free (dfa->eclosures);
605146040Stjr  re_free (dfa->inveclosures);
606146040Stjr  re_free (dfa->nodes);
607146040Stjr
608146040Stjr  if (dfa->state_table)
609146040Stjr    for (i = 0; i <= dfa->state_hash_mask; ++i)
610146040Stjr      {
611146040Stjr	struct re_state_table_entry *entry = dfa->state_table + i;
612146040Stjr	for (j = 0; j < entry->num; ++j)
613146040Stjr	  {
614146040Stjr	    re_dfastate_t *state = entry->array[j];
615146040Stjr	    free_state (state);
616146040Stjr	  }
617250724Sjkim	re_free (entry->array);
618146040Stjr      }
619146040Stjr  re_free (dfa->state_table);
620146040Stjr#ifdef RE_ENABLE_I18N
621146040Stjr  if (dfa->sb_char != utf8_sb_map)
622146040Stjr    re_free (dfa->sb_char);
623146040Stjr#endif
624146040Stjr  re_free (dfa->subexp_map);
625146040Stjr#ifdef DEBUG
626146040Stjr  re_free (dfa->re_str);
627146040Stjr#endif
628146040Stjr
629146040Stjr  re_free (dfa);
630146040Stjr}
631146040Stjr
632146040Stjr
633146040Stjr/* Free dynamically allocated space used by PREG.  */
634146040Stjr
635146040Stjrvoid
636146040Stjrregfree (preg)
637146040Stjr    regex_t *preg;
638146040Stjr{
639146040Stjr  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
640146040Stjr  if (BE (dfa != NULL, 1))
641146040Stjr    free_dfa_content (dfa);
642146040Stjr  preg->buffer = NULL;
643146040Stjr  preg->allocated = 0;
644146040Stjr
645146040Stjr  re_free (preg->fastmap);
646146040Stjr  preg->fastmap = NULL;
647146040Stjr
648146040Stjr  re_free (preg->translate);
649146040Stjr  preg->translate = NULL;
650146040Stjr}
651146040Stjr#ifdef _LIBC
652146040Stjrweak_alias (__regfree, regfree)
653146040Stjr#endif
654146040Stjr
655146040Stjr/* Entry points compatible with 4.2 BSD regex library.  We don't define
656146040Stjr   them unless specifically requested.  */
657146040Stjr
658146040Stjr#if defined _REGEX_RE_COMP || defined _LIBC
659146040Stjr
660146040Stjr/* BSD has one and only one pattern buffer.  */
661146040Stjrstatic struct re_pattern_buffer re_comp_buf;
662146040Stjr
663146040Stjrchar *
664146040Stjr# ifdef _LIBC
665146040Stjr/* Make these definitions weak in libc, so POSIX programs can redefine
666146040Stjr   these names if they don't use our functions, and still use
667146040Stjr   regcomp/regexec above without link errors.  */
668146040Stjrweak_function
669146040Stjr# endif
670146040Stjrre_comp (s)
671146040Stjr     const char *s;
672146040Stjr{
673146040Stjr  reg_errcode_t ret;
674146040Stjr  char *fastmap;
675146040Stjr
676146040Stjr  if (!s)
677146040Stjr    {
678146040Stjr      if (!re_comp_buf.buffer)
679146040Stjr	return gettext ("No previous regular expression");
680146040Stjr      return 0;
681146040Stjr    }
682146040Stjr
683146040Stjr  if (re_comp_buf.buffer)
684146040Stjr    {
685146040Stjr      fastmap = re_comp_buf.fastmap;
686146040Stjr      re_comp_buf.fastmap = NULL;
687146040Stjr      __regfree (&re_comp_buf);
688146040Stjr      memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
689146040Stjr      re_comp_buf.fastmap = fastmap;
690146040Stjr    }
691146040Stjr
692146040Stjr  if (re_comp_buf.fastmap == NULL)
693146040Stjr    {
694146040Stjr      re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
695146040Stjr      if (re_comp_buf.fastmap == NULL)
696146040Stjr	return (char *) gettext (__re_error_msgid
697146040Stjr				 + __re_error_msgid_idx[(int) REG_ESPACE]);
698146040Stjr    }
699146040Stjr
700146040Stjr  /* Since `re_exec' always passes NULL for the `regs' argument, we
701146040Stjr     don't need to initialize the pattern buffer fields which affect it.  */
702146040Stjr
703146040Stjr  /* Match anchors at newlines.  */
704146040Stjr  re_comp_buf.newline_anchor = 1;
705146040Stjr
706146040Stjr  ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
707146040Stjr
708146040Stjr  if (!ret)
709146040Stjr    return NULL;
710146040Stjr
711146040Stjr  /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
712146040Stjr  return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
713146040Stjr}
714146040Stjr
715146040Stjr#ifdef _LIBC
716146040Stjrlibc_freeres_fn (free_mem)
717146040Stjr{
718146040Stjr  __regfree (&re_comp_buf);
719146040Stjr}
720146040Stjr#endif
721146040Stjr
722146040Stjr#endif /* _REGEX_RE_COMP */
723146040Stjr
724146040Stjr/* Internal entry point.
725146040Stjr   Compile the regular expression PATTERN, whose length is LENGTH.
726146040Stjr   SYNTAX indicate regular expression's syntax.  */
727146040Stjr
728146040Stjrstatic reg_errcode_t
729250724Sjkimre_compile_internal (regex_t *preg, const char * pattern, size_t length,
730250724Sjkim		     reg_syntax_t syntax)
731146040Stjr{
732146040Stjr  reg_errcode_t err = REG_NOERROR;
733146040Stjr  re_dfa_t *dfa;
734146040Stjr  re_string_t regexp;
735146040Stjr
736146040Stjr  /* Initialize the pattern buffer.  */
737146040Stjr  preg->fastmap_accurate = 0;
738146040Stjr  preg->syntax = syntax;
739146040Stjr  preg->not_bol = preg->not_eol = 0;
740146040Stjr  preg->used = 0;
741146040Stjr  preg->re_nsub = 0;
742146040Stjr  preg->can_be_null = 0;
743146040Stjr  preg->regs_allocated = REGS_UNALLOCATED;
744146040Stjr
745146040Stjr  /* Initialize the dfa.  */
746146040Stjr  dfa = (re_dfa_t *) preg->buffer;
747146040Stjr  if (BE (preg->allocated < sizeof (re_dfa_t), 0))
748146040Stjr    {
749146040Stjr      /* If zero allocated, but buffer is non-null, try to realloc
750146040Stjr	 enough space.  This loses if buffer's address is bogus, but
751146040Stjr	 that is the user's responsibility.  If ->buffer is NULL this
752146040Stjr	 is a simple allocation.  */
753146040Stjr      dfa = re_realloc (preg->buffer, re_dfa_t, 1);
754146040Stjr      if (dfa == NULL)
755146040Stjr	return REG_ESPACE;
756146040Stjr      preg->allocated = sizeof (re_dfa_t);
757146040Stjr      preg->buffer = (unsigned char *) dfa;
758146040Stjr    }
759146040Stjr  preg->used = sizeof (re_dfa_t);
760146040Stjr
761146040Stjr  err = init_dfa (dfa, length);
762146040Stjr  if (BE (err != REG_NOERROR, 0))
763146040Stjr    {
764146040Stjr      free_dfa_content (dfa);
765146040Stjr      preg->buffer = NULL;
766146040Stjr      preg->allocated = 0;
767146040Stjr      return err;
768146040Stjr    }
769146040Stjr#ifdef DEBUG
770250724Sjkim  /* Note: length+1 will not overflow since it is checked in init_dfa.  */
771146040Stjr  dfa->re_str = re_malloc (char, length + 1);
772146040Stjr  strncpy (dfa->re_str, pattern, length + 1);
773146040Stjr#endif
774146040Stjr
775250724Sjkim  __libc_lock_init (dfa->lock);
776250724Sjkim
777146040Stjr  err = re_string_construct (&regexp, pattern, length, preg->translate,
778146040Stjr			     syntax & RE_ICASE, dfa);
779146040Stjr  if (BE (err != REG_NOERROR, 0))
780146040Stjr    {
781146040Stjr    re_compile_internal_free_return:
782146040Stjr      free_workarea_compile (preg);
783146040Stjr      re_string_destruct (&regexp);
784146040Stjr      free_dfa_content (dfa);
785146040Stjr      preg->buffer = NULL;
786146040Stjr      preg->allocated = 0;
787146040Stjr      return err;
788146040Stjr    }
789146040Stjr
790146040Stjr  /* Parse the regular expression, and build a structure tree.  */
791146040Stjr  preg->re_nsub = 0;
792146040Stjr  dfa->str_tree = parse (&regexp, preg, syntax, &err);
793146040Stjr  if (BE (dfa->str_tree == NULL, 0))
794146040Stjr    goto re_compile_internal_free_return;
795146040Stjr
796146040Stjr  /* Analyze the tree and create the nfa.  */
797146040Stjr  err = analyze (preg);
798146040Stjr  if (BE (err != REG_NOERROR, 0))
799146040Stjr    goto re_compile_internal_free_return;
800146040Stjr
801146040Stjr#ifdef RE_ENABLE_I18N
802146040Stjr  /* If possible, do searching in single byte encoding to speed things up.  */
803146040Stjr  if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
804146040Stjr    optimize_utf8 (dfa);
805146040Stjr#endif
806146040Stjr
807146040Stjr  /* Then create the initial state of the dfa.  */
808146040Stjr  err = create_initial_state (dfa);
809146040Stjr
810146040Stjr  /* Release work areas.  */
811146040Stjr  free_workarea_compile (preg);
812146040Stjr  re_string_destruct (&regexp);
813146040Stjr
814146040Stjr  if (BE (err != REG_NOERROR, 0))
815146040Stjr    {
816146040Stjr      free_dfa_content (dfa);
817146040Stjr      preg->buffer = NULL;
818146040Stjr      preg->allocated = 0;
819146040Stjr    }
820146040Stjr
821146040Stjr  return err;
822146040Stjr}
823146040Stjr
824146040Stjr/* Initialize DFA.  We use the length of the regular expression PAT_LEN
825146040Stjr   as the initial length of some arrays.  */
826146040Stjr
827146040Stjrstatic reg_errcode_t
828250724Sjkiminit_dfa (re_dfa_t *dfa, size_t pat_len)
829146040Stjr{
830250724Sjkim  unsigned int table_size;
831146040Stjr#ifndef _LIBC
832146040Stjr  char *codeset_name;
833146040Stjr#endif
834146040Stjr
835146040Stjr  memset (dfa, '\0', sizeof (re_dfa_t));
836146040Stjr
837146040Stjr  /* Force allocation of str_tree_storage the first time.  */
838146040Stjr  dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
839146040Stjr
840250724Sjkim  /* Avoid overflows.  */
841250724Sjkim  if (pat_len == SIZE_MAX)
842250724Sjkim    return REG_ESPACE;
843250724Sjkim
844146040Stjr  dfa->nodes_alloc = pat_len + 1;
845146040Stjr  dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
846146040Stjr
847146040Stjr  /*  table_size = 2 ^ ceil(log pat_len) */
848250724Sjkim  for (table_size = 1; ; table_size <<= 1)
849146040Stjr    if (table_size > pat_len)
850146040Stjr      break;
851146040Stjr
852146040Stjr  dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
853146040Stjr  dfa->state_hash_mask = table_size - 1;
854146040Stjr
855146040Stjr  dfa->mb_cur_max = MB_CUR_MAX;
856146040Stjr#ifdef _LIBC
857146040Stjr  if (dfa->mb_cur_max == 6
858146040Stjr      && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
859146040Stjr    dfa->is_utf8 = 1;
860146040Stjr  dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
861146040Stjr		       != 0);
862146040Stjr#else
863146040Stjr# ifdef HAVE_LANGINFO_CODESET
864146040Stjr  codeset_name = nl_langinfo (CODESET);
865146040Stjr# else
866146040Stjr  codeset_name = getenv ("LC_ALL");
867146040Stjr  if (codeset_name == NULL || codeset_name[0] == '\0')
868146040Stjr    codeset_name = getenv ("LC_CTYPE");
869146040Stjr  if (codeset_name == NULL || codeset_name[0] == '\0')
870146040Stjr    codeset_name = getenv ("LANG");
871146040Stjr  if (codeset_name == NULL)
872146040Stjr    codeset_name = "";
873146040Stjr  else if (strchr (codeset_name, '.') !=  NULL)
874146040Stjr    codeset_name = strchr (codeset_name, '.') + 1;
875146040Stjr# endif
876146040Stjr
877146040Stjr  if (strcasecmp (codeset_name, "UTF-8") == 0
878146040Stjr      || strcasecmp (codeset_name, "UTF8") == 0)
879146040Stjr    dfa->is_utf8 = 1;
880146040Stjr
881146040Stjr  /* We check exhaustively in the loop below if this charset is a
882146040Stjr     superset of ASCII.  */
883146040Stjr  dfa->map_notascii = 0;
884146040Stjr#endif
885146040Stjr
886146040Stjr#ifdef RE_ENABLE_I18N
887146040Stjr  if (dfa->mb_cur_max > 1)
888146040Stjr    {
889146040Stjr      if (dfa->is_utf8)
890146040Stjr	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
891146040Stjr      else
892146040Stjr	{
893146040Stjr	  int i, j, ch;
894146040Stjr
895250724Sjkim	  dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
896146040Stjr	  if (BE (dfa->sb_char == NULL, 0))
897146040Stjr	    return REG_ESPACE;
898146040Stjr
899250724Sjkim	  /* Set the bits corresponding to single byte chars.  */
900250724Sjkim	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
901250724Sjkim	    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
902146040Stjr	      {
903250724Sjkim		wint_t wch = __btowc (ch);
904146040Stjr		if (wch != WEOF)
905250724Sjkim		  dfa->sb_char[i] |= (bitset_word_t) 1 << j;
906146040Stjr# ifndef _LIBC
907250724Sjkim		if (isascii (ch) && wch != ch)
908146040Stjr		  dfa->map_notascii = 1;
909146040Stjr# endif
910146040Stjr	      }
911146040Stjr	}
912146040Stjr    }
913146040Stjr#endif
914146040Stjr
915146040Stjr  if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
916146040Stjr    return REG_ESPACE;
917146040Stjr  return REG_NOERROR;
918146040Stjr}
919146040Stjr
920146040Stjr/* Initialize WORD_CHAR table, which indicate which character is
921146040Stjr   "word".  In this case "word" means that it is the word construction
922146040Stjr   character used by some operators like "\<", "\>", etc.  */
923146040Stjr
924146040Stjrstatic void
925250724Sjkiminternal_function
926250724Sjkiminit_word_char (re_dfa_t *dfa)
927146040Stjr{
928146040Stjr  dfa->word_ops_used = 1;
929250724Sjkim  int i = 0;
930250724Sjkim  int ch = 0;
931250724Sjkim  if (BE (dfa->map_notascii == 0, 1))
932250724Sjkim    {
933250724Sjkim      if (sizeof (dfa->word_char[0]) == 8)
934250724Sjkim	{
935250724Sjkim          /* The extra temporaries here avoid "implicitly truncated"
936250724Sjkim             warnings in the case when this is dead code, i.e. 32-bit.  */
937250724Sjkim          const uint64_t wc0 = UINT64_C (0x03ff000000000000);
938250724Sjkim          const uint64_t wc1 = UINT64_C (0x07fffffe87fffffe);
939250724Sjkim	  dfa->word_char[0] = wc0;
940250724Sjkim	  dfa->word_char[1] = wc1;
941250724Sjkim	  i = 2;
942250724Sjkim	}
943250724Sjkim      else if (sizeof (dfa->word_char[0]) == 4)
944250724Sjkim	{
945250724Sjkim	  dfa->word_char[0] = UINT32_C (0x00000000);
946250724Sjkim	  dfa->word_char[1] = UINT32_C (0x03ff0000);
947250724Sjkim	  dfa->word_char[2] = UINT32_C (0x87fffffe);
948250724Sjkim	  dfa->word_char[3] = UINT32_C (0x07fffffe);
949250724Sjkim	  i = 4;
950250724Sjkim	}
951250724Sjkim      else
952250724Sjkim	abort ();
953250724Sjkim      ch = 128;
954250724Sjkim
955250724Sjkim      if (BE (dfa->is_utf8, 1))
956250724Sjkim	{
957250724Sjkim	  memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
958250724Sjkim	  return;
959250724Sjkim	}
960250724Sjkim    }
961250724Sjkim
962250724Sjkim  for (; i < BITSET_WORDS; ++i)
963250724Sjkim    for (int j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
964146040Stjr      if (isalnum (ch) || ch == '_')
965250724Sjkim	dfa->word_char[i] |= (bitset_word_t) 1 << j;
966146040Stjr}
967146040Stjr
968146040Stjr/* Free the work area which are only used while compiling.  */
969146040Stjr
970146040Stjrstatic void
971250724Sjkimfree_workarea_compile (regex_t *preg)
972146040Stjr{
973146040Stjr  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
974146040Stjr  bin_tree_storage_t *storage, *next;
975146040Stjr  for (storage = dfa->str_tree_storage; storage; storage = next)
976146040Stjr    {
977146040Stjr      next = storage->next;
978146040Stjr      re_free (storage);
979146040Stjr    }
980146040Stjr  dfa->str_tree_storage = NULL;
981146040Stjr  dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
982146040Stjr  dfa->str_tree = NULL;
983146040Stjr  re_free (dfa->org_indices);
984146040Stjr  dfa->org_indices = NULL;
985146040Stjr}
986146040Stjr
987146040Stjr/* Create initial states for all contexts.  */
988146040Stjr
989146040Stjrstatic reg_errcode_t
990250724Sjkimcreate_initial_state (re_dfa_t *dfa)
991146040Stjr{
992146040Stjr  int first, i;
993146040Stjr  reg_errcode_t err;
994146040Stjr  re_node_set init_nodes;
995146040Stjr
996146040Stjr  /* Initial states have the epsilon closure of the node which is
997146040Stjr     the first node of the regular expression.  */
998146040Stjr  first = dfa->str_tree->first->node_idx;
999146040Stjr  dfa->init_node = first;
1000146040Stjr  err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1001146040Stjr  if (BE (err != REG_NOERROR, 0))
1002146040Stjr    return err;
1003146040Stjr
1004146040Stjr  /* The back-references which are in initial states can epsilon transit,
1005146040Stjr     since in this case all of the subexpressions can be null.
1006146040Stjr     Then we add epsilon closures of the nodes which are the next nodes of
1007146040Stjr     the back-references.  */
1008146040Stjr  if (dfa->nbackref > 0)
1009146040Stjr    for (i = 0; i < init_nodes.nelem; ++i)
1010146040Stjr      {
1011146040Stjr	int node_idx = init_nodes.elems[i];
1012146040Stjr	re_token_type_t type = dfa->nodes[node_idx].type;
1013146040Stjr
1014146040Stjr	int clexp_idx;
1015146040Stjr	if (type != OP_BACK_REF)
1016146040Stjr	  continue;
1017146040Stjr	for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1018146040Stjr	  {
1019146040Stjr	    re_token_t *clexp_node;
1020146040Stjr	    clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1021146040Stjr	    if (clexp_node->type == OP_CLOSE_SUBEXP
1022146040Stjr		&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1023146040Stjr	      break;
1024146040Stjr	  }
1025146040Stjr	if (clexp_idx == init_nodes.nelem)
1026146040Stjr	  continue;
1027146040Stjr
1028146040Stjr	if (type == OP_BACK_REF)
1029146040Stjr	  {
1030146040Stjr	    int dest_idx = dfa->edests[node_idx].elems[0];
1031146040Stjr	    if (!re_node_set_contains (&init_nodes, dest_idx))
1032146040Stjr	      {
1033250724Sjkim		reg_errcode_t err = re_node_set_merge (&init_nodes,
1034250724Sjkim						       dfa->eclosures
1035250724Sjkim						       + dest_idx);
1036250724Sjkim		if (err != REG_NOERROR)
1037250724Sjkim		  return err;
1038146040Stjr		i = 0;
1039146040Stjr	      }
1040146040Stjr	  }
1041146040Stjr      }
1042146040Stjr
1043146040Stjr  /* It must be the first time to invoke acquire_state.  */
1044146040Stjr  dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1045146040Stjr  /* We don't check ERR here, since the initial state must not be NULL.  */
1046146040Stjr  if (BE (dfa->init_state == NULL, 0))
1047146040Stjr    return err;
1048146040Stjr  if (dfa->init_state->has_constraint)
1049146040Stjr    {
1050146040Stjr      dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1051146040Stjr						       CONTEXT_WORD);
1052146040Stjr      dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1053146040Stjr						     CONTEXT_NEWLINE);
1054146040Stjr      dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1055146040Stjr							 &init_nodes,
1056146040Stjr							 CONTEXT_NEWLINE
1057146040Stjr							 | CONTEXT_BEGBUF);
1058146040Stjr      if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1059146040Stjr	      || dfa->init_state_begbuf == NULL, 0))
1060146040Stjr	return err;
1061146040Stjr    }
1062146040Stjr  else
1063146040Stjr    dfa->init_state_word = dfa->init_state_nl
1064146040Stjr      = dfa->init_state_begbuf = dfa->init_state;
1065146040Stjr
1066146040Stjr  re_node_set_free (&init_nodes);
1067146040Stjr  return REG_NOERROR;
1068146040Stjr}
1069146040Stjr
1070146040Stjr#ifdef RE_ENABLE_I18N
1071146040Stjr/* If it is possible to do searching in single byte encoding instead of UTF-8
1072146040Stjr   to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1073146040Stjr   DFA nodes where needed.  */
1074146040Stjr
1075146040Stjrstatic void
1076250724Sjkimoptimize_utf8 (re_dfa_t *dfa)
1077146040Stjr{
1078146040Stjr  int node, i, mb_chars = 0, has_period = 0;
1079146040Stjr
1080146040Stjr  for (node = 0; node < dfa->nodes_len; ++node)
1081146040Stjr    switch (dfa->nodes[node].type)
1082146040Stjr      {
1083146040Stjr      case CHARACTER:
1084146040Stjr	if (dfa->nodes[node].opr.c >= 0x80)
1085146040Stjr	  mb_chars = 1;
1086146040Stjr	break;
1087146040Stjr      case ANCHOR:
1088250724Sjkim	switch (dfa->nodes[node].opr.ctx_type)
1089146040Stjr	  {
1090146040Stjr	  case LINE_FIRST:
1091146040Stjr	  case LINE_LAST:
1092146040Stjr	  case BUF_FIRST:
1093146040Stjr	  case BUF_LAST:
1094146040Stjr	    break;
1095146040Stjr	  default:
1096250724Sjkim	    /* Word anchors etc. cannot be handled.  It's okay to test
1097250724Sjkim	       opr.ctx_type since constraints (for all DFA nodes) are
1098250724Sjkim	       created by ORing one or more opr.ctx_type values.  */
1099146040Stjr	    return;
1100146040Stjr	  }
1101146040Stjr	break;
1102146040Stjr      case OP_PERIOD:
1103250724Sjkim	has_period = 1;
1104250724Sjkim	break;
1105146040Stjr      case OP_BACK_REF:
1106146040Stjr      case OP_ALT:
1107146040Stjr      case END_OF_RE:
1108146040Stjr      case OP_DUP_ASTERISK:
1109146040Stjr      case OP_OPEN_SUBEXP:
1110146040Stjr      case OP_CLOSE_SUBEXP:
1111146040Stjr	break;
1112146040Stjr      case COMPLEX_BRACKET:
1113146040Stjr	return;
1114146040Stjr      case SIMPLE_BRACKET:
1115250724Sjkim	/* Just double check.  The non-ASCII range starts at 0x80.  */
1116250724Sjkim	assert (0x80 % BITSET_WORD_BITS == 0);
1117250724Sjkim	for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1118146040Stjr	  if (dfa->nodes[node].opr.sbcset[i])
1119146040Stjr	    return;
1120146040Stjr	break;
1121146040Stjr      default:
1122146040Stjr	abort ();
1123146040Stjr      }
1124146040Stjr
1125146040Stjr  if (mb_chars || has_period)
1126146040Stjr    for (node = 0; node < dfa->nodes_len; ++node)
1127146040Stjr      {
1128146040Stjr	if (dfa->nodes[node].type == CHARACTER
1129146040Stjr	    && dfa->nodes[node].opr.c >= 0x80)
1130146040Stjr	  dfa->nodes[node].mb_partial = 0;
1131146040Stjr	else if (dfa->nodes[node].type == OP_PERIOD)
1132146040Stjr	  dfa->nodes[node].type = OP_UTF8_PERIOD;
1133146040Stjr      }
1134146040Stjr
1135146040Stjr  /* The search can be in single byte locale.  */
1136146040Stjr  dfa->mb_cur_max = 1;
1137146040Stjr  dfa->is_utf8 = 0;
1138146040Stjr  dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1139146040Stjr}
1140146040Stjr#endif
1141146040Stjr
1142146040Stjr/* Analyze the structure tree, and calculate "first", "next", "edest",
1143146040Stjr   "eclosure", and "inveclosure".  */
1144146040Stjr
1145146040Stjrstatic reg_errcode_t
1146250724Sjkimanalyze (regex_t *preg)
1147146040Stjr{
1148146040Stjr  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1149146040Stjr  reg_errcode_t ret;
1150146040Stjr
1151146040Stjr  /* Allocate arrays.  */
1152146040Stjr  dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1153146040Stjr  dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1154146040Stjr  dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1155146040Stjr  dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1156146040Stjr  if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1157146040Stjr	  || dfa->eclosures == NULL, 0))
1158146040Stjr    return REG_ESPACE;
1159146040Stjr
1160146040Stjr  dfa->subexp_map = re_malloc (int, preg->re_nsub);
1161146040Stjr  if (dfa->subexp_map != NULL)
1162146040Stjr    {
1163146040Stjr      int i;
1164146040Stjr      for (i = 0; i < preg->re_nsub; i++)
1165146040Stjr	dfa->subexp_map[i] = i;
1166146040Stjr      preorder (dfa->str_tree, optimize_subexps, dfa);
1167146040Stjr      for (i = 0; i < preg->re_nsub; i++)
1168146040Stjr	if (dfa->subexp_map[i] != i)
1169146040Stjr	  break;
1170146040Stjr      if (i == preg->re_nsub)
1171146040Stjr	{
1172146040Stjr	  free (dfa->subexp_map);
1173146040Stjr	  dfa->subexp_map = NULL;
1174146040Stjr	}
1175146040Stjr    }
1176146040Stjr
1177146040Stjr  ret = postorder (dfa->str_tree, lower_subexps, preg);
1178146040Stjr  if (BE (ret != REG_NOERROR, 0))
1179146040Stjr    return ret;
1180146040Stjr  ret = postorder (dfa->str_tree, calc_first, dfa);
1181146040Stjr  if (BE (ret != REG_NOERROR, 0))
1182146040Stjr    return ret;
1183146040Stjr  preorder (dfa->str_tree, calc_next, dfa);
1184146040Stjr  ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1185146040Stjr  if (BE (ret != REG_NOERROR, 0))
1186146040Stjr    return ret;
1187146040Stjr  ret = calc_eclosure (dfa);
1188146040Stjr  if (BE (ret != REG_NOERROR, 0))
1189146040Stjr    return ret;
1190146040Stjr
1191146040Stjr  /* We only need this during the prune_impossible_nodes pass in regexec.c;
1192146040Stjr     skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1193146040Stjr  if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1194146040Stjr      || dfa->nbackref)
1195146040Stjr    {
1196146040Stjr      dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1197146040Stjr      if (BE (dfa->inveclosures == NULL, 0))
1198250724Sjkim	return REG_ESPACE;
1199146040Stjr      ret = calc_inveclosure (dfa);
1200146040Stjr    }
1201146040Stjr
1202146040Stjr  return ret;
1203146040Stjr}
1204146040Stjr
1205146040Stjr/* Our parse trees are very unbalanced, so we cannot use a stack to
1206146040Stjr   implement parse tree visits.  Instead, we use parent pointers and
1207146040Stjr   some hairy code in these two functions.  */
1208146040Stjrstatic reg_errcode_t
1209250724Sjkimpostorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1210250724Sjkim	   void *extra)
1211146040Stjr{
1212146040Stjr  bin_tree_t *node, *prev;
1213146040Stjr
1214146040Stjr  for (node = root; ; )
1215146040Stjr    {
1216146040Stjr      /* Descend down the tree, preferably to the left (or to the right
1217146040Stjr	 if that's the only child).  */
1218146040Stjr      while (node->left || node->right)
1219146040Stjr	if (node->left)
1220250724Sjkim	  node = node->left;
1221250724Sjkim	else
1222250724Sjkim	  node = node->right;
1223146040Stjr
1224146040Stjr      do
1225146040Stjr	{
1226146040Stjr	  reg_errcode_t err = fn (extra, node);
1227146040Stjr	  if (BE (err != REG_NOERROR, 0))
1228146040Stjr	    return err;
1229250724Sjkim	  if (node->parent == NULL)
1230146040Stjr	    return REG_NOERROR;
1231146040Stjr	  prev = node;
1232146040Stjr	  node = node->parent;
1233146040Stjr	}
1234146040Stjr      /* Go up while we have a node that is reached from the right.  */
1235146040Stjr      while (node->right == prev || node->right == NULL);
1236146040Stjr      node = node->right;
1237146040Stjr    }
1238146040Stjr}
1239146040Stjr
1240146040Stjrstatic reg_errcode_t
1241250724Sjkimpreorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1242250724Sjkim	  void *extra)
1243146040Stjr{
1244146040Stjr  bin_tree_t *node;
1245146040Stjr
1246146040Stjr  for (node = root; ; )
1247146040Stjr    {
1248146040Stjr      reg_errcode_t err = fn (extra, node);
1249146040Stjr      if (BE (err != REG_NOERROR, 0))
1250146040Stjr	return err;
1251146040Stjr
1252146040Stjr      /* Go to the left node, or up and to the right.  */
1253146040Stjr      if (node->left)
1254146040Stjr	node = node->left;
1255146040Stjr      else
1256146040Stjr	{
1257146040Stjr	  bin_tree_t *prev = NULL;
1258146040Stjr	  while (node->right == prev || node->right == NULL)
1259146040Stjr	    {
1260146040Stjr	      prev = node;
1261146040Stjr	      node = node->parent;
1262146040Stjr	      if (!node)
1263250724Sjkim		return REG_NOERROR;
1264146040Stjr	    }
1265146040Stjr	  node = node->right;
1266146040Stjr	}
1267146040Stjr    }
1268146040Stjr}
1269146040Stjr
1270146040Stjr/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1271146040Stjr   re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1272146040Stjr   backreferences as well.  Requires a preorder visit.  */
1273146040Stjrstatic reg_errcode_t
1274250724Sjkimoptimize_subexps (void *extra, bin_tree_t *node)
1275146040Stjr{
1276146040Stjr  re_dfa_t *dfa = (re_dfa_t *) extra;
1277146040Stjr
1278146040Stjr  if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1279146040Stjr    {
1280146040Stjr      int idx = node->token.opr.idx;
1281146040Stjr      node->token.opr.idx = dfa->subexp_map[idx];
1282146040Stjr      dfa->used_bkref_map |= 1 << node->token.opr.idx;
1283146040Stjr    }
1284146040Stjr
1285146040Stjr  else if (node->token.type == SUBEXP
1286250724Sjkim	   && node->left && node->left->token.type == SUBEXP)
1287146040Stjr    {
1288146040Stjr      int other_idx = node->left->token.opr.idx;
1289146040Stjr
1290146040Stjr      node->left = node->left->left;
1291146040Stjr      if (node->left)
1292250724Sjkim	node->left->parent = node;
1293146040Stjr
1294146040Stjr      dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1295250724Sjkim      if (other_idx < BITSET_WORD_BITS)
1296250724Sjkim	  dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1297146040Stjr    }
1298146040Stjr
1299146040Stjr  return REG_NOERROR;
1300146040Stjr}
1301146040Stjr
1302146040Stjr/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1303146040Stjr   of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1304146040Stjrstatic reg_errcode_t
1305250724Sjkimlower_subexps (void *extra, bin_tree_t *node)
1306146040Stjr{
1307146040Stjr  regex_t *preg = (regex_t *) extra;
1308146040Stjr  reg_errcode_t err = REG_NOERROR;
1309146040Stjr
1310146040Stjr  if (node->left && node->left->token.type == SUBEXP)
1311146040Stjr    {
1312146040Stjr      node->left = lower_subexp (&err, preg, node->left);
1313146040Stjr      if (node->left)
1314146040Stjr	node->left->parent = node;
1315146040Stjr    }
1316146040Stjr  if (node->right && node->right->token.type == SUBEXP)
1317146040Stjr    {
1318146040Stjr      node->right = lower_subexp (&err, preg, node->right);
1319146040Stjr      if (node->right)
1320146040Stjr	node->right->parent = node;
1321146040Stjr    }
1322146040Stjr
1323146040Stjr  return err;
1324146040Stjr}
1325146040Stjr
1326146040Stjrstatic bin_tree_t *
1327250724Sjkimlower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1328146040Stjr{
1329146040Stjr  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1330146040Stjr  bin_tree_t *body = node->left;
1331146040Stjr  bin_tree_t *op, *cls, *tree1, *tree;
1332146040Stjr
1333146040Stjr  if (preg->no_sub
1334146040Stjr      /* We do not optimize empty subexpressions, because otherwise we may
1335146040Stjr	 have bad CONCAT nodes with NULL children.  This is obviously not
1336146040Stjr	 very common, so we do not lose much.  An example that triggers
1337146040Stjr	 this case is the sed "script" /\(\)/x.  */
1338146040Stjr      && node->left != NULL
1339250724Sjkim      && (node->token.opr.idx >= BITSET_WORD_BITS
1340250724Sjkim	  || !(dfa->used_bkref_map
1341250724Sjkim	       & ((bitset_word_t) 1 << node->token.opr.idx))))
1342146040Stjr    return node->left;
1343146040Stjr
1344146040Stjr  /* Convert the SUBEXP node to the concatenation of an
1345146040Stjr     OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1346146040Stjr  op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1347146040Stjr  cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1348146040Stjr  tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1349146040Stjr  tree = create_tree (dfa, op, tree1, CONCAT);
1350146040Stjr  if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1351146040Stjr    {
1352146040Stjr      *err = REG_ESPACE;
1353146040Stjr      return NULL;
1354146040Stjr    }
1355146040Stjr
1356146040Stjr  op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1357146040Stjr  op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1358146040Stjr  return tree;
1359146040Stjr}
1360146040Stjr
1361146040Stjr/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1362146040Stjr   nodes.  Requires a postorder visit.  */
1363146040Stjrstatic reg_errcode_t
1364250724Sjkimcalc_first (void *extra, bin_tree_t *node)
1365146040Stjr{
1366146040Stjr  re_dfa_t *dfa = (re_dfa_t *) extra;
1367146040Stjr  if (node->token.type == CONCAT)
1368146040Stjr    {
1369146040Stjr      node->first = node->left->first;
1370146040Stjr      node->node_idx = node->left->node_idx;
1371146040Stjr    }
1372146040Stjr  else
1373146040Stjr    {
1374146040Stjr      node->first = node;
1375146040Stjr      node->node_idx = re_dfa_add_node (dfa, node->token);
1376146040Stjr      if (BE (node->node_idx == -1, 0))
1377250724Sjkim	return REG_ESPACE;
1378250724Sjkim      if (node->token.type == ANCHOR)
1379250724Sjkim	dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1380146040Stjr    }
1381146040Stjr  return REG_NOERROR;
1382146040Stjr}
1383146040Stjr
1384146040Stjr/* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1385146040Stjrstatic reg_errcode_t
1386250724Sjkimcalc_next (void *extra, bin_tree_t *node)
1387146040Stjr{
1388146040Stjr  switch (node->token.type)
1389146040Stjr    {
1390146040Stjr    case OP_DUP_ASTERISK:
1391146040Stjr      node->left->next = node;
1392146040Stjr      break;
1393146040Stjr    case CONCAT:
1394146040Stjr      node->left->next = node->right->first;
1395146040Stjr      node->right->next = node->next;
1396146040Stjr      break;
1397146040Stjr    default:
1398146040Stjr      if (node->left)
1399146040Stjr	node->left->next = node->next;
1400146040Stjr      if (node->right)
1401250724Sjkim	node->right->next = node->next;
1402146040Stjr      break;
1403146040Stjr    }
1404146040Stjr  return REG_NOERROR;
1405146040Stjr}
1406146040Stjr
1407146040Stjr/* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1408146040Stjrstatic reg_errcode_t
1409250724Sjkimlink_nfa_nodes (void *extra, bin_tree_t *node)
1410146040Stjr{
1411146040Stjr  re_dfa_t *dfa = (re_dfa_t *) extra;
1412146040Stjr  int idx = node->node_idx;
1413146040Stjr  reg_errcode_t err = REG_NOERROR;
1414146040Stjr
1415146040Stjr  switch (node->token.type)
1416146040Stjr    {
1417146040Stjr    case CONCAT:
1418146040Stjr      break;
1419146040Stjr
1420146040Stjr    case END_OF_RE:
1421146040Stjr      assert (node->next == NULL);
1422146040Stjr      break;
1423146040Stjr
1424146040Stjr    case OP_DUP_ASTERISK:
1425146040Stjr    case OP_ALT:
1426146040Stjr      {
1427146040Stjr	int left, right;
1428146040Stjr	dfa->has_plural_match = 1;
1429146040Stjr	if (node->left != NULL)
1430146040Stjr	  left = node->left->first->node_idx;
1431146040Stjr	else
1432146040Stjr	  left = node->next->node_idx;
1433146040Stjr	if (node->right != NULL)
1434146040Stjr	  right = node->right->first->node_idx;
1435146040Stjr	else
1436146040Stjr	  right = node->next->node_idx;
1437146040Stjr	assert (left > -1);
1438146040Stjr	assert (right > -1);
1439146040Stjr	err = re_node_set_init_2 (dfa->edests + idx, left, right);
1440146040Stjr      }
1441146040Stjr      break;
1442146040Stjr
1443146040Stjr    case ANCHOR:
1444146040Stjr    case OP_OPEN_SUBEXP:
1445146040Stjr    case OP_CLOSE_SUBEXP:
1446146040Stjr      err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1447146040Stjr      break;
1448146040Stjr
1449146040Stjr    case OP_BACK_REF:
1450146040Stjr      dfa->nexts[idx] = node->next->node_idx;
1451146040Stjr      if (node->token.type == OP_BACK_REF)
1452250724Sjkim	err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1453146040Stjr      break;
1454146040Stjr
1455146040Stjr    default:
1456146040Stjr      assert (!IS_EPSILON_NODE (node->token.type));
1457146040Stjr      dfa->nexts[idx] = node->next->node_idx;
1458146040Stjr      break;
1459146040Stjr    }
1460146040Stjr
1461146040Stjr  return err;
1462146040Stjr}
1463146040Stjr
1464146040Stjr/* Duplicate the epsilon closure of the node ROOT_NODE.
1465146040Stjr   Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1466146040Stjr   to their own constraint.  */
1467146040Stjr
1468146040Stjrstatic reg_errcode_t
1469250724Sjkiminternal_function
1470250724Sjkimduplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1471250724Sjkim			int root_node, unsigned int init_constraint)
1472146040Stjr{
1473146040Stjr  int org_node, clone_node, ret;
1474146040Stjr  unsigned int constraint = init_constraint;
1475146040Stjr  for (org_node = top_org_node, clone_node = top_clone_node;;)
1476146040Stjr    {
1477146040Stjr      int org_dest, clone_dest;
1478146040Stjr      if (dfa->nodes[org_node].type == OP_BACK_REF)
1479146040Stjr	{
1480146040Stjr	  /* If the back reference epsilon-transit, its destination must
1481146040Stjr	     also have the constraint.  Then duplicate the epsilon closure
1482146040Stjr	     of the destination of the back reference, and store it in
1483146040Stjr	     edests of the back reference.  */
1484146040Stjr	  org_dest = dfa->nexts[org_node];
1485146040Stjr	  re_node_set_empty (dfa->edests + clone_node);
1486250724Sjkim	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1487250724Sjkim	  if (BE (clone_dest == -1, 0))
1488250724Sjkim	    return REG_ESPACE;
1489146040Stjr	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1490146040Stjr	  ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1491146040Stjr	  if (BE (ret < 0, 0))
1492146040Stjr	    return REG_ESPACE;
1493146040Stjr	}
1494146040Stjr      else if (dfa->edests[org_node].nelem == 0)
1495146040Stjr	{
1496146040Stjr	  /* In case of the node can't epsilon-transit, don't duplicate the
1497146040Stjr	     destination and store the original destination as the
1498146040Stjr	     destination of the node.  */
1499146040Stjr	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1500146040Stjr	  break;
1501146040Stjr	}
1502146040Stjr      else if (dfa->edests[org_node].nelem == 1)
1503146040Stjr	{
1504146040Stjr	  /* In case of the node can epsilon-transit, and it has only one
1505146040Stjr	     destination.  */
1506146040Stjr	  org_dest = dfa->edests[org_node].elems[0];
1507146040Stjr	  re_node_set_empty (dfa->edests + clone_node);
1508250724Sjkim	  /* If the node is root_node itself, it means the epsilon clsoure
1509250724Sjkim	     has a loop.   Then tie it to the destination of the root_node.  */
1510250724Sjkim	  if (org_node == root_node && clone_node != org_node)
1511146040Stjr	    {
1512250724Sjkim	      ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1513250724Sjkim	      if (BE (ret < 0, 0))
1514250724Sjkim		return REG_ESPACE;
1515250724Sjkim	      break;
1516146040Stjr	    }
1517250724Sjkim	  /* In case of the node has another constraint, add it.  */
1518250724Sjkim	  constraint |= dfa->nodes[org_node].constraint;
1519250724Sjkim	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1520250724Sjkim	  if (BE (clone_dest == -1, 0))
1521250724Sjkim	    return REG_ESPACE;
1522146040Stjr	  ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1523146040Stjr	  if (BE (ret < 0, 0))
1524146040Stjr	    return REG_ESPACE;
1525146040Stjr	}
1526146040Stjr      else /* dfa->edests[org_node].nelem == 2 */
1527146040Stjr	{
1528146040Stjr	  /* In case of the node can epsilon-transit, and it has two
1529146040Stjr	     destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1530146040Stjr	  org_dest = dfa->edests[org_node].elems[0];
1531146040Stjr	  re_node_set_empty (dfa->edests + clone_node);
1532146040Stjr	  /* Search for a duplicated node which satisfies the constraint.  */
1533146040Stjr	  clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1534146040Stjr	  if (clone_dest == -1)
1535146040Stjr	    {
1536250724Sjkim	      /* There is no such duplicated node, create a new one.  */
1537250724Sjkim	      reg_errcode_t err;
1538250724Sjkim	      clone_dest = duplicate_node (dfa, org_dest, constraint);
1539250724Sjkim	      if (BE (clone_dest == -1, 0))
1540250724Sjkim		return REG_ESPACE;
1541146040Stjr	      ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1542146040Stjr	      if (BE (ret < 0, 0))
1543146040Stjr		return REG_ESPACE;
1544146040Stjr	      err = duplicate_node_closure (dfa, org_dest, clone_dest,
1545146040Stjr					    root_node, constraint);
1546146040Stjr	      if (BE (err != REG_NOERROR, 0))
1547146040Stjr		return err;
1548146040Stjr	    }
1549146040Stjr	  else
1550146040Stjr	    {
1551250724Sjkim	      /* There is a duplicated node which satisfies the constraint,
1552146040Stjr		 use it to avoid infinite loop.  */
1553146040Stjr	      ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1554146040Stjr	      if (BE (ret < 0, 0))
1555146040Stjr		return REG_ESPACE;
1556146040Stjr	    }
1557146040Stjr
1558146040Stjr	  org_dest = dfa->edests[org_node].elems[1];
1559250724Sjkim	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1560250724Sjkim	  if (BE (clone_dest == -1, 0))
1561250724Sjkim	    return REG_ESPACE;
1562146040Stjr	  ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1563146040Stjr	  if (BE (ret < 0, 0))
1564146040Stjr	    return REG_ESPACE;
1565146040Stjr	}
1566146040Stjr      org_node = org_dest;
1567146040Stjr      clone_node = clone_dest;
1568146040Stjr    }
1569146040Stjr  return REG_NOERROR;
1570146040Stjr}
1571146040Stjr
1572146040Stjr/* Search for a node which is duplicated from the node ORG_NODE, and
1573146040Stjr   satisfies the constraint CONSTRAINT.  */
1574146040Stjr
1575146040Stjrstatic int
1576250724Sjkimsearch_duplicated_node (const re_dfa_t *dfa, int org_node,
1577250724Sjkim			unsigned int constraint)
1578146040Stjr{
1579146040Stjr  int idx;
1580146040Stjr  for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1581146040Stjr    {
1582146040Stjr      if (org_node == dfa->org_indices[idx]
1583146040Stjr	  && constraint == dfa->nodes[idx].constraint)
1584146040Stjr	return idx; /* Found.  */
1585146040Stjr    }
1586146040Stjr  return -1; /* Not found.  */
1587146040Stjr}
1588146040Stjr
1589146040Stjr/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1590250724Sjkim   Return the index of the new node, or -1 if insufficient storage is
1591250724Sjkim   available.  */
1592146040Stjr
1593250724Sjkimstatic int
1594250724Sjkimduplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1595146040Stjr{
1596146040Stjr  int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1597250724Sjkim  if (BE (dup_idx != -1, 1))
1598250724Sjkim    {
1599250724Sjkim      dfa->nodes[dup_idx].constraint = constraint;
1600250724Sjkim      dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1601250724Sjkim      dfa->nodes[dup_idx].duplicated = 1;
1602146040Stjr
1603250724Sjkim      /* Store the index of the original node.  */
1604250724Sjkim      dfa->org_indices[dup_idx] = org_idx;
1605250724Sjkim    }
1606250724Sjkim  return dup_idx;
1607146040Stjr}
1608146040Stjr
1609146040Stjrstatic reg_errcode_t
1610250724Sjkimcalc_inveclosure (re_dfa_t *dfa)
1611146040Stjr{
1612146040Stjr  int src, idx, ret;
1613146040Stjr  for (idx = 0; idx < dfa->nodes_len; ++idx)
1614146040Stjr    re_node_set_init_empty (dfa->inveclosures + idx);
1615146040Stjr
1616146040Stjr  for (src = 0; src < dfa->nodes_len; ++src)
1617146040Stjr    {
1618146040Stjr      int *elems = dfa->eclosures[src].elems;
1619146040Stjr      for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1620146040Stjr	{
1621146040Stjr	  ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1622146040Stjr	  if (BE (ret == -1, 0))
1623146040Stjr	    return REG_ESPACE;
1624146040Stjr	}
1625146040Stjr    }
1626146040Stjr
1627146040Stjr  return REG_NOERROR;
1628146040Stjr}
1629146040Stjr
1630146040Stjr/* Calculate "eclosure" for all the node in DFA.  */
1631146040Stjr
1632146040Stjrstatic reg_errcode_t
1633250724Sjkimcalc_eclosure (re_dfa_t *dfa)
1634146040Stjr{
1635146040Stjr  int node_idx, incomplete;
1636146040Stjr#ifdef DEBUG
1637146040Stjr  assert (dfa->nodes_len > 0);
1638146040Stjr#endif
1639146040Stjr  incomplete = 0;
1640146040Stjr  /* For each nodes, calculate epsilon closure.  */
1641146040Stjr  for (node_idx = 0; ; ++node_idx)
1642146040Stjr    {
1643146040Stjr      reg_errcode_t err;
1644146040Stjr      re_node_set eclosure_elem;
1645146040Stjr      if (node_idx == dfa->nodes_len)
1646146040Stjr	{
1647146040Stjr	  if (!incomplete)
1648146040Stjr	    break;
1649146040Stjr	  incomplete = 0;
1650146040Stjr	  node_idx = 0;
1651146040Stjr	}
1652146040Stjr
1653146040Stjr#ifdef DEBUG
1654146040Stjr      assert (dfa->eclosures[node_idx].nelem != -1);
1655146040Stjr#endif
1656146040Stjr
1657146040Stjr      /* If we have already calculated, skip it.  */
1658146040Stjr      if (dfa->eclosures[node_idx].nelem != 0)
1659146040Stjr	continue;
1660146040Stjr      /* Calculate epsilon closure of `node_idx'.  */
1661146040Stjr      err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1662146040Stjr      if (BE (err != REG_NOERROR, 0))
1663146040Stjr	return err;
1664146040Stjr
1665146040Stjr      if (dfa->eclosures[node_idx].nelem == 0)
1666146040Stjr	{
1667146040Stjr	  incomplete = 1;
1668146040Stjr	  re_node_set_free (&eclosure_elem);
1669146040Stjr	}
1670146040Stjr    }
1671146040Stjr  return REG_NOERROR;
1672146040Stjr}
1673146040Stjr
1674146040Stjr/* Calculate epsilon closure of NODE.  */
1675146040Stjr
1676146040Stjrstatic reg_errcode_t
1677250724Sjkimcalc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1678146040Stjr{
1679146040Stjr  reg_errcode_t err;
1680250724Sjkim  int i;
1681146040Stjr  re_node_set eclosure;
1682250724Sjkim  int ret;
1683250724Sjkim  int incomplete = 0;
1684146040Stjr  err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1685146040Stjr  if (BE (err != REG_NOERROR, 0))
1686146040Stjr    return err;
1687146040Stjr
1688146040Stjr  /* This indicates that we are calculating this node now.
1689146040Stjr     We reference this value to avoid infinite loop.  */
1690146040Stjr  dfa->eclosures[node].nelem = -1;
1691146040Stjr
1692250724Sjkim  /* If the current node has constraints, duplicate all nodes
1693250724Sjkim     since they must inherit the constraints.  */
1694250724Sjkim  if (dfa->nodes[node].constraint
1695146040Stjr      && dfa->edests[node].nelem
1696146040Stjr      && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1697146040Stjr    {
1698250724Sjkim      err = duplicate_node_closure (dfa, node, node, node,
1699250724Sjkim				    dfa->nodes[node].constraint);
1700146040Stjr      if (BE (err != REG_NOERROR, 0))
1701146040Stjr	return err;
1702146040Stjr    }
1703146040Stjr
1704146040Stjr  /* Expand each epsilon destination nodes.  */
1705146040Stjr  if (IS_EPSILON_NODE(dfa->nodes[node].type))
1706146040Stjr    for (i = 0; i < dfa->edests[node].nelem; ++i)
1707146040Stjr      {
1708146040Stjr	re_node_set eclosure_elem;
1709146040Stjr	int edest = dfa->edests[node].elems[i];
1710146040Stjr	/* If calculating the epsilon closure of `edest' is in progress,
1711146040Stjr	   return intermediate result.  */
1712146040Stjr	if (dfa->eclosures[edest].nelem == -1)
1713146040Stjr	  {
1714146040Stjr	    incomplete = 1;
1715146040Stjr	    continue;
1716146040Stjr	  }
1717146040Stjr	/* If we haven't calculated the epsilon closure of `edest' yet,
1718146040Stjr	   calculate now. Otherwise use calculated epsilon closure.  */
1719146040Stjr	if (dfa->eclosures[edest].nelem == 0)
1720146040Stjr	  {
1721146040Stjr	    err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1722146040Stjr	    if (BE (err != REG_NOERROR, 0))
1723146040Stjr	      return err;
1724146040Stjr	  }
1725146040Stjr	else
1726146040Stjr	  eclosure_elem = dfa->eclosures[edest];
1727146040Stjr	/* Merge the epsilon closure of `edest'.  */
1728250724Sjkim	err = re_node_set_merge (&eclosure, &eclosure_elem);
1729250724Sjkim	if (BE (err != REG_NOERROR, 0))
1730250724Sjkim	  return err;
1731146040Stjr	/* If the epsilon closure of `edest' is incomplete,
1732146040Stjr	   the epsilon closure of this node is also incomplete.  */
1733146040Stjr	if (dfa->eclosures[edest].nelem == 0)
1734146040Stjr	  {
1735146040Stjr	    incomplete = 1;
1736146040Stjr	    re_node_set_free (&eclosure_elem);
1737146040Stjr	  }
1738146040Stjr      }
1739146040Stjr
1740250724Sjkim  /* An epsilon closure includes itself.  */
1741250724Sjkim  ret = re_node_set_insert (&eclosure, node);
1742250724Sjkim  if (BE (ret < 0, 0))
1743250724Sjkim    return REG_ESPACE;
1744146040Stjr  if (incomplete && !root)
1745146040Stjr    dfa->eclosures[node].nelem = 0;
1746146040Stjr  else
1747146040Stjr    dfa->eclosures[node] = eclosure;
1748146040Stjr  *new_set = eclosure;
1749146040Stjr  return REG_NOERROR;
1750146040Stjr}
1751146040Stjr
1752146040Stjr/* Functions for token which are used in the parser.  */
1753146040Stjr
1754146040Stjr/* Fetch a token from INPUT.
1755146040Stjr   We must not use this function inside bracket expressions.  */
1756146040Stjr
1757146040Stjrstatic void
1758250724Sjkiminternal_function
1759250724Sjkimfetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1760146040Stjr{
1761146040Stjr  re_string_skip_bytes (input, peek_token (result, input, syntax));
1762146040Stjr}
1763146040Stjr
1764146040Stjr/* Peek a token from INPUT, and return the length of the token.
1765146040Stjr   We must not use this function inside bracket expressions.  */
1766146040Stjr
1767146040Stjrstatic int
1768250724Sjkiminternal_function
1769250724Sjkimpeek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1770146040Stjr{
1771146040Stjr  unsigned char c;
1772146040Stjr
1773146040Stjr  if (re_string_eoi (input))
1774146040Stjr    {
1775146040Stjr      token->type = END_OF_RE;
1776146040Stjr      return 0;
1777146040Stjr    }
1778146040Stjr
1779146040Stjr  c = re_string_peek_byte (input, 0);
1780146040Stjr  token->opr.c = c;
1781146040Stjr
1782146040Stjr  token->word_char = 0;
1783146040Stjr#ifdef RE_ENABLE_I18N
1784146040Stjr  token->mb_partial = 0;
1785146040Stjr  if (input->mb_cur_max > 1 &&
1786146040Stjr      !re_string_first_byte (input, re_string_cur_idx (input)))
1787146040Stjr    {
1788146040Stjr      token->type = CHARACTER;
1789146040Stjr      token->mb_partial = 1;
1790146040Stjr      return 1;
1791146040Stjr    }
1792146040Stjr#endif
1793146040Stjr  if (c == '\\')
1794146040Stjr    {
1795146040Stjr      unsigned char c2;
1796146040Stjr      if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1797146040Stjr	{
1798146040Stjr	  token->type = BACK_SLASH;
1799146040Stjr	  return 1;
1800146040Stjr	}
1801146040Stjr
1802146040Stjr      c2 = re_string_peek_byte_case (input, 1);
1803146040Stjr      token->opr.c = c2;
1804146040Stjr      token->type = CHARACTER;
1805146040Stjr#ifdef RE_ENABLE_I18N
1806146040Stjr      if (input->mb_cur_max > 1)
1807146040Stjr	{
1808146040Stjr	  wint_t wc = re_string_wchar_at (input,
1809146040Stjr					  re_string_cur_idx (input) + 1);
1810146040Stjr	  token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1811146040Stjr	}
1812146040Stjr      else
1813146040Stjr#endif
1814146040Stjr	token->word_char = IS_WORD_CHAR (c2) != 0;
1815146040Stjr
1816146040Stjr      switch (c2)
1817146040Stjr	{
1818146040Stjr	case '|':
1819146040Stjr	  if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1820146040Stjr	    token->type = OP_ALT;
1821146040Stjr	  break;
1822146040Stjr	case '1': case '2': case '3': case '4': case '5':
1823146040Stjr	case '6': case '7': case '8': case '9':
1824146040Stjr	  if (!(syntax & RE_NO_BK_REFS))
1825146040Stjr	    {
1826146040Stjr	      token->type = OP_BACK_REF;
1827146040Stjr	      token->opr.idx = c2 - '1';
1828146040Stjr	    }
1829146040Stjr	  break;
1830146040Stjr	case '<':
1831146040Stjr	  if (!(syntax & RE_NO_GNU_OPS))
1832146040Stjr	    {
1833146040Stjr	      token->type = ANCHOR;
1834146040Stjr	      token->opr.ctx_type = WORD_FIRST;
1835146040Stjr	    }
1836146040Stjr	  break;
1837146040Stjr	case '>':
1838146040Stjr	  if (!(syntax & RE_NO_GNU_OPS))
1839146040Stjr	    {
1840146040Stjr	      token->type = ANCHOR;
1841146040Stjr	      token->opr.ctx_type = WORD_LAST;
1842146040Stjr	    }
1843146040Stjr	  break;
1844146040Stjr	case 'b':
1845146040Stjr	  if (!(syntax & RE_NO_GNU_OPS))
1846146040Stjr	    {
1847146040Stjr	      token->type = ANCHOR;
1848146040Stjr	      token->opr.ctx_type = WORD_DELIM;
1849146040Stjr	    }
1850146040Stjr	  break;
1851146040Stjr	case 'B':
1852146040Stjr	  if (!(syntax & RE_NO_GNU_OPS))
1853146040Stjr	    {
1854146040Stjr	      token->type = ANCHOR;
1855146040Stjr	      token->opr.ctx_type = NOT_WORD_DELIM;
1856146040Stjr	    }
1857146040Stjr	  break;
1858146040Stjr	case 'w':
1859146040Stjr	  if (!(syntax & RE_NO_GNU_OPS))
1860146040Stjr	    token->type = OP_WORD;
1861146040Stjr	  break;
1862146040Stjr	case 'W':
1863146040Stjr	  if (!(syntax & RE_NO_GNU_OPS))
1864146040Stjr	    token->type = OP_NOTWORD;
1865146040Stjr	  break;
1866146040Stjr	case 's':
1867146040Stjr	  if (!(syntax & RE_NO_GNU_OPS))
1868146040Stjr	    token->type = OP_SPACE;
1869146040Stjr	  break;
1870146040Stjr	case 'S':
1871146040Stjr	  if (!(syntax & RE_NO_GNU_OPS))
1872146040Stjr	    token->type = OP_NOTSPACE;
1873146040Stjr	  break;
1874146040Stjr	case '`':
1875146040Stjr	  if (!(syntax & RE_NO_GNU_OPS))
1876146040Stjr	    {
1877146040Stjr	      token->type = ANCHOR;
1878146040Stjr	      token->opr.ctx_type = BUF_FIRST;
1879146040Stjr	    }
1880146040Stjr	  break;
1881146040Stjr	case '\'':
1882146040Stjr	  if (!(syntax & RE_NO_GNU_OPS))
1883146040Stjr	    {
1884146040Stjr	      token->type = ANCHOR;
1885146040Stjr	      token->opr.ctx_type = BUF_LAST;
1886146040Stjr	    }
1887146040Stjr	  break;
1888146040Stjr	case '(':
1889146040Stjr	  if (!(syntax & RE_NO_BK_PARENS))
1890146040Stjr	    token->type = OP_OPEN_SUBEXP;
1891146040Stjr	  break;
1892146040Stjr	case ')':
1893146040Stjr	  if (!(syntax & RE_NO_BK_PARENS))
1894146040Stjr	    token->type = OP_CLOSE_SUBEXP;
1895146040Stjr	  break;
1896146040Stjr	case '+':
1897146040Stjr	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1898146040Stjr	    token->type = OP_DUP_PLUS;
1899146040Stjr	  break;
1900146040Stjr	case '?':
1901146040Stjr	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1902146040Stjr	    token->type = OP_DUP_QUESTION;
1903146040Stjr	  break;
1904146040Stjr	case '{':
1905146040Stjr	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1906146040Stjr	    token->type = OP_OPEN_DUP_NUM;
1907146040Stjr	  break;
1908146040Stjr	case '}':
1909146040Stjr	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1910146040Stjr	    token->type = OP_CLOSE_DUP_NUM;
1911146040Stjr	  break;
1912146040Stjr	default:
1913146040Stjr	  break;
1914146040Stjr	}
1915146040Stjr      return 2;
1916146040Stjr    }
1917146040Stjr
1918146040Stjr  token->type = CHARACTER;
1919146040Stjr#ifdef RE_ENABLE_I18N
1920146040Stjr  if (input->mb_cur_max > 1)
1921146040Stjr    {
1922146040Stjr      wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1923146040Stjr      token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1924146040Stjr    }
1925146040Stjr  else
1926146040Stjr#endif
1927146040Stjr    token->word_char = IS_WORD_CHAR (token->opr.c);
1928146040Stjr
1929146040Stjr  switch (c)
1930146040Stjr    {
1931146040Stjr    case '\n':
1932146040Stjr      if (syntax & RE_NEWLINE_ALT)
1933146040Stjr	token->type = OP_ALT;
1934146040Stjr      break;
1935146040Stjr    case '|':
1936146040Stjr      if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1937146040Stjr	token->type = OP_ALT;
1938146040Stjr      break;
1939146040Stjr    case '*':
1940146040Stjr      token->type = OP_DUP_ASTERISK;
1941146040Stjr      break;
1942146040Stjr    case '+':
1943146040Stjr      if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1944146040Stjr	token->type = OP_DUP_PLUS;
1945146040Stjr      break;
1946146040Stjr    case '?':
1947146040Stjr      if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1948146040Stjr	token->type = OP_DUP_QUESTION;
1949146040Stjr      break;
1950146040Stjr    case '{':
1951146040Stjr      if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1952146040Stjr	token->type = OP_OPEN_DUP_NUM;
1953146040Stjr      break;
1954146040Stjr    case '}':
1955146040Stjr      if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1956146040Stjr	token->type = OP_CLOSE_DUP_NUM;
1957146040Stjr      break;
1958146040Stjr    case '(':
1959146040Stjr      if (syntax & RE_NO_BK_PARENS)
1960146040Stjr	token->type = OP_OPEN_SUBEXP;
1961146040Stjr      break;
1962146040Stjr    case ')':
1963146040Stjr      if (syntax & RE_NO_BK_PARENS)
1964146040Stjr	token->type = OP_CLOSE_SUBEXP;
1965146040Stjr      break;
1966146040Stjr    case '[':
1967146040Stjr      token->type = OP_OPEN_BRACKET;
1968146040Stjr      break;
1969146040Stjr    case '.':
1970146040Stjr      token->type = OP_PERIOD;
1971146040Stjr      break;
1972146040Stjr    case '^':
1973146040Stjr      if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1974146040Stjr	  re_string_cur_idx (input) != 0)
1975146040Stjr	{
1976146040Stjr	  char prev = re_string_peek_byte (input, -1);
1977146040Stjr	  if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1978146040Stjr	    break;
1979146040Stjr	}
1980146040Stjr      token->type = ANCHOR;
1981146040Stjr      token->opr.ctx_type = LINE_FIRST;
1982146040Stjr      break;
1983146040Stjr    case '$':
1984146040Stjr      if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1985146040Stjr	  re_string_cur_idx (input) + 1 != re_string_length (input))
1986146040Stjr	{
1987146040Stjr	  re_token_t next;
1988146040Stjr	  re_string_skip_bytes (input, 1);
1989146040Stjr	  peek_token (&next, input, syntax);
1990146040Stjr	  re_string_skip_bytes (input, -1);
1991146040Stjr	  if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1992146040Stjr	    break;
1993146040Stjr	}
1994146040Stjr      token->type = ANCHOR;
1995146040Stjr      token->opr.ctx_type = LINE_LAST;
1996146040Stjr      break;
1997146040Stjr    default:
1998146040Stjr      break;
1999146040Stjr    }
2000146040Stjr  return 1;
2001146040Stjr}
2002146040Stjr
2003146040Stjr/* Peek a token from INPUT, and return the length of the token.
2004146040Stjr   We must not use this function out of bracket expressions.  */
2005146040Stjr
2006146040Stjrstatic int
2007250724Sjkiminternal_function
2008250724Sjkimpeek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2009146040Stjr{
2010146040Stjr  unsigned char c;
2011146040Stjr  if (re_string_eoi (input))
2012146040Stjr    {
2013146040Stjr      token->type = END_OF_RE;
2014146040Stjr      return 0;
2015146040Stjr    }
2016146040Stjr  c = re_string_peek_byte (input, 0);
2017146040Stjr  token->opr.c = c;
2018146040Stjr
2019146040Stjr#ifdef RE_ENABLE_I18N
2020146040Stjr  if (input->mb_cur_max > 1 &&
2021146040Stjr      !re_string_first_byte (input, re_string_cur_idx (input)))
2022146040Stjr    {
2023146040Stjr      token->type = CHARACTER;
2024146040Stjr      return 1;
2025146040Stjr    }
2026146040Stjr#endif /* RE_ENABLE_I18N */
2027146040Stjr
2028146040Stjr  if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2029146040Stjr      && re_string_cur_idx (input) + 1 < re_string_length (input))
2030146040Stjr    {
2031146040Stjr      /* In this case, '\' escape a character.  */
2032146040Stjr      unsigned char c2;
2033146040Stjr      re_string_skip_bytes (input, 1);
2034146040Stjr      c2 = re_string_peek_byte (input, 0);
2035146040Stjr      token->opr.c = c2;
2036146040Stjr      token->type = CHARACTER;
2037146040Stjr      return 1;
2038146040Stjr    }
2039146040Stjr  if (c == '[') /* '[' is a special char in a bracket exps.  */
2040146040Stjr    {
2041146040Stjr      unsigned char c2;
2042146040Stjr      int token_len;
2043146040Stjr      if (re_string_cur_idx (input) + 1 < re_string_length (input))
2044146040Stjr	c2 = re_string_peek_byte (input, 1);
2045146040Stjr      else
2046146040Stjr	c2 = 0;
2047146040Stjr      token->opr.c = c2;
2048146040Stjr      token_len = 2;
2049146040Stjr      switch (c2)
2050146040Stjr	{
2051146040Stjr	case '.':
2052146040Stjr	  token->type = OP_OPEN_COLL_ELEM;
2053146040Stjr	  break;
2054146040Stjr	case '=':
2055146040Stjr	  token->type = OP_OPEN_EQUIV_CLASS;
2056146040Stjr	  break;
2057146040Stjr	case ':':
2058146040Stjr	  if (syntax & RE_CHAR_CLASSES)
2059146040Stjr	    {
2060146040Stjr	      token->type = OP_OPEN_CHAR_CLASS;
2061146040Stjr	      break;
2062146040Stjr	    }
2063146040Stjr	  /* else fall through.  */
2064146040Stjr	default:
2065146040Stjr	  token->type = CHARACTER;
2066146040Stjr	  token->opr.c = c;
2067146040Stjr	  token_len = 1;
2068146040Stjr	  break;
2069146040Stjr	}
2070146040Stjr      return token_len;
2071146040Stjr    }
2072146040Stjr  switch (c)
2073146040Stjr    {
2074146040Stjr    case '-':
2075146040Stjr      token->type = OP_CHARSET_RANGE;
2076146040Stjr      break;
2077146040Stjr    case ']':
2078146040Stjr      token->type = OP_CLOSE_BRACKET;
2079146040Stjr      break;
2080146040Stjr    case '^':
2081146040Stjr      token->type = OP_NON_MATCH_LIST;
2082146040Stjr      break;
2083146040Stjr    default:
2084146040Stjr      token->type = CHARACTER;
2085146040Stjr    }
2086146040Stjr  return 1;
2087146040Stjr}
2088146040Stjr
2089146040Stjr/* Functions for parser.  */
2090146040Stjr
2091146040Stjr/* Entry point of the parser.
2092146040Stjr   Parse the regular expression REGEXP and return the structure tree.
2093146040Stjr   If an error is occured, ERR is set by error code, and return NULL.
2094146040Stjr   This function build the following tree, from regular expression <reg_exp>:
2095146040Stjr	   CAT
2096146040Stjr	   / \
2097146040Stjr	  /   \
2098146040Stjr   <reg_exp>  EOR
2099146040Stjr
2100146040Stjr   CAT means concatenation.
2101146040Stjr   EOR means end of regular expression.  */
2102146040Stjr
2103146040Stjrstatic bin_tree_t *
2104250724Sjkimparse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2105250724Sjkim       reg_errcode_t *err)
2106146040Stjr{
2107146040Stjr  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2108146040Stjr  bin_tree_t *tree, *eor, *root;
2109146040Stjr  re_token_t current_token;
2110146040Stjr  dfa->syntax = syntax;
2111146040Stjr  fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2112146040Stjr  tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2113146040Stjr  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2114146040Stjr    return NULL;
2115146040Stjr  eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2116146040Stjr  if (tree != NULL)
2117146040Stjr    root = create_tree (dfa, tree, eor, CONCAT);
2118146040Stjr  else
2119146040Stjr    root = eor;
2120146040Stjr  if (BE (eor == NULL || root == NULL, 0))
2121146040Stjr    {
2122146040Stjr      *err = REG_ESPACE;
2123146040Stjr      return NULL;
2124146040Stjr    }
2125146040Stjr  return root;
2126146040Stjr}
2127146040Stjr
2128146040Stjr/* This function build the following tree, from regular expression
2129146040Stjr   <branch1>|<branch2>:
2130146040Stjr	   ALT
2131146040Stjr	   / \
2132146040Stjr	  /   \
2133146040Stjr   <branch1> <branch2>
2134146040Stjr
2135146040Stjr   ALT means alternative, which represents the operator `|'.  */
2136146040Stjr
2137146040Stjrstatic bin_tree_t *
2138250724Sjkimparse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2139250724Sjkim	       reg_syntax_t syntax, int nest, reg_errcode_t *err)
2140146040Stjr{
2141146040Stjr  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2142146040Stjr  bin_tree_t *tree, *branch = NULL;
2143146040Stjr  tree = parse_branch (regexp, preg, token, syntax, nest, err);
2144146040Stjr  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2145146040Stjr    return NULL;
2146146040Stjr
2147146040Stjr  while (token->type == OP_ALT)
2148146040Stjr    {
2149146040Stjr      fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2150146040Stjr      if (token->type != OP_ALT && token->type != END_OF_RE
2151146040Stjr	  && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2152146040Stjr	{
2153146040Stjr	  branch = parse_branch (regexp, preg, token, syntax, nest, err);
2154146040Stjr	  if (BE (*err != REG_NOERROR && branch == NULL, 0))
2155146040Stjr	    return NULL;
2156146040Stjr	}
2157146040Stjr      else
2158146040Stjr	branch = NULL;
2159146040Stjr      tree = create_tree (dfa, tree, branch, OP_ALT);
2160146040Stjr      if (BE (tree == NULL, 0))
2161146040Stjr	{
2162146040Stjr	  *err = REG_ESPACE;
2163146040Stjr	  return NULL;
2164146040Stjr	}
2165146040Stjr    }
2166146040Stjr  return tree;
2167146040Stjr}
2168146040Stjr
2169146040Stjr/* This function build the following tree, from regular expression
2170146040Stjr   <exp1><exp2>:
2171146040Stjr	CAT
2172146040Stjr	/ \
2173146040Stjr       /   \
2174146040Stjr   <exp1> <exp2>
2175146040Stjr
2176146040Stjr   CAT means concatenation.  */
2177146040Stjr
2178146040Stjrstatic bin_tree_t *
2179250724Sjkimparse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2180250724Sjkim	      reg_syntax_t syntax, int nest, reg_errcode_t *err)
2181146040Stjr{
2182146040Stjr  bin_tree_t *tree, *exp;
2183146040Stjr  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2184146040Stjr  tree = parse_expression (regexp, preg, token, syntax, nest, err);
2185146040Stjr  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2186146040Stjr    return NULL;
2187146040Stjr
2188146040Stjr  while (token->type != OP_ALT && token->type != END_OF_RE
2189146040Stjr	 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2190146040Stjr    {
2191146040Stjr      exp = parse_expression (regexp, preg, token, syntax, nest, err);
2192146040Stjr      if (BE (*err != REG_NOERROR && exp == NULL, 0))
2193146040Stjr	{
2194250724Sjkim	  if (tree != NULL)
2195250724Sjkim	    postorder (tree, free_tree, NULL);
2196146040Stjr	  return NULL;
2197146040Stjr	}
2198146040Stjr      if (tree != NULL && exp != NULL)
2199146040Stjr	{
2200250724Sjkim	  bin_tree_t *newtree = create_tree (dfa, tree, exp, CONCAT);
2201250724Sjkim	  if (newtree == NULL)
2202146040Stjr	    {
2203250724Sjkim	      postorder (exp, free_tree, NULL);
2204250724Sjkim	      postorder (tree, free_tree, NULL);
2205146040Stjr	      *err = REG_ESPACE;
2206146040Stjr	      return NULL;
2207146040Stjr	    }
2208250724Sjkim	  tree = newtree;
2209146040Stjr	}
2210146040Stjr      else if (tree == NULL)
2211146040Stjr	tree = exp;
2212146040Stjr      /* Otherwise exp == NULL, we don't need to create new tree.  */
2213146040Stjr    }
2214146040Stjr  return tree;
2215146040Stjr}
2216146040Stjr
2217146040Stjr/* This function build the following tree, from regular expression a*:
2218146040Stjr	 *
2219146040Stjr	 |
2220146040Stjr	 a
2221146040Stjr*/
2222146040Stjr
2223146040Stjrstatic bin_tree_t *
2224250724Sjkimparse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2225250724Sjkim		  reg_syntax_t syntax, int nest, reg_errcode_t *err)
2226146040Stjr{
2227146040Stjr  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2228146040Stjr  bin_tree_t *tree;
2229146040Stjr  switch (token->type)
2230146040Stjr    {
2231146040Stjr    case CHARACTER:
2232146040Stjr      tree = create_token_tree (dfa, NULL, NULL, token);
2233146040Stjr      if (BE (tree == NULL, 0))
2234146040Stjr	{
2235146040Stjr	  *err = REG_ESPACE;
2236146040Stjr	  return NULL;
2237146040Stjr	}
2238146040Stjr#ifdef RE_ENABLE_I18N
2239146040Stjr      if (dfa->mb_cur_max > 1)
2240146040Stjr	{
2241146040Stjr	  while (!re_string_eoi (regexp)
2242146040Stjr		 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2243146040Stjr	    {
2244146040Stjr	      bin_tree_t *mbc_remain;
2245146040Stjr	      fetch_token (token, regexp, syntax);
2246146040Stjr	      mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2247146040Stjr	      tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2248146040Stjr	      if (BE (mbc_remain == NULL || tree == NULL, 0))
2249146040Stjr		{
2250146040Stjr		  *err = REG_ESPACE;
2251146040Stjr		  return NULL;
2252146040Stjr		}
2253146040Stjr	    }
2254146040Stjr	}
2255146040Stjr#endif
2256146040Stjr      break;
2257146040Stjr    case OP_OPEN_SUBEXP:
2258146040Stjr      tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2259146040Stjr      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2260146040Stjr	return NULL;
2261146040Stjr      break;
2262146040Stjr    case OP_OPEN_BRACKET:
2263146040Stjr      tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2264146040Stjr      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2265146040Stjr	return NULL;
2266146040Stjr      break;
2267146040Stjr    case OP_BACK_REF:
2268146040Stjr      if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2269146040Stjr	{
2270146040Stjr	  *err = REG_ESUBREG;
2271146040Stjr	  return NULL;
2272146040Stjr	}
2273146040Stjr      dfa->used_bkref_map |= 1 << token->opr.idx;
2274146040Stjr      tree = create_token_tree (dfa, NULL, NULL, token);
2275146040Stjr      if (BE (tree == NULL, 0))
2276146040Stjr	{
2277146040Stjr	  *err = REG_ESPACE;
2278146040Stjr	  return NULL;
2279146040Stjr	}
2280146040Stjr      ++dfa->nbackref;
2281146040Stjr      dfa->has_mb_node = 1;
2282146040Stjr      break;
2283146040Stjr    case OP_OPEN_DUP_NUM:
2284146040Stjr      if (syntax & RE_CONTEXT_INVALID_DUP)
2285146040Stjr	{
2286146040Stjr	  *err = REG_BADRPT;
2287146040Stjr	  return NULL;
2288146040Stjr	}
2289146040Stjr      /* FALLTHROUGH */
2290146040Stjr    case OP_DUP_ASTERISK:
2291146040Stjr    case OP_DUP_PLUS:
2292146040Stjr    case OP_DUP_QUESTION:
2293146040Stjr      if (syntax & RE_CONTEXT_INVALID_OPS)
2294146040Stjr	{
2295146040Stjr	  *err = REG_BADRPT;
2296146040Stjr	  return NULL;
2297146040Stjr	}
2298146040Stjr      else if (syntax & RE_CONTEXT_INDEP_OPS)
2299146040Stjr	{
2300146040Stjr	  fetch_token (token, regexp, syntax);
2301146040Stjr	  return parse_expression (regexp, preg, token, syntax, nest, err);
2302146040Stjr	}
2303146040Stjr      /* else fall through  */
2304146040Stjr    case OP_CLOSE_SUBEXP:
2305146040Stjr      if ((token->type == OP_CLOSE_SUBEXP) &&
2306146040Stjr	  !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2307146040Stjr	{
2308146040Stjr	  *err = REG_ERPAREN;
2309146040Stjr	  return NULL;
2310146040Stjr	}
2311146040Stjr      /* else fall through  */
2312146040Stjr    case OP_CLOSE_DUP_NUM:
2313146040Stjr      /* We treat it as a normal character.  */
2314146040Stjr
2315146040Stjr      /* Then we can these characters as normal characters.  */
2316146040Stjr      token->type = CHARACTER;
2317146040Stjr      /* mb_partial and word_char bits should be initialized already
2318146040Stjr	 by peek_token.  */
2319146040Stjr      tree = create_token_tree (dfa, NULL, NULL, token);
2320146040Stjr      if (BE (tree == NULL, 0))
2321146040Stjr	{
2322146040Stjr	  *err = REG_ESPACE;
2323146040Stjr	  return NULL;
2324146040Stjr	}
2325146040Stjr      break;
2326146040Stjr    case ANCHOR:
2327146040Stjr      if ((token->opr.ctx_type
2328146040Stjr	   & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2329146040Stjr	  && dfa->word_ops_used == 0)
2330146040Stjr	init_word_char (dfa);
2331146040Stjr      if (token->opr.ctx_type == WORD_DELIM
2332250724Sjkim	  || token->opr.ctx_type == NOT_WORD_DELIM)
2333146040Stjr	{
2334146040Stjr	  bin_tree_t *tree_first, *tree_last;
2335146040Stjr	  if (token->opr.ctx_type == WORD_DELIM)
2336146040Stjr	    {
2337146040Stjr	      token->opr.ctx_type = WORD_FIRST;
2338146040Stjr	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2339146040Stjr	      token->opr.ctx_type = WORD_LAST;
2340250724Sjkim	    }
2341250724Sjkim	  else
2342250724Sjkim	    {
2343146040Stjr	      token->opr.ctx_type = INSIDE_WORD;
2344146040Stjr	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2345146040Stjr	      token->opr.ctx_type = INSIDE_NOTWORD;
2346250724Sjkim	    }
2347146040Stjr	  tree_last = create_token_tree (dfa, NULL, NULL, token);
2348146040Stjr	  tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2349146040Stjr	  if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2350146040Stjr	    {
2351146040Stjr	      *err = REG_ESPACE;
2352146040Stjr	      return NULL;
2353146040Stjr	    }
2354146040Stjr	}
2355146040Stjr      else
2356146040Stjr	{
2357146040Stjr	  tree = create_token_tree (dfa, NULL, NULL, token);
2358146040Stjr	  if (BE (tree == NULL, 0))
2359146040Stjr	    {
2360146040Stjr	      *err = REG_ESPACE;
2361146040Stjr	      return NULL;
2362146040Stjr	    }
2363146040Stjr	}
2364146040Stjr      /* We must return here, since ANCHORs can't be followed
2365146040Stjr	 by repetition operators.
2366146040Stjr	 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2367146040Stjr	     it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2368146040Stjr      fetch_token (token, regexp, syntax);
2369146040Stjr      return tree;
2370146040Stjr    case OP_PERIOD:
2371146040Stjr      tree = create_token_tree (dfa, NULL, NULL, token);
2372146040Stjr      if (BE (tree == NULL, 0))
2373146040Stjr	{
2374146040Stjr	  *err = REG_ESPACE;
2375146040Stjr	  return NULL;
2376146040Stjr	}
2377146040Stjr      if (dfa->mb_cur_max > 1)
2378146040Stjr	dfa->has_mb_node = 1;
2379146040Stjr      break;
2380146040Stjr    case OP_WORD:
2381146040Stjr    case OP_NOTWORD:
2382146040Stjr      tree = build_charclass_op (dfa, regexp->trans,
2383146040Stjr				 (const unsigned char *) "alnum",
2384146040Stjr				 (const unsigned char *) "_",
2385146040Stjr				 token->type == OP_NOTWORD, err);
2386146040Stjr      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2387146040Stjr	return NULL;
2388146040Stjr      break;
2389146040Stjr    case OP_SPACE:
2390146040Stjr    case OP_NOTSPACE:
2391146040Stjr      tree = build_charclass_op (dfa, regexp->trans,
2392146040Stjr				 (const unsigned char *) "space",
2393146040Stjr				 (const unsigned char *) "",
2394146040Stjr				 token->type == OP_NOTSPACE, err);
2395146040Stjr      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2396146040Stjr	return NULL;
2397146040Stjr      break;
2398146040Stjr    case OP_ALT:
2399146040Stjr    case END_OF_RE:
2400146040Stjr      return NULL;
2401146040Stjr    case BACK_SLASH:
2402146040Stjr      *err = REG_EESCAPE;
2403146040Stjr      return NULL;
2404146040Stjr    default:
2405146040Stjr      /* Must not happen?  */
2406146040Stjr#ifdef DEBUG
2407146040Stjr      assert (0);
2408146040Stjr#endif
2409146040Stjr      return NULL;
2410146040Stjr    }
2411146040Stjr  fetch_token (token, regexp, syntax);
2412146040Stjr
2413146040Stjr  while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2414146040Stjr	 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2415146040Stjr    {
2416146040Stjr      tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2417146040Stjr      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2418146040Stjr	return NULL;
2419146040Stjr      /* In BRE consecutive duplications are not allowed.  */
2420146040Stjr      if ((syntax & RE_CONTEXT_INVALID_DUP)
2421146040Stjr	  && (token->type == OP_DUP_ASTERISK
2422146040Stjr	      || token->type == OP_OPEN_DUP_NUM))
2423146040Stjr	{
2424146040Stjr	  *err = REG_BADRPT;
2425146040Stjr	  return NULL;
2426146040Stjr	}
2427146040Stjr    }
2428146040Stjr
2429146040Stjr  return tree;
2430146040Stjr}
2431146040Stjr
2432146040Stjr/* This function build the following tree, from regular expression
2433146040Stjr   (<reg_exp>):
2434146040Stjr	 SUBEXP
2435146040Stjr	    |
2436146040Stjr	<reg_exp>
2437146040Stjr*/
2438146040Stjr
2439146040Stjrstatic bin_tree_t *
2440250724Sjkimparse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2441250724Sjkim	       reg_syntax_t syntax, int nest, reg_errcode_t *err)
2442146040Stjr{
2443146040Stjr  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2444146040Stjr  bin_tree_t *tree;
2445146040Stjr  size_t cur_nsub;
2446146040Stjr  cur_nsub = preg->re_nsub++;
2447146040Stjr
2448146040Stjr  fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2449146040Stjr
2450146040Stjr  /* The subexpression may be a null string.  */
2451146040Stjr  if (token->type == OP_CLOSE_SUBEXP)
2452146040Stjr    tree = NULL;
2453146040Stjr  else
2454146040Stjr    {
2455146040Stjr      tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2456146040Stjr      if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2457250724Sjkim	{
2458250724Sjkim	  if (tree != NULL)
2459250724Sjkim	    postorder (tree, free_tree, NULL);
2460250724Sjkim	  *err = REG_EPAREN;
2461250724Sjkim	}
2462146040Stjr      if (BE (*err != REG_NOERROR, 0))
2463146040Stjr	return NULL;
2464146040Stjr    }
2465146040Stjr
2466250724Sjkim  if (cur_nsub <= '9' - '1')
2467250724Sjkim    dfa->completed_bkref_map |= 1 << cur_nsub;
2468250724Sjkim
2469146040Stjr  tree = create_tree (dfa, tree, NULL, SUBEXP);
2470146040Stjr  if (BE (tree == NULL, 0))
2471146040Stjr    {
2472146040Stjr      *err = REG_ESPACE;
2473146040Stjr      return NULL;
2474146040Stjr    }
2475146040Stjr  tree->token.opr.idx = cur_nsub;
2476146040Stjr  return tree;
2477146040Stjr}
2478146040Stjr
2479146040Stjr/* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2480146040Stjr
2481146040Stjrstatic bin_tree_t *
2482250724Sjkimparse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2483250724Sjkim	      re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2484146040Stjr{
2485146040Stjr  bin_tree_t *tree = NULL, *old_tree = NULL;
2486146040Stjr  int i, start, end, start_idx = re_string_cur_idx (regexp);
2487146040Stjr  re_token_t start_token = *token;
2488146040Stjr
2489146040Stjr  if (token->type == OP_OPEN_DUP_NUM)
2490146040Stjr    {
2491146040Stjr      end = 0;
2492146040Stjr      start = fetch_number (regexp, token, syntax);
2493146040Stjr      if (start == -1)
2494146040Stjr	{
2495146040Stjr	  if (token->type == CHARACTER && token->opr.c == ',')
2496146040Stjr	    start = 0; /* We treat "{,m}" as "{0,m}".  */
2497146040Stjr	  else
2498146040Stjr	    {
2499146040Stjr	      *err = REG_BADBR; /* <re>{} is invalid.  */
2500146040Stjr	      return NULL;
2501146040Stjr	    }
2502146040Stjr	}
2503146040Stjr      if (BE (start != -2, 1))
2504146040Stjr	{
2505146040Stjr	  /* We treat "{n}" as "{n,n}".  */
2506146040Stjr	  end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2507146040Stjr		 : ((token->type == CHARACTER && token->opr.c == ',')
2508146040Stjr		    ? fetch_number (regexp, token, syntax) : -2));
2509146040Stjr	}
2510146040Stjr      if (BE (start == -2 || end == -2, 0))
2511146040Stjr	{
2512146040Stjr	  /* Invalid sequence.  */
2513146040Stjr	  if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2514146040Stjr	    {
2515146040Stjr	      if (token->type == END_OF_RE)
2516146040Stjr		*err = REG_EBRACE;
2517146040Stjr	      else
2518146040Stjr		*err = REG_BADBR;
2519146040Stjr
2520146040Stjr	      return NULL;
2521146040Stjr	    }
2522146040Stjr
2523146040Stjr	  /* If the syntax bit is set, rollback.  */
2524146040Stjr	  re_string_set_index (regexp, start_idx);
2525146040Stjr	  *token = start_token;
2526146040Stjr	  token->type = CHARACTER;
2527146040Stjr	  /* mb_partial and word_char bits should be already initialized by
2528146040Stjr	     peek_token.  */
2529146040Stjr	  return elem;
2530146040Stjr	}
2531146040Stjr
2532250724Sjkim      if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2533146040Stjr	{
2534146040Stjr	  /* First number greater than second.  */
2535146040Stjr	  *err = REG_BADBR;
2536146040Stjr	  return NULL;
2537146040Stjr	}
2538146040Stjr    }
2539146040Stjr  else
2540146040Stjr    {
2541146040Stjr      start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2542146040Stjr      end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2543146040Stjr    }
2544146040Stjr
2545146040Stjr  fetch_token (token, regexp, syntax);
2546146040Stjr
2547146040Stjr  if (BE (elem == NULL, 0))
2548146040Stjr    return NULL;
2549146040Stjr  if (BE (start == 0 && end == 0, 0))
2550146040Stjr    {
2551146040Stjr      postorder (elem, free_tree, NULL);
2552146040Stjr      return NULL;
2553146040Stjr    }
2554146040Stjr
2555146040Stjr  /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2556146040Stjr  if (BE (start > 0, 0))
2557146040Stjr    {
2558146040Stjr      tree = elem;
2559146040Stjr      for (i = 2; i <= start; ++i)
2560146040Stjr	{
2561146040Stjr	  elem = duplicate_tree (elem, dfa);
2562146040Stjr	  tree = create_tree (dfa, tree, elem, CONCAT);
2563146040Stjr	  if (BE (elem == NULL || tree == NULL, 0))
2564146040Stjr	    goto parse_dup_op_espace;
2565146040Stjr	}
2566146040Stjr
2567146040Stjr      if (start == end)
2568146040Stjr	return tree;
2569146040Stjr
2570146040Stjr      /* Duplicate ELEM before it is marked optional.  */
2571146040Stjr      elem = duplicate_tree (elem, dfa);
2572146040Stjr      old_tree = tree;
2573146040Stjr    }
2574146040Stjr  else
2575146040Stjr    old_tree = NULL;
2576146040Stjr
2577146040Stjr  if (elem->token.type == SUBEXP)
2578146040Stjr    postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2579146040Stjr
2580146040Stjr  tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2581146040Stjr  if (BE (tree == NULL, 0))
2582146040Stjr    goto parse_dup_op_espace;
2583146040Stjr
2584146040Stjr  /* This loop is actually executed only when end != -1,
2585146040Stjr     to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2586146040Stjr     already created the start+1-th copy.  */
2587146040Stjr  for (i = start + 2; i <= end; ++i)
2588146040Stjr    {
2589146040Stjr      elem = duplicate_tree (elem, dfa);
2590146040Stjr      tree = create_tree (dfa, tree, elem, CONCAT);
2591146040Stjr      if (BE (elem == NULL || tree == NULL, 0))
2592250724Sjkim	goto parse_dup_op_espace;
2593146040Stjr
2594146040Stjr      tree = create_tree (dfa, tree, NULL, OP_ALT);
2595146040Stjr      if (BE (tree == NULL, 0))
2596250724Sjkim	goto parse_dup_op_espace;
2597146040Stjr    }
2598146040Stjr
2599146040Stjr  if (old_tree)
2600146040Stjr    tree = create_tree (dfa, old_tree, tree, CONCAT);
2601146040Stjr
2602146040Stjr  return tree;
2603146040Stjr
2604146040Stjr parse_dup_op_espace:
2605146040Stjr  *err = REG_ESPACE;
2606146040Stjr  return NULL;
2607146040Stjr}
2608146040Stjr
2609146040Stjr/* Size of the names for collating symbol/equivalence_class/character_class.
2610146040Stjr   I'm not sure, but maybe enough.  */
2611146040Stjr#define BRACKET_NAME_BUF_SIZE 32
2612146040Stjr
2613146040Stjr#ifndef _LIBC
2614146040Stjr  /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2615146040Stjr     Build the range expression which starts from START_ELEM, and ends
2616146040Stjr     at END_ELEM.  The result are written to MBCSET and SBCSET.
2617146040Stjr     RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2618146040Stjr     mbcset->range_ends, is a pointer argument sinse we may
2619146040Stjr     update it.  */
2620146040Stjr
2621146040Stjrstatic reg_errcode_t
2622250724Sjkiminternal_function
2623146040Stjr# ifdef RE_ENABLE_I18N
2624250724Sjkimbuild_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2625250724Sjkim		 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2626146040Stjr# else /* not RE_ENABLE_I18N */
2627250724Sjkimbuild_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2628250724Sjkim		 bracket_elem_t *end_elem)
2629146040Stjr# endif /* not RE_ENABLE_I18N */
2630146040Stjr{
2631146040Stjr  unsigned int start_ch, end_ch;
2632146040Stjr  /* Equivalence Classes and Character Classes can't be a range start/end.  */
2633146040Stjr  if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2634146040Stjr	  || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2635146040Stjr	  0))
2636146040Stjr    return REG_ERANGE;
2637146040Stjr
2638146040Stjr  /* We can handle no multi character collating elements without libc
2639146040Stjr     support.  */
2640146040Stjr  if (BE ((start_elem->type == COLL_SYM
2641146040Stjr	   && strlen ((char *) start_elem->opr.name) > 1)
2642146040Stjr	  || (end_elem->type == COLL_SYM
2643146040Stjr	      && strlen ((char *) end_elem->opr.name) > 1), 0))
2644146040Stjr    return REG_ECOLLATE;
2645146040Stjr
2646146040Stjr# ifdef RE_ENABLE_I18N
2647146040Stjr  {
2648250724Sjkim    wchar_t wc;
2649250724Sjkim    wint_t start_wc;
2650250724Sjkim    wint_t end_wc;
2651146040Stjr    wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2652146040Stjr
2653146040Stjr    start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2654146040Stjr		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2655146040Stjr		   : 0));
2656146040Stjr    end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2657146040Stjr	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2658146040Stjr		 : 0));
2659146040Stjr    start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2660146040Stjr		? __btowc (start_ch) : start_elem->opr.wch);
2661146040Stjr    end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2662146040Stjr	      ? __btowc (end_ch) : end_elem->opr.wch);
2663146040Stjr    if (start_wc == WEOF || end_wc == WEOF)
2664146040Stjr      return REG_ECOLLATE;
2665146040Stjr    cmp_buf[0] = start_wc;
2666146040Stjr    cmp_buf[4] = end_wc;
2667146040Stjr    if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2668146040Stjr      return REG_ERANGE;
2669146040Stjr
2670146040Stjr    /* Got valid collation sequence values, add them as a new entry.
2671146040Stjr       However, for !_LIBC we have no collation elements: if the
2672146040Stjr       character set is single byte, the single byte character set
2673146040Stjr       that we build below suffices.  parse_bracket_exp passes
2674146040Stjr       no MBCSET if dfa->mb_cur_max == 1.  */
2675146040Stjr    if (mbcset)
2676146040Stjr      {
2677250724Sjkim	/* Check the space of the arrays.  */
2678250724Sjkim	if (BE (*range_alloc == mbcset->nranges, 0))
2679250724Sjkim	  {
2680146040Stjr	    /* There is not enough space, need realloc.  */
2681146040Stjr	    wchar_t *new_array_start, *new_array_end;
2682146040Stjr	    int new_nranges;
2683146040Stjr
2684146040Stjr	    /* +1 in case of mbcset->nranges is 0.  */
2685146040Stjr	    new_nranges = 2 * mbcset->nranges + 1;
2686146040Stjr	    /* Use realloc since mbcset->range_starts and mbcset->range_ends
2687146040Stjr	       are NULL if *range_alloc == 0.  */
2688146040Stjr	    new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2689250724Sjkim					  new_nranges);
2690146040Stjr	    new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2691250724Sjkim					new_nranges);
2692146040Stjr
2693146040Stjr	    if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2694146040Stjr	      return REG_ESPACE;
2695146040Stjr
2696146040Stjr	    mbcset->range_starts = new_array_start;
2697146040Stjr	    mbcset->range_ends = new_array_end;
2698146040Stjr	    *range_alloc = new_nranges;
2699250724Sjkim	  }
2700146040Stjr
2701250724Sjkim	mbcset->range_starts[mbcset->nranges] = start_wc;
2702250724Sjkim	mbcset->range_ends[mbcset->nranges++] = end_wc;
2703146040Stjr      }
2704146040Stjr
2705146040Stjr    /* Build the table for single byte characters.  */
2706146040Stjr    for (wc = 0; wc < SBC_MAX; ++wc)
2707146040Stjr      {
2708146040Stjr	cmp_buf[2] = wc;
2709146040Stjr	if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2710146040Stjr	    && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2711146040Stjr	  bitset_set (sbcset, wc);
2712146040Stjr      }
2713146040Stjr  }
2714146040Stjr# else /* not RE_ENABLE_I18N */
2715146040Stjr  {
2716146040Stjr    unsigned int ch;
2717146040Stjr    start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2718146040Stjr		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2719146040Stjr		   : 0));
2720146040Stjr    end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2721146040Stjr	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2722146040Stjr		 : 0));
2723146040Stjr    if (start_ch > end_ch)
2724146040Stjr      return REG_ERANGE;
2725146040Stjr    /* Build the table for single byte characters.  */
2726146040Stjr    for (ch = 0; ch < SBC_MAX; ++ch)
2727146040Stjr      if (start_ch <= ch  && ch <= end_ch)
2728146040Stjr	bitset_set (sbcset, ch);
2729146040Stjr  }
2730146040Stjr# endif /* not RE_ENABLE_I18N */
2731146040Stjr  return REG_NOERROR;
2732146040Stjr}
2733146040Stjr#endif /* not _LIBC */
2734146040Stjr
2735146040Stjr#ifndef _LIBC
2736146040Stjr/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2737146040Stjr   Build the collating element which is represented by NAME.
2738146040Stjr   The result are written to MBCSET and SBCSET.
2739146040Stjr   COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2740146040Stjr   pointer argument since we may update it.  */
2741146040Stjr
2742146040Stjrstatic reg_errcode_t
2743250724Sjkiminternal_function
2744146040Stjr# ifdef RE_ENABLE_I18N
2745250724Sjkimbuild_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2746250724Sjkim			int *coll_sym_alloc, const unsigned char *name)
2747146040Stjr# else /* not RE_ENABLE_I18N */
2748250724Sjkimbuild_collating_symbol (bitset_t sbcset, const unsigned char *name)
2749146040Stjr# endif /* not RE_ENABLE_I18N */
2750146040Stjr{
2751146040Stjr  size_t name_len = strlen ((const char *) name);
2752146040Stjr  if (BE (name_len != 1, 0))
2753146040Stjr    return REG_ECOLLATE;
2754146040Stjr  else
2755146040Stjr    {
2756146040Stjr      bitset_set (sbcset, name[0]);
2757146040Stjr      return REG_NOERROR;
2758146040Stjr    }
2759146040Stjr}
2760146040Stjr#endif /* not _LIBC */
2761146040Stjr
2762146040Stjr/* This function parse bracket expression like "[abc]", "[a-c]",
2763146040Stjr   "[[.a-a.]]" etc.  */
2764146040Stjr
2765146040Stjrstatic bin_tree_t *
2766250724Sjkimparse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2767250724Sjkim		   reg_syntax_t syntax, reg_errcode_t *err)
2768146040Stjr{
2769146040Stjr#ifdef _LIBC
2770146040Stjr  const unsigned char *collseqmb;
2771146040Stjr  const char *collseqwc;
2772146040Stjr  uint32_t nrules;
2773146040Stjr  int32_t table_size;
2774146040Stjr  const int32_t *symb_table;
2775146040Stjr  const unsigned char *extra;
2776146040Stjr
2777146040Stjr  /* Local function for parse_bracket_exp used in _LIBC environement.
2778146040Stjr     Seek the collating symbol entry correspondings to NAME.
2779251435Sjkim     Return the index of the symbol in the SYMB_TABLE,
2780251435Sjkim     or -1 if not found.  */
2781146040Stjr
2782146040Stjr  auto inline int32_t
2783146040Stjr  __attribute ((always_inline))
2784251435Sjkim  seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2785146040Stjr    {
2786251435Sjkim      int32_t elem;
2787250724Sjkim
2788251435Sjkim      for (elem = 0; elem < table_size; elem++)
2789251435Sjkim	if (symb_table[2 * elem] != 0)
2790251435Sjkim	  {
2791251435Sjkim	    int32_t idx = symb_table[2 * elem + 1];
2792251435Sjkim	    /* Skip the name of collating element name.  */
2793251435Sjkim	    idx += 1 + extra[idx];
2794251435Sjkim	    if (/* Compare the length of the name.  */
2795251435Sjkim		name_len == extra[idx]
2796251435Sjkim		/* Compare the name.  */
2797251435Sjkim		&& memcmp (name, &extra[idx + 1], name_len) == 0)
2798251435Sjkim	      /* Yep, this is the entry.  */
2799251435Sjkim	      return elem;
2800251435Sjkim	  }
2801251435Sjkim      return -1;
2802146040Stjr    }
2803146040Stjr
2804250724Sjkim  /* Local function for parse_bracket_exp used in _LIBC environment.
2805146040Stjr     Look up the collation sequence value of BR_ELEM.
2806146040Stjr     Return the value if succeeded, UINT_MAX otherwise.  */
2807146040Stjr
2808146040Stjr  auto inline unsigned int
2809146040Stjr  __attribute ((always_inline))
2810251435Sjkim  lookup_collation_sequence_value (bracket_elem_t *br_elem)
2811146040Stjr    {
2812146040Stjr      if (br_elem->type == SB_CHAR)
2813146040Stjr	{
2814146040Stjr	  /*
2815146040Stjr	  if (MB_CUR_MAX == 1)
2816146040Stjr	  */
2817146040Stjr	  if (nrules == 0)
2818146040Stjr	    return collseqmb[br_elem->opr.ch];
2819146040Stjr	  else
2820146040Stjr	    {
2821146040Stjr	      wint_t wc = __btowc (br_elem->opr.ch);
2822146040Stjr	      return __collseq_table_lookup (collseqwc, wc);
2823146040Stjr	    }
2824146040Stjr	}
2825146040Stjr      else if (br_elem->type == MB_CHAR)
2826146040Stjr	{
2827250724Sjkim	  if (nrules != 0)
2828250724Sjkim	    return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2829146040Stjr	}
2830146040Stjr      else if (br_elem->type == COLL_SYM)
2831146040Stjr	{
2832146040Stjr	  size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2833146040Stjr	  if (nrules != 0)
2834146040Stjr	    {
2835146040Stjr	      int32_t elem, idx;
2836146040Stjr	      elem = seek_collating_symbol_entry (br_elem->opr.name,
2837146040Stjr						  sym_name_len);
2838251435Sjkim	      if (elem != -1)
2839146040Stjr		{
2840146040Stjr		  /* We found the entry.  */
2841146040Stjr		  idx = symb_table[2 * elem + 1];
2842146040Stjr		  /* Skip the name of collating element name.  */
2843146040Stjr		  idx += 1 + extra[idx];
2844146040Stjr		  /* Skip the byte sequence of the collating element.  */
2845146040Stjr		  idx += 1 + extra[idx];
2846146040Stjr		  /* Adjust for the alignment.  */
2847146040Stjr		  idx = (idx + 3) & ~3;
2848146040Stjr		  /* Skip the multibyte collation sequence value.  */
2849146040Stjr		  idx += sizeof (unsigned int);
2850146040Stjr		  /* Skip the wide char sequence of the collating element.  */
2851146040Stjr		  idx += sizeof (unsigned int) *
2852146040Stjr		    (1 + *(unsigned int *) (extra + idx));
2853146040Stjr		  /* Return the collation sequence value.  */
2854146040Stjr		  return *(unsigned int *) (extra + idx);
2855146040Stjr		}
2856251435Sjkim	      else if (sym_name_len == 1)
2857146040Stjr		{
2858146040Stjr		  /* No valid character.  Match it as a single byte
2859146040Stjr		     character.  */
2860146040Stjr		  return collseqmb[br_elem->opr.name[0]];
2861146040Stjr		}
2862146040Stjr	    }
2863146040Stjr	  else if (sym_name_len == 1)
2864146040Stjr	    return collseqmb[br_elem->opr.name[0]];
2865146040Stjr	}
2866146040Stjr      return UINT_MAX;
2867146040Stjr    }
2868146040Stjr
2869146040Stjr  /* Local function for parse_bracket_exp used in _LIBC environement.
2870146040Stjr     Build the range expression which starts from START_ELEM, and ends
2871146040Stjr     at END_ELEM.  The result are written to MBCSET and SBCSET.
2872146040Stjr     RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2873146040Stjr     mbcset->range_ends, is a pointer argument sinse we may
2874146040Stjr     update it.  */
2875146040Stjr
2876146040Stjr  auto inline reg_errcode_t
2877146040Stjr  __attribute ((always_inline))
2878251435Sjkim  build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2879251435Sjkim		   bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2880146040Stjr    {
2881146040Stjr      unsigned int ch;
2882146040Stjr      uint32_t start_collseq;
2883146040Stjr      uint32_t end_collseq;
2884146040Stjr
2885146040Stjr      /* Equivalence Classes and Character Classes can't be a range
2886146040Stjr	 start/end.  */
2887146040Stjr      if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2888146040Stjr	      || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2889146040Stjr	      0))
2890146040Stjr	return REG_ERANGE;
2891146040Stjr
2892146040Stjr      start_collseq = lookup_collation_sequence_value (start_elem);
2893146040Stjr      end_collseq = lookup_collation_sequence_value (end_elem);
2894146040Stjr      /* Check start/end collation sequence values.  */
2895146040Stjr      if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2896146040Stjr	return REG_ECOLLATE;
2897146040Stjr      if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2898146040Stjr	return REG_ERANGE;
2899146040Stjr
2900146040Stjr      /* Got valid collation sequence values, add them as a new entry.
2901146040Stjr	 However, if we have no collation elements, and the character set
2902146040Stjr	 is single byte, the single byte character set that we
2903146040Stjr	 build below suffices. */
2904146040Stjr      if (nrules > 0 || dfa->mb_cur_max > 1)
2905146040Stjr	{
2906250724Sjkim	  /* Check the space of the arrays.  */
2907250724Sjkim	  if (BE (*range_alloc == mbcset->nranges, 0))
2908146040Stjr	    {
2909146040Stjr	      /* There is not enough space, need realloc.  */
2910146040Stjr	      uint32_t *new_array_start;
2911146040Stjr	      uint32_t *new_array_end;
2912146040Stjr	      int new_nranges;
2913146040Stjr
2914146040Stjr	      /* +1 in case of mbcset->nranges is 0.  */
2915146040Stjr	      new_nranges = 2 * mbcset->nranges + 1;
2916146040Stjr	      new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2917146040Stjr					    new_nranges);
2918146040Stjr	      new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2919250724Sjkim					  new_nranges);
2920146040Stjr
2921146040Stjr	      if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2922250724Sjkim		return REG_ESPACE;
2923146040Stjr
2924146040Stjr	      mbcset->range_starts = new_array_start;
2925146040Stjr	      mbcset->range_ends = new_array_end;
2926146040Stjr	      *range_alloc = new_nranges;
2927146040Stjr	    }
2928146040Stjr
2929250724Sjkim	  mbcset->range_starts[mbcset->nranges] = start_collseq;
2930250724Sjkim	  mbcset->range_ends[mbcset->nranges++] = end_collseq;
2931146040Stjr	}
2932146040Stjr
2933146040Stjr      /* Build the table for single byte characters.  */
2934146040Stjr      for (ch = 0; ch < SBC_MAX; ch++)
2935146040Stjr	{
2936146040Stjr	  uint32_t ch_collseq;
2937146040Stjr	  /*
2938146040Stjr	  if (MB_CUR_MAX == 1)
2939146040Stjr	  */
2940146040Stjr	  if (nrules == 0)
2941146040Stjr	    ch_collseq = collseqmb[ch];
2942146040Stjr	  else
2943146040Stjr	    ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2944146040Stjr	  if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2945146040Stjr	    bitset_set (sbcset, ch);
2946146040Stjr	}
2947146040Stjr      return REG_NOERROR;
2948146040Stjr    }
2949146040Stjr
2950146040Stjr  /* Local function for parse_bracket_exp used in _LIBC environement.
2951146040Stjr     Build the collating element which is represented by NAME.
2952146040Stjr     The result are written to MBCSET and SBCSET.
2953146040Stjr     COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2954146040Stjr     pointer argument sinse we may update it.  */
2955146040Stjr
2956146040Stjr  auto inline reg_errcode_t
2957146040Stjr  __attribute ((always_inline))
2958251435Sjkim  build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2959251435Sjkim			  int *coll_sym_alloc, const unsigned char *name)
2960146040Stjr    {
2961146040Stjr      int32_t elem, idx;
2962146040Stjr      size_t name_len = strlen ((const char *) name);
2963146040Stjr      if (nrules != 0)
2964146040Stjr	{
2965146040Stjr	  elem = seek_collating_symbol_entry (name, name_len);
2966251435Sjkim	  if (elem != -1)
2967146040Stjr	    {
2968146040Stjr	      /* We found the entry.  */
2969146040Stjr	      idx = symb_table[2 * elem + 1];
2970146040Stjr	      /* Skip the name of collating element name.  */
2971146040Stjr	      idx += 1 + extra[idx];
2972146040Stjr	    }
2973251435Sjkim	  else if (name_len == 1)
2974146040Stjr	    {
2975146040Stjr	      /* No valid character, treat it as a normal
2976146040Stjr		 character.  */
2977146040Stjr	      bitset_set (sbcset, name[0]);
2978146040Stjr	      return REG_NOERROR;
2979146040Stjr	    }
2980146040Stjr	  else
2981146040Stjr	    return REG_ECOLLATE;
2982146040Stjr
2983146040Stjr	  /* Got valid collation sequence, add it as a new entry.  */
2984146040Stjr	  /* Check the space of the arrays.  */
2985146040Stjr	  if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2986146040Stjr	    {
2987146040Stjr	      /* Not enough, realloc it.  */
2988146040Stjr	      /* +1 in case of mbcset->ncoll_syms is 0.  */
2989146040Stjr	      int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2990146040Stjr	      /* Use realloc since mbcset->coll_syms is NULL
2991146040Stjr		 if *alloc == 0.  */
2992146040Stjr	      int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2993146040Stjr						   new_coll_sym_alloc);
2994146040Stjr	      if (BE (new_coll_syms == NULL, 0))
2995146040Stjr		return REG_ESPACE;
2996146040Stjr	      mbcset->coll_syms = new_coll_syms;
2997146040Stjr	      *coll_sym_alloc = new_coll_sym_alloc;
2998146040Stjr	    }
2999146040Stjr	  mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3000146040Stjr	  return REG_NOERROR;
3001146040Stjr	}
3002146040Stjr      else
3003146040Stjr	{
3004146040Stjr	  if (BE (name_len != 1, 0))
3005146040Stjr	    return REG_ECOLLATE;
3006146040Stjr	  else
3007146040Stjr	    {
3008146040Stjr	      bitset_set (sbcset, name[0]);
3009146040Stjr	      return REG_NOERROR;
3010146040Stjr	    }
3011146040Stjr	}
3012146040Stjr    }
3013146040Stjr#endif
3014146040Stjr
3015146040Stjr  re_token_t br_token;
3016146040Stjr  re_bitset_ptr_t sbcset;
3017146040Stjr#ifdef RE_ENABLE_I18N
3018146040Stjr  re_charset_t *mbcset;
3019146040Stjr  int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3020146040Stjr  int equiv_class_alloc = 0, char_class_alloc = 0;
3021146040Stjr#endif /* not RE_ENABLE_I18N */
3022146040Stjr  int non_match = 0;
3023146040Stjr  bin_tree_t *work_tree;
3024146040Stjr  int token_len;
3025146040Stjr  int first_round = 1;
3026146040Stjr#ifdef _LIBC
3027146040Stjr  collseqmb = (const unsigned char *)
3028146040Stjr    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3029146040Stjr  nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3030146040Stjr  if (nrules)
3031146040Stjr    {
3032146040Stjr      /*
3033146040Stjr      if (MB_CUR_MAX > 1)
3034146040Stjr      */
3035250724Sjkim      collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3036146040Stjr      table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3037146040Stjr      symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3038146040Stjr						  _NL_COLLATE_SYMB_TABLEMB);
3039146040Stjr      extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3040146040Stjr						   _NL_COLLATE_SYMB_EXTRAMB);
3041146040Stjr    }
3042146040Stjr#endif
3043250724Sjkim  sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3044146040Stjr#ifdef RE_ENABLE_I18N
3045146040Stjr  mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3046146040Stjr#endif /* RE_ENABLE_I18N */
3047146040Stjr#ifdef RE_ENABLE_I18N
3048146040Stjr  if (BE (sbcset == NULL || mbcset == NULL, 0))
3049146040Stjr#else
3050146040Stjr  if (BE (sbcset == NULL, 0))
3051146040Stjr#endif /* RE_ENABLE_I18N */
3052146040Stjr    {
3053250724Sjkim      re_free (sbcset);
3054250724Sjkim#ifdef RE_ENABLE_I18N
3055250724Sjkim      re_free (mbcset);
3056250724Sjkim#endif
3057146040Stjr      *err = REG_ESPACE;
3058146040Stjr      return NULL;
3059146040Stjr    }
3060146040Stjr
3061146040Stjr  token_len = peek_token_bracket (token, regexp, syntax);
3062146040Stjr  if (BE (token->type == END_OF_RE, 0))
3063146040Stjr    {
3064146040Stjr      *err = REG_BADPAT;
3065146040Stjr      goto parse_bracket_exp_free_return;
3066146040Stjr    }
3067146040Stjr  if (token->type == OP_NON_MATCH_LIST)
3068146040Stjr    {
3069146040Stjr#ifdef RE_ENABLE_I18N
3070146040Stjr      mbcset->non_match = 1;
3071146040Stjr#endif /* not RE_ENABLE_I18N */
3072146040Stjr      non_match = 1;
3073146040Stjr      if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3074250724Sjkim	bitset_set (sbcset, '\n');
3075146040Stjr      re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3076146040Stjr      token_len = peek_token_bracket (token, regexp, syntax);
3077146040Stjr      if (BE (token->type == END_OF_RE, 0))
3078146040Stjr	{
3079146040Stjr	  *err = REG_BADPAT;
3080146040Stjr	  goto parse_bracket_exp_free_return;
3081146040Stjr	}
3082146040Stjr    }
3083146040Stjr
3084146040Stjr  /* We treat the first ']' as a normal character.  */
3085146040Stjr  if (token->type == OP_CLOSE_BRACKET)
3086146040Stjr    token->type = CHARACTER;
3087146040Stjr
3088146040Stjr  while (1)
3089146040Stjr    {
3090146040Stjr      bracket_elem_t start_elem, end_elem;
3091146040Stjr      unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3092146040Stjr      unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3093146040Stjr      reg_errcode_t ret;
3094146040Stjr      int token_len2 = 0, is_range_exp = 0;
3095146040Stjr      re_token_t token2;
3096146040Stjr
3097146040Stjr      start_elem.opr.name = start_name_buf;
3098146040Stjr      ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3099146040Stjr				   syntax, first_round);
3100146040Stjr      if (BE (ret != REG_NOERROR, 0))
3101146040Stjr	{
3102146040Stjr	  *err = ret;
3103146040Stjr	  goto parse_bracket_exp_free_return;
3104146040Stjr	}
3105146040Stjr      first_round = 0;
3106146040Stjr
3107146040Stjr      /* Get information about the next token.  We need it in any case.  */
3108146040Stjr      token_len = peek_token_bracket (token, regexp, syntax);
3109146040Stjr
3110146040Stjr      /* Do not check for ranges if we know they are not allowed.  */
3111146040Stjr      if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3112146040Stjr	{
3113146040Stjr	  if (BE (token->type == END_OF_RE, 0))
3114146040Stjr	    {
3115146040Stjr	      *err = REG_EBRACK;
3116146040Stjr	      goto parse_bracket_exp_free_return;
3117146040Stjr	    }
3118146040Stjr	  if (token->type == OP_CHARSET_RANGE)
3119146040Stjr	    {
3120146040Stjr	      re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3121146040Stjr	      token_len2 = peek_token_bracket (&token2, regexp, syntax);
3122146040Stjr	      if (BE (token2.type == END_OF_RE, 0))
3123146040Stjr		{
3124146040Stjr		  *err = REG_EBRACK;
3125146040Stjr		  goto parse_bracket_exp_free_return;
3126146040Stjr		}
3127146040Stjr	      if (token2.type == OP_CLOSE_BRACKET)
3128146040Stjr		{
3129146040Stjr		  /* We treat the last '-' as a normal character.  */
3130146040Stjr		  re_string_skip_bytes (regexp, -token_len);
3131146040Stjr		  token->type = CHARACTER;
3132146040Stjr		}
3133146040Stjr	      else
3134146040Stjr		is_range_exp = 1;
3135146040Stjr	    }
3136146040Stjr	}
3137146040Stjr
3138146040Stjr      if (is_range_exp == 1)
3139146040Stjr	{
3140146040Stjr	  end_elem.opr.name = end_name_buf;
3141146040Stjr	  ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3142146040Stjr				       dfa, syntax, 1);
3143146040Stjr	  if (BE (ret != REG_NOERROR, 0))
3144146040Stjr	    {
3145146040Stjr	      *err = ret;
3146146040Stjr	      goto parse_bracket_exp_free_return;
3147146040Stjr	    }
3148146040Stjr
3149146040Stjr	  token_len = peek_token_bracket (token, regexp, syntax);
3150146040Stjr
3151146040Stjr#ifdef _LIBC
3152146040Stjr	  *err = build_range_exp (sbcset, mbcset, &range_alloc,
3153146040Stjr				  &start_elem, &end_elem);
3154146040Stjr#else
3155146040Stjr# ifdef RE_ENABLE_I18N
3156146040Stjr	  *err = build_range_exp (sbcset,
3157146040Stjr				  dfa->mb_cur_max > 1 ? mbcset : NULL,
3158146040Stjr				  &range_alloc, &start_elem, &end_elem);
3159146040Stjr# else
3160146040Stjr	  *err = build_range_exp (sbcset, &start_elem, &end_elem);
3161146040Stjr# endif
3162146040Stjr#endif /* RE_ENABLE_I18N */
3163146040Stjr	  if (BE (*err != REG_NOERROR, 0))
3164146040Stjr	    goto parse_bracket_exp_free_return;
3165146040Stjr	}
3166146040Stjr      else
3167146040Stjr	{
3168146040Stjr	  switch (start_elem.type)
3169146040Stjr	    {
3170146040Stjr	    case SB_CHAR:
3171146040Stjr	      bitset_set (sbcset, start_elem.opr.ch);
3172146040Stjr	      break;
3173146040Stjr#ifdef RE_ENABLE_I18N
3174146040Stjr	    case MB_CHAR:
3175146040Stjr	      /* Check whether the array has enough space.  */
3176146040Stjr	      if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3177146040Stjr		{
3178146040Stjr		  wchar_t *new_mbchars;
3179146040Stjr		  /* Not enough, realloc it.  */
3180146040Stjr		  /* +1 in case of mbcset->nmbchars is 0.  */
3181146040Stjr		  mbchar_alloc = 2 * mbcset->nmbchars + 1;
3182146040Stjr		  /* Use realloc since array is NULL if *alloc == 0.  */
3183146040Stjr		  new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3184146040Stjr					    mbchar_alloc);
3185146040Stjr		  if (BE (new_mbchars == NULL, 0))
3186146040Stjr		    goto parse_bracket_exp_espace;
3187146040Stjr		  mbcset->mbchars = new_mbchars;
3188146040Stjr		}
3189146040Stjr	      mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3190146040Stjr	      break;
3191146040Stjr#endif /* RE_ENABLE_I18N */
3192146040Stjr	    case EQUIV_CLASS:
3193146040Stjr	      *err = build_equiv_class (sbcset,
3194146040Stjr#ifdef RE_ENABLE_I18N
3195146040Stjr					mbcset, &equiv_class_alloc,
3196146040Stjr#endif /* RE_ENABLE_I18N */
3197146040Stjr					start_elem.opr.name);
3198146040Stjr	      if (BE (*err != REG_NOERROR, 0))
3199146040Stjr		goto parse_bracket_exp_free_return;
3200146040Stjr	      break;
3201146040Stjr	    case COLL_SYM:
3202146040Stjr	      *err = build_collating_symbol (sbcset,
3203146040Stjr#ifdef RE_ENABLE_I18N
3204146040Stjr					     mbcset, &coll_sym_alloc,
3205146040Stjr#endif /* RE_ENABLE_I18N */
3206146040Stjr					     start_elem.opr.name);
3207146040Stjr	      if (BE (*err != REG_NOERROR, 0))
3208146040Stjr		goto parse_bracket_exp_free_return;
3209146040Stjr	      break;
3210146040Stjr	    case CHAR_CLASS:
3211146040Stjr	      *err = build_charclass (regexp->trans, sbcset,
3212146040Stjr#ifdef RE_ENABLE_I18N
3213146040Stjr				      mbcset, &char_class_alloc,
3214146040Stjr#endif /* RE_ENABLE_I18N */
3215146040Stjr				      start_elem.opr.name, syntax);
3216146040Stjr	      if (BE (*err != REG_NOERROR, 0))
3217146040Stjr	       goto parse_bracket_exp_free_return;
3218146040Stjr	      break;
3219146040Stjr	    default:
3220146040Stjr	      assert (0);
3221146040Stjr	      break;
3222146040Stjr	    }
3223146040Stjr	}
3224146040Stjr      if (BE (token->type == END_OF_RE, 0))
3225146040Stjr	{
3226146040Stjr	  *err = REG_EBRACK;
3227146040Stjr	  goto parse_bracket_exp_free_return;
3228146040Stjr	}
3229146040Stjr      if (token->type == OP_CLOSE_BRACKET)
3230146040Stjr	break;
3231146040Stjr    }
3232146040Stjr
3233146040Stjr  re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3234146040Stjr
3235146040Stjr  /* If it is non-matching list.  */
3236146040Stjr  if (non_match)
3237146040Stjr    bitset_not (sbcset);
3238146040Stjr
3239146040Stjr#ifdef RE_ENABLE_I18N
3240146040Stjr  /* Ensure only single byte characters are set.  */
3241146040Stjr  if (dfa->mb_cur_max > 1)
3242146040Stjr    bitset_mask (sbcset, dfa->sb_char);
3243146040Stjr
3244146040Stjr  if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3245146040Stjr      || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3246146040Stjr						     || mbcset->non_match)))
3247146040Stjr    {
3248146040Stjr      bin_tree_t *mbc_tree;
3249146040Stjr      int sbc_idx;
3250146040Stjr      /* Build a tree for complex bracket.  */
3251146040Stjr      dfa->has_mb_node = 1;
3252146040Stjr      br_token.type = COMPLEX_BRACKET;
3253146040Stjr      br_token.opr.mbcset = mbcset;
3254146040Stjr      mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3255146040Stjr      if (BE (mbc_tree == NULL, 0))
3256146040Stjr	goto parse_bracket_exp_espace;
3257250724Sjkim      for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3258146040Stjr	if (sbcset[sbc_idx])
3259146040Stjr	  break;
3260146040Stjr      /* If there are no bits set in sbcset, there is no point
3261146040Stjr	 of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3262250724Sjkim      if (sbc_idx < BITSET_WORDS)
3263146040Stjr	{
3264250724Sjkim	  /* Build a tree for simple bracket.  */
3265250724Sjkim	  br_token.type = SIMPLE_BRACKET;
3266250724Sjkim	  br_token.opr.sbcset = sbcset;
3267250724Sjkim	  work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3268250724Sjkim	  if (BE (work_tree == NULL, 0))
3269250724Sjkim	    goto parse_bracket_exp_espace;
3270146040Stjr
3271250724Sjkim	  /* Then join them by ALT node.  */
3272250724Sjkim	  work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3273250724Sjkim	  if (BE (work_tree == NULL, 0))
3274250724Sjkim	    goto parse_bracket_exp_espace;
3275146040Stjr	}
3276146040Stjr      else
3277146040Stjr	{
3278146040Stjr	  re_free (sbcset);
3279146040Stjr	  work_tree = mbc_tree;
3280146040Stjr	}
3281146040Stjr    }
3282146040Stjr  else
3283146040Stjr#endif /* not RE_ENABLE_I18N */
3284146040Stjr    {
3285146040Stjr#ifdef RE_ENABLE_I18N
3286146040Stjr      free_charset (mbcset);
3287146040Stjr#endif
3288146040Stjr      /* Build a tree for simple bracket.  */
3289146040Stjr      br_token.type = SIMPLE_BRACKET;
3290146040Stjr      br_token.opr.sbcset = sbcset;
3291146040Stjr      work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3292146040Stjr      if (BE (work_tree == NULL, 0))
3293250724Sjkim	goto parse_bracket_exp_espace;
3294146040Stjr    }
3295146040Stjr  return work_tree;
3296146040Stjr
3297146040Stjr parse_bracket_exp_espace:
3298146040Stjr  *err = REG_ESPACE;
3299146040Stjr parse_bracket_exp_free_return:
3300146040Stjr  re_free (sbcset);
3301146040Stjr#ifdef RE_ENABLE_I18N
3302146040Stjr  free_charset (mbcset);
3303146040Stjr#endif /* RE_ENABLE_I18N */
3304146040Stjr  return NULL;
3305146040Stjr}
3306146040Stjr
3307146040Stjr/* Parse an element in the bracket expression.  */
3308146040Stjr
3309146040Stjrstatic reg_errcode_t
3310250724Sjkimparse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3311250724Sjkim		       re_token_t *token, int token_len, re_dfa_t *dfa,
3312250724Sjkim		       reg_syntax_t syntax, int accept_hyphen)
3313146040Stjr{
3314146040Stjr#ifdef RE_ENABLE_I18N
3315146040Stjr  int cur_char_size;
3316146040Stjr  cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3317146040Stjr  if (cur_char_size > 1)
3318146040Stjr    {
3319146040Stjr      elem->type = MB_CHAR;
3320146040Stjr      elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3321146040Stjr      re_string_skip_bytes (regexp, cur_char_size);
3322146040Stjr      return REG_NOERROR;
3323146040Stjr    }
3324146040Stjr#endif /* RE_ENABLE_I18N */
3325146040Stjr  re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3326146040Stjr  if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3327146040Stjr      || token->type == OP_OPEN_EQUIV_CLASS)
3328146040Stjr    return parse_bracket_symbol (elem, regexp, token);
3329146040Stjr  if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3330146040Stjr    {
3331146040Stjr      /* A '-' must only appear as anything but a range indicator before
3332146040Stjr	 the closing bracket.  Everything else is an error.  */
3333146040Stjr      re_token_t token2;
3334146040Stjr      (void) peek_token_bracket (&token2, regexp, syntax);
3335146040Stjr      if (token2.type != OP_CLOSE_BRACKET)
3336146040Stjr	/* The actual error value is not standardized since this whole
3337146040Stjr	   case is undefined.  But ERANGE makes good sense.  */
3338146040Stjr	return REG_ERANGE;
3339146040Stjr    }
3340146040Stjr  elem->type = SB_CHAR;
3341146040Stjr  elem->opr.ch = token->opr.c;
3342146040Stjr  return REG_NOERROR;
3343146040Stjr}
3344146040Stjr
3345146040Stjr/* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3346146040Stjr   such as [:<character_class>:], [.<collating_element>.], and
3347146040Stjr   [=<equivalent_class>=].  */
3348146040Stjr
3349146040Stjrstatic reg_errcode_t
3350250724Sjkimparse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3351250724Sjkim		      re_token_t *token)
3352146040Stjr{
3353146040Stjr  unsigned char ch, delim = token->opr.c;
3354146040Stjr  int i = 0;
3355146040Stjr  if (re_string_eoi(regexp))
3356146040Stjr    return REG_EBRACK;
3357146040Stjr  for (;; ++i)
3358146040Stjr    {
3359146040Stjr      if (i >= BRACKET_NAME_BUF_SIZE)
3360146040Stjr	return REG_EBRACK;
3361146040Stjr      if (token->type == OP_OPEN_CHAR_CLASS)
3362146040Stjr	ch = re_string_fetch_byte_case (regexp);
3363146040Stjr      else
3364146040Stjr	ch = re_string_fetch_byte (regexp);
3365146040Stjr      if (re_string_eoi(regexp))
3366146040Stjr	return REG_EBRACK;
3367146040Stjr      if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3368146040Stjr	break;
3369146040Stjr      elem->opr.name[i] = ch;
3370146040Stjr    }
3371146040Stjr  re_string_skip_bytes (regexp, 1);
3372146040Stjr  elem->opr.name[i] = '\0';
3373146040Stjr  switch (token->type)
3374146040Stjr    {
3375146040Stjr    case OP_OPEN_COLL_ELEM:
3376146040Stjr      elem->type = COLL_SYM;
3377146040Stjr      break;
3378146040Stjr    case OP_OPEN_EQUIV_CLASS:
3379146040Stjr      elem->type = EQUIV_CLASS;
3380146040Stjr      break;
3381146040Stjr    case OP_OPEN_CHAR_CLASS:
3382146040Stjr      elem->type = CHAR_CLASS;
3383146040Stjr      break;
3384146040Stjr    default:
3385146040Stjr      break;
3386146040Stjr    }
3387146040Stjr  return REG_NOERROR;
3388146040Stjr}
3389146040Stjr
3390146040Stjr  /* Helper function for parse_bracket_exp.
3391146040Stjr     Build the equivalence class which is represented by NAME.
3392146040Stjr     The result are written to MBCSET and SBCSET.
3393146040Stjr     EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3394146040Stjr     is a pointer argument sinse we may update it.  */
3395146040Stjr
3396146040Stjrstatic reg_errcode_t
3397146040Stjr#ifdef RE_ENABLE_I18N
3398250724Sjkimbuild_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3399250724Sjkim		   int *equiv_class_alloc, const unsigned char *name)
3400146040Stjr#else /* not RE_ENABLE_I18N */
3401250724Sjkimbuild_equiv_class (bitset_t sbcset, const unsigned char *name)
3402146040Stjr#endif /* not RE_ENABLE_I18N */
3403146040Stjr{
3404250724Sjkim#ifdef _LIBC
3405146040Stjr  uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3406146040Stjr  if (nrules != 0)
3407146040Stjr    {
3408146040Stjr      const int32_t *table, *indirect;
3409146040Stjr      const unsigned char *weights, *extra, *cp;
3410146040Stjr      unsigned char char_buf[2];
3411146040Stjr      int32_t idx1, idx2;
3412146040Stjr      unsigned int ch;
3413146040Stjr      size_t len;
3414146040Stjr      /* This #include defines a local function!  */
3415146040Stjr# include <locale/weight.h>
3416146040Stjr      /* Calculate the index for equivalence class.  */
3417146040Stjr      cp = name;
3418146040Stjr      table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3419146040Stjr      weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3420146040Stjr					       _NL_COLLATE_WEIGHTMB);
3421146040Stjr      extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3422146040Stjr						   _NL_COLLATE_EXTRAMB);
3423146040Stjr      indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3424146040Stjr						_NL_COLLATE_INDIRECTMB);
3425250724Sjkim      idx1 = findidx (&cp, -1);
3426250724Sjkim      if (BE (idx1 == 0 || *cp != '\0', 0))
3427146040Stjr	/* This isn't a valid character.  */
3428146040Stjr	return REG_ECOLLATE;
3429146040Stjr
3430146040Stjr      /* Build single byte matcing table for this equivalence class.  */
3431250724Sjkim      len = weights[idx1 & 0xffffff];
3432146040Stjr      for (ch = 0; ch < SBC_MAX; ++ch)
3433146040Stjr	{
3434146040Stjr	  char_buf[0] = ch;
3435146040Stjr	  cp = char_buf;
3436250724Sjkim	  idx2 = findidx (&cp, 1);
3437146040Stjr/*
3438146040Stjr	  idx2 = table[ch];
3439146040Stjr*/
3440146040Stjr	  if (idx2 == 0)
3441146040Stjr	    /* This isn't a valid character.  */
3442146040Stjr	    continue;
3443250724Sjkim	  /* Compare only if the length matches and the collation rule
3444250724Sjkim	     index is the same.  */
3445250724Sjkim	  if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3446146040Stjr	    {
3447146040Stjr	      int cnt = 0;
3448250724Sjkim
3449146040Stjr	      while (cnt <= len &&
3450250724Sjkim		     weights[(idx1 & 0xffffff) + 1 + cnt]
3451250724Sjkim		     == weights[(idx2 & 0xffffff) + 1 + cnt])
3452146040Stjr		++cnt;
3453146040Stjr
3454146040Stjr	      if (cnt > len)
3455146040Stjr		bitset_set (sbcset, ch);
3456146040Stjr	    }
3457146040Stjr	}
3458146040Stjr      /* Check whether the array has enough space.  */
3459146040Stjr      if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3460146040Stjr	{
3461146040Stjr	  /* Not enough, realloc it.  */
3462146040Stjr	  /* +1 in case of mbcset->nequiv_classes is 0.  */
3463146040Stjr	  int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3464146040Stjr	  /* Use realloc since the array is NULL if *alloc == 0.  */
3465146040Stjr	  int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3466146040Stjr						   int32_t,
3467146040Stjr						   new_equiv_class_alloc);
3468146040Stjr	  if (BE (new_equiv_classes == NULL, 0))
3469146040Stjr	    return REG_ESPACE;
3470146040Stjr	  mbcset->equiv_classes = new_equiv_classes;
3471146040Stjr	  *equiv_class_alloc = new_equiv_class_alloc;
3472146040Stjr	}
3473146040Stjr      mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3474146040Stjr    }
3475146040Stjr  else
3476146040Stjr#endif /* _LIBC */
3477146040Stjr    {
3478146040Stjr      if (BE (strlen ((const char *) name) != 1, 0))
3479146040Stjr	return REG_ECOLLATE;
3480146040Stjr      bitset_set (sbcset, *name);
3481146040Stjr    }
3482146040Stjr  return REG_NOERROR;
3483146040Stjr}
3484146040Stjr
3485146040Stjr  /* Helper function for parse_bracket_exp.
3486146040Stjr     Build the character class which is represented by NAME.
3487146040Stjr     The result are written to MBCSET and SBCSET.
3488146040Stjr     CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3489146040Stjr     is a pointer argument sinse we may update it.  */
3490146040Stjr
3491146040Stjrstatic reg_errcode_t
3492146040Stjr#ifdef RE_ENABLE_I18N
3493250724Sjkimbuild_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3494250724Sjkim		 re_charset_t *mbcset, int *char_class_alloc,
3495250724Sjkim		 const unsigned char *class_name, reg_syntax_t syntax)
3496146040Stjr#else /* not RE_ENABLE_I18N */
3497250724Sjkimbuild_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3498250724Sjkim		 const unsigned char *class_name, reg_syntax_t syntax)
3499146040Stjr#endif /* not RE_ENABLE_I18N */
3500146040Stjr{
3501146040Stjr  int i;
3502146040Stjr  const char *name = (const char *) class_name;
3503146040Stjr
3504146040Stjr  /* In case of REG_ICASE "upper" and "lower" match the both of
3505146040Stjr     upper and lower cases.  */
3506146040Stjr  if ((syntax & RE_ICASE)
3507146040Stjr      && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3508146040Stjr    name = "alpha";
3509146040Stjr
3510146040Stjr#ifdef RE_ENABLE_I18N
3511146040Stjr  /* Check the space of the arrays.  */
3512146040Stjr  if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3513146040Stjr    {
3514146040Stjr      /* Not enough, realloc it.  */
3515146040Stjr      /* +1 in case of mbcset->nchar_classes is 0.  */
3516146040Stjr      int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3517146040Stjr      /* Use realloc since array is NULL if *alloc == 0.  */
3518146040Stjr      wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3519146040Stjr					       new_char_class_alloc);
3520146040Stjr      if (BE (new_char_classes == NULL, 0))
3521146040Stjr	return REG_ESPACE;
3522146040Stjr      mbcset->char_classes = new_char_classes;
3523146040Stjr      *char_class_alloc = new_char_class_alloc;
3524146040Stjr    }
3525146040Stjr  mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3526146040Stjr#endif /* RE_ENABLE_I18N */
3527146040Stjr
3528146040Stjr#define BUILD_CHARCLASS_LOOP(ctype_func)	\
3529250724Sjkim  do {						\
3530250724Sjkim    if (BE (trans != NULL, 0))			\
3531146040Stjr      {						\
3532250724Sjkim	for (i = 0; i < SBC_MAX; ++i)		\
3533250724Sjkim  	  if (ctype_func (i))			\
3534250724Sjkim	    bitset_set (sbcset, trans[i]);	\
3535250724Sjkim      }						\
3536250724Sjkim    else					\
3537250724Sjkim      {						\
3538250724Sjkim	for (i = 0; i < SBC_MAX; ++i)		\
3539250724Sjkim  	  if (ctype_func (i))			\
3540250724Sjkim	    bitset_set (sbcset, i);		\
3541250724Sjkim      }						\
3542250724Sjkim  } while (0)
3543146040Stjr
3544146040Stjr  if (strcmp (name, "alnum") == 0)
3545250724Sjkim    BUILD_CHARCLASS_LOOP (isalnum);
3546146040Stjr  else if (strcmp (name, "cntrl") == 0)
3547250724Sjkim    BUILD_CHARCLASS_LOOP (iscntrl);
3548146040Stjr  else if (strcmp (name, "lower") == 0)
3549250724Sjkim    BUILD_CHARCLASS_LOOP (islower);
3550146040Stjr  else if (strcmp (name, "space") == 0)
3551250724Sjkim    BUILD_CHARCLASS_LOOP (isspace);
3552146040Stjr  else if (strcmp (name, "alpha") == 0)
3553250724Sjkim    BUILD_CHARCLASS_LOOP (isalpha);
3554146040Stjr  else if (strcmp (name, "digit") == 0)
3555250724Sjkim    BUILD_CHARCLASS_LOOP (isdigit);
3556146040Stjr  else if (strcmp (name, "print") == 0)
3557250724Sjkim    BUILD_CHARCLASS_LOOP (isprint);
3558146040Stjr  else if (strcmp (name, "upper") == 0)
3559250724Sjkim    BUILD_CHARCLASS_LOOP (isupper);
3560146040Stjr  else if (strcmp (name, "blank") == 0)
3561250724Sjkim    BUILD_CHARCLASS_LOOP (isblank);
3562146040Stjr  else if (strcmp (name, "graph") == 0)
3563250724Sjkim    BUILD_CHARCLASS_LOOP (isgraph);
3564146040Stjr  else if (strcmp (name, "punct") == 0)
3565250724Sjkim    BUILD_CHARCLASS_LOOP (ispunct);
3566146040Stjr  else if (strcmp (name, "xdigit") == 0)
3567250724Sjkim    BUILD_CHARCLASS_LOOP (isxdigit);
3568146040Stjr  else
3569146040Stjr    return REG_ECTYPE;
3570146040Stjr
3571146040Stjr  return REG_NOERROR;
3572146040Stjr}
3573146040Stjr
3574146040Stjrstatic bin_tree_t *
3575250724Sjkimbuild_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3576250724Sjkim		    const unsigned char *class_name,
3577250724Sjkim		    const unsigned char *extra, int non_match,
3578250724Sjkim		    reg_errcode_t *err)
3579146040Stjr{
3580146040Stjr  re_bitset_ptr_t sbcset;
3581146040Stjr#ifdef RE_ENABLE_I18N
3582146040Stjr  re_charset_t *mbcset;
3583146040Stjr  int alloc = 0;
3584146040Stjr#endif /* not RE_ENABLE_I18N */
3585146040Stjr  reg_errcode_t ret;
3586146040Stjr  re_token_t br_token;
3587146040Stjr  bin_tree_t *tree;
3588146040Stjr
3589250724Sjkim  sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3590146040Stjr#ifdef RE_ENABLE_I18N
3591146040Stjr  mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3592146040Stjr#endif /* RE_ENABLE_I18N */
3593146040Stjr
3594146040Stjr#ifdef RE_ENABLE_I18N
3595146040Stjr  if (BE (sbcset == NULL || mbcset == NULL, 0))
3596146040Stjr#else /* not RE_ENABLE_I18N */
3597146040Stjr  if (BE (sbcset == NULL, 0))
3598146040Stjr#endif /* not RE_ENABLE_I18N */
3599146040Stjr    {
3600146040Stjr      *err = REG_ESPACE;
3601146040Stjr      return NULL;
3602146040Stjr    }
3603146040Stjr
3604146040Stjr  if (non_match)
3605146040Stjr    {
3606146040Stjr#ifdef RE_ENABLE_I18N
3607146040Stjr      mbcset->non_match = 1;
3608146040Stjr#endif /* not RE_ENABLE_I18N */
3609146040Stjr    }
3610146040Stjr
3611146040Stjr  /* We don't care the syntax in this case.  */
3612146040Stjr  ret = build_charclass (trans, sbcset,
3613146040Stjr#ifdef RE_ENABLE_I18N
3614146040Stjr			 mbcset, &alloc,
3615146040Stjr#endif /* RE_ENABLE_I18N */
3616146040Stjr			 class_name, 0);
3617146040Stjr
3618146040Stjr  if (BE (ret != REG_NOERROR, 0))
3619146040Stjr    {
3620146040Stjr      re_free (sbcset);
3621146040Stjr#ifdef RE_ENABLE_I18N
3622146040Stjr      free_charset (mbcset);
3623146040Stjr#endif /* RE_ENABLE_I18N */
3624146040Stjr      *err = ret;
3625146040Stjr      return NULL;
3626146040Stjr    }
3627146040Stjr  /* \w match '_' also.  */
3628146040Stjr  for (; *extra; extra++)
3629146040Stjr    bitset_set (sbcset, *extra);
3630146040Stjr
3631146040Stjr  /* If it is non-matching list.  */
3632146040Stjr  if (non_match)
3633146040Stjr    bitset_not (sbcset);
3634146040Stjr
3635146040Stjr#ifdef RE_ENABLE_I18N
3636146040Stjr  /* Ensure only single byte characters are set.  */
3637146040Stjr  if (dfa->mb_cur_max > 1)
3638146040Stjr    bitset_mask (sbcset, dfa->sb_char);
3639146040Stjr#endif
3640146040Stjr
3641146040Stjr  /* Build a tree for simple bracket.  */
3642146040Stjr  br_token.type = SIMPLE_BRACKET;
3643146040Stjr  br_token.opr.sbcset = sbcset;
3644146040Stjr  tree = create_token_tree (dfa, NULL, NULL, &br_token);
3645146040Stjr  if (BE (tree == NULL, 0))
3646146040Stjr    goto build_word_op_espace;
3647146040Stjr
3648146040Stjr#ifdef RE_ENABLE_I18N
3649146040Stjr  if (dfa->mb_cur_max > 1)
3650146040Stjr    {
3651146040Stjr      bin_tree_t *mbc_tree;
3652146040Stjr      /* Build a tree for complex bracket.  */
3653146040Stjr      br_token.type = COMPLEX_BRACKET;
3654146040Stjr      br_token.opr.mbcset = mbcset;
3655146040Stjr      dfa->has_mb_node = 1;
3656146040Stjr      mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3657146040Stjr      if (BE (mbc_tree == NULL, 0))
3658146040Stjr	goto build_word_op_espace;
3659146040Stjr      /* Then join them by ALT node.  */
3660146040Stjr      tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3661146040Stjr      if (BE (mbc_tree != NULL, 1))
3662146040Stjr	return tree;
3663146040Stjr    }
3664146040Stjr  else
3665146040Stjr    {
3666146040Stjr      free_charset (mbcset);
3667146040Stjr      return tree;
3668146040Stjr    }
3669146040Stjr#else /* not RE_ENABLE_I18N */
3670146040Stjr  return tree;
3671146040Stjr#endif /* not RE_ENABLE_I18N */
3672146040Stjr
3673146040Stjr build_word_op_espace:
3674146040Stjr  re_free (sbcset);
3675146040Stjr#ifdef RE_ENABLE_I18N
3676146040Stjr  free_charset (mbcset);
3677146040Stjr#endif /* RE_ENABLE_I18N */
3678146040Stjr  *err = REG_ESPACE;
3679146040Stjr  return NULL;
3680146040Stjr}
3681146040Stjr
3682146040Stjr/* This is intended for the expressions like "a{1,3}".
3683146040Stjr   Fetch a number from `input', and return the number.
3684146040Stjr   Return -1, if the number field is empty like "{,1}".
3685146040Stjr   Return -2, If an error is occured.  */
3686146040Stjr
3687146040Stjrstatic int
3688250724Sjkimfetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3689146040Stjr{
3690146040Stjr  int num = -1;
3691146040Stjr  unsigned char c;
3692146040Stjr  while (1)
3693146040Stjr    {
3694146040Stjr      fetch_token (token, input, syntax);
3695146040Stjr      c = token->opr.c;
3696146040Stjr      if (BE (token->type == END_OF_RE, 0))
3697146040Stjr	return -2;
3698146040Stjr      if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3699146040Stjr	break;
3700146040Stjr      num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3701146040Stjr	     ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3702146040Stjr      num = (num > RE_DUP_MAX) ? -2 : num;
3703146040Stjr    }
3704146040Stjr  return num;
3705146040Stjr}
3706146040Stjr
3707146040Stjr#ifdef RE_ENABLE_I18N
3708146040Stjrstatic void
3709146040Stjrfree_charset (re_charset_t *cset)
3710146040Stjr{
3711146040Stjr  re_free (cset->mbchars);
3712146040Stjr# ifdef _LIBC
3713146040Stjr  re_free (cset->coll_syms);
3714146040Stjr  re_free (cset->equiv_classes);
3715146040Stjr  re_free (cset->range_starts);
3716146040Stjr  re_free (cset->range_ends);
3717146040Stjr# endif
3718146040Stjr  re_free (cset->char_classes);
3719146040Stjr  re_free (cset);
3720146040Stjr}
3721146040Stjr#endif /* RE_ENABLE_I18N */
3722146040Stjr
3723146040Stjr/* Functions for binary tree operation.  */
3724146040Stjr
3725146040Stjr/* Create a tree node.  */
3726146040Stjr
3727146040Stjrstatic bin_tree_t *
3728250724Sjkimcreate_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3729250724Sjkim	     re_token_type_t type)
3730146040Stjr{
3731146040Stjr  re_token_t t;
3732146040Stjr  t.type = type;
3733146040Stjr  return create_token_tree (dfa, left, right, &t);
3734146040Stjr}
3735146040Stjr
3736146040Stjrstatic bin_tree_t *
3737250724Sjkimcreate_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3738250724Sjkim		   const re_token_t *token)
3739146040Stjr{
3740146040Stjr  bin_tree_t *tree;
3741146040Stjr  if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3742146040Stjr    {
3743146040Stjr      bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3744146040Stjr
3745146040Stjr      if (storage == NULL)
3746146040Stjr	return NULL;
3747146040Stjr      storage->next = dfa->str_tree_storage;
3748146040Stjr      dfa->str_tree_storage = storage;
3749146040Stjr      dfa->str_tree_storage_idx = 0;
3750146040Stjr    }
3751146040Stjr  tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3752146040Stjr
3753146040Stjr  tree->parent = NULL;
3754146040Stjr  tree->left = left;
3755146040Stjr  tree->right = right;
3756146040Stjr  tree->token = *token;
3757146040Stjr  tree->token.duplicated = 0;
3758146040Stjr  tree->token.opt_subexp = 0;
3759146040Stjr  tree->first = NULL;
3760146040Stjr  tree->next = NULL;
3761146040Stjr  tree->node_idx = -1;
3762146040Stjr
3763146040Stjr  if (left != NULL)
3764146040Stjr    left->parent = tree;
3765146040Stjr  if (right != NULL)
3766146040Stjr    right->parent = tree;
3767146040Stjr  return tree;
3768146040Stjr}
3769146040Stjr
3770146040Stjr/* Mark the tree SRC as an optional subexpression.
3771146040Stjr   To be called from preorder or postorder.  */
3772146040Stjr
3773146040Stjrstatic reg_errcode_t
3774250724Sjkimmark_opt_subexp (void *extra, bin_tree_t *node)
3775146040Stjr{
3776146040Stjr  int idx = (int) (long) extra;
3777146040Stjr  if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3778146040Stjr    node->token.opt_subexp = 1;
3779146040Stjr
3780146040Stjr  return REG_NOERROR;
3781146040Stjr}
3782146040Stjr
3783146040Stjr/* Free the allocated memory inside NODE. */
3784146040Stjr
3785146040Stjrstatic void
3786146040Stjrfree_token (re_token_t *node)
3787146040Stjr{
3788146040Stjr#ifdef RE_ENABLE_I18N
3789146040Stjr  if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3790146040Stjr    free_charset (node->opr.mbcset);
3791146040Stjr  else
3792146040Stjr#endif /* RE_ENABLE_I18N */
3793146040Stjr    if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3794146040Stjr      re_free (node->opr.sbcset);
3795146040Stjr}
3796146040Stjr
3797146040Stjr/* Worker function for tree walking.  Free the allocated memory inside NODE
3798146040Stjr   and its children. */
3799146040Stjr
3800146040Stjrstatic reg_errcode_t
3801146040Stjrfree_tree (void *extra, bin_tree_t *node)
3802146040Stjr{
3803146040Stjr  free_token (&node->token);
3804146040Stjr  return REG_NOERROR;
3805146040Stjr}
3806146040Stjr
3807146040Stjr
3808146040Stjr/* Duplicate the node SRC, and return new node.  This is a preorder
3809146040Stjr   visit similar to the one implemented by the generic visitor, but
3810146040Stjr   we need more infrastructure to maintain two parallel trees --- so,
3811146040Stjr   it's easier to duplicate.  */
3812146040Stjr
3813146040Stjrstatic bin_tree_t *
3814250724Sjkimduplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3815146040Stjr{
3816146040Stjr  const bin_tree_t *node;
3817146040Stjr  bin_tree_t *dup_root;
3818146040Stjr  bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3819146040Stjr
3820146040Stjr  for (node = root; ; )
3821146040Stjr    {
3822146040Stjr      /* Create a new tree and link it back to the current parent.  */
3823146040Stjr      *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3824146040Stjr      if (*p_new == NULL)
3825146040Stjr	return NULL;
3826146040Stjr      (*p_new)->parent = dup_node;
3827146040Stjr      (*p_new)->token.duplicated = 1;
3828146040Stjr      dup_node = *p_new;
3829146040Stjr
3830146040Stjr      /* Go to the left node, or up and to the right.  */
3831146040Stjr      if (node->left)
3832146040Stjr	{
3833146040Stjr	  node = node->left;
3834146040Stjr	  p_new = &dup_node->left;
3835146040Stjr	}
3836146040Stjr      else
3837146040Stjr	{
3838146040Stjr	  const bin_tree_t *prev = NULL;
3839146040Stjr	  while (node->right == prev || node->right == NULL)
3840146040Stjr	    {
3841146040Stjr	      prev = node;
3842146040Stjr	      node = node->parent;
3843146040Stjr	      dup_node = dup_node->parent;
3844146040Stjr	      if (!node)
3845250724Sjkim		return dup_root;
3846146040Stjr	    }
3847146040Stjr	  node = node->right;
3848146040Stjr	  p_new = &dup_node->right;
3849146040Stjr	}
3850146040Stjr    }
3851146040Stjr}
3852