1/* Extended regular expression matching and search library.
2   Copyright (C) 2002-2006, 2010, 2011 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4   Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6   The GNU C Library is free software; you can redistribute it and/or
7   modify it under the terms of the GNU Lesser General Public
8   License as published by the Free Software Foundation; either
9   version 2.1 of the License, or (at your option) any later version.
10
11   The GNU C Library is distributed in the hope that it will be useful,
12   but WITHOUT ANY WARRANTY; without even the implied warranty of
13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14   Lesser General Public License for more details.
15
16   You should have received a copy of the GNU Lesser General Public
17   License along with the GNU C Library; if not, see
18   <http://www.gnu.org/licenses/>.  */
19
20static void re_string_construct_common (const char *str, int len,
21					re_string_t *pstr,
22					RE_TRANSLATE_TYPE trans, int icase,
23					const re_dfa_t *dfa) internal_function;
24static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
25					  const re_node_set *nodes,
26					  unsigned int hash) internal_function;
27static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
28					  const re_node_set *nodes,
29					  unsigned int context,
30					  unsigned int hash) internal_function;
31
32/* Functions for string operation.  */
33
34/* This function allocate the buffers.  It is necessary to call
35   re_string_reconstruct before using the object.  */
36
37static reg_errcode_t
38internal_function __attribute_warn_unused_result__
39re_string_allocate (re_string_t *pstr, const char *str, int len, int init_len,
40		    RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
41{
42  reg_errcode_t ret;
43  int init_buf_len;
44
45  /* Ensure at least one character fits into the buffers.  */
46  if (init_len < dfa->mb_cur_max)
47    init_len = dfa->mb_cur_max;
48  init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
49  re_string_construct_common (str, len, pstr, trans, icase, dfa);
50
51  ret = re_string_realloc_buffers (pstr, init_buf_len);
52  if (BE (ret != REG_NOERROR, 0))
53    return ret;
54
55  pstr->word_char = dfa->word_char;
56  pstr->word_ops_used = dfa->word_ops_used;
57  pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
58  pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
59  pstr->valid_raw_len = pstr->valid_len;
60  return REG_NOERROR;
61}
62
63/* This function allocate the buffers, and initialize them.  */
64
65static reg_errcode_t
66internal_function __attribute_warn_unused_result__
67re_string_construct (re_string_t *pstr, const char *str, int len,
68		     RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
69{
70  reg_errcode_t ret;
71  memset (pstr, '\0', sizeof (re_string_t));
72  re_string_construct_common (str, len, pstr, trans, icase, dfa);
73
74  if (len > 0)
75    {
76      ret = re_string_realloc_buffers (pstr, len + 1);
77      if (BE (ret != REG_NOERROR, 0))
78	return ret;
79    }
80  pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
81
82  if (icase)
83    {
84#ifdef RE_ENABLE_I18N
85      if (dfa->mb_cur_max > 1)
86	{
87	  while (1)
88	    {
89	      ret = build_wcs_upper_buffer (pstr);
90	      if (BE (ret != REG_NOERROR, 0))
91		return ret;
92	      if (pstr->valid_raw_len >= len)
93		break;
94	      if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
95		break;
96	      ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
97	      if (BE (ret != REG_NOERROR, 0))
98		return ret;
99	    }
100	}
101      else
102#endif /* RE_ENABLE_I18N  */
103	build_upper_buffer (pstr);
104    }
105  else
106    {
107#ifdef RE_ENABLE_I18N
108      if (dfa->mb_cur_max > 1)
109	build_wcs_buffer (pstr);
110      else
111#endif /* RE_ENABLE_I18N  */
112	{
113	  if (trans != NULL)
114	    re_string_translate_buffer (pstr);
115	  else
116	    {
117	      pstr->valid_len = pstr->bufs_len;
118	      pstr->valid_raw_len = pstr->bufs_len;
119	    }
120	}
121    }
122
123  return REG_NOERROR;
124}
125
126/* Helper functions for re_string_allocate, and re_string_construct.  */
127
128static reg_errcode_t
129internal_function __attribute_warn_unused_result__
130re_string_realloc_buffers (re_string_t *pstr, int new_buf_len)
131{
132#ifdef RE_ENABLE_I18N
133  if (pstr->mb_cur_max > 1)
134    {
135      wint_t *new_wcs;
136
137      /* Avoid overflow in realloc.  */
138      const size_t max_object_size = MAX (sizeof (wint_t), sizeof (int));
139      if (BE (SIZE_MAX / max_object_size < new_buf_len, 0))
140	return REG_ESPACE;
141
142      new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
143      if (BE (new_wcs == NULL, 0))
144	return REG_ESPACE;
145      pstr->wcs = new_wcs;
146      if (pstr->offsets != NULL)
147	{
148	  int *new_offsets = re_realloc (pstr->offsets, int, new_buf_len);
149	  if (BE (new_offsets == NULL, 0))
150	    return REG_ESPACE;
151	  pstr->offsets = new_offsets;
152	}
153    }
154#endif /* RE_ENABLE_I18N  */
155  if (pstr->mbs_allocated)
156    {
157      unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
158					   new_buf_len);
159      if (BE (new_mbs == NULL, 0))
160	return REG_ESPACE;
161      pstr->mbs = new_mbs;
162    }
163  pstr->bufs_len = new_buf_len;
164  return REG_NOERROR;
165}
166
167
168static void
169internal_function
170re_string_construct_common (const char *str, int len, re_string_t *pstr,
171			    RE_TRANSLATE_TYPE trans, int icase,
172			    const re_dfa_t *dfa)
173{
174  pstr->raw_mbs = (const unsigned char *) str;
175  pstr->len = len;
176  pstr->raw_len = len;
177  pstr->trans = trans;
178  pstr->icase = icase ? 1 : 0;
179  pstr->mbs_allocated = (trans != NULL || icase);
180  pstr->mb_cur_max = dfa->mb_cur_max;
181  pstr->is_utf8 = dfa->is_utf8;
182  pstr->map_notascii = dfa->map_notascii;
183  pstr->stop = pstr->len;
184  pstr->raw_stop = pstr->stop;
185}
186
187#ifdef RE_ENABLE_I18N
188
189/* Build wide character buffer PSTR->WCS.
190   If the byte sequence of the string are:
191     <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
192   Then wide character buffer will be:
193     <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
194   We use WEOF for padding, they indicate that the position isn't
195   a first byte of a multibyte character.
196
197   Note that this function assumes PSTR->VALID_LEN elements are already
198   built and starts from PSTR->VALID_LEN.  */
199
200static void
201internal_function
202build_wcs_buffer (re_string_t *pstr)
203{
204#ifdef _LIBC
205  unsigned char buf[MB_LEN_MAX];
206  assert (MB_LEN_MAX >= pstr->mb_cur_max);
207#else
208  unsigned char buf[64];
209#endif
210  mbstate_t prev_st;
211  int byte_idx, end_idx, remain_len;
212  size_t mbclen;
213
214  /* Build the buffers from pstr->valid_len to either pstr->len or
215     pstr->bufs_len.  */
216  end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
217  for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
218    {
219      wchar_t wc;
220      const char *p;
221
222      remain_len = end_idx - byte_idx;
223      prev_st = pstr->cur_state;
224      /* Apply the translation if we need.  */
225      if (BE (pstr->trans != NULL, 0))
226	{
227	  int i, ch;
228
229	  for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
230	    {
231	      ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
232	      buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
233	    }
234	  p = (const char *) buf;
235	}
236      else
237	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
238      mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
239      if (BE (mbclen == (size_t) -1 || mbclen == 0
240	      || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len), 0))
241	{
242	  /* We treat these cases as a singlebyte character.  */
243	  mbclen = 1;
244	  wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
245	  if (BE (pstr->trans != NULL, 0))
246	    wc = pstr->trans[wc];
247	  pstr->cur_state = prev_st;
248	}
249      else if (BE (mbclen == (size_t) -2, 0))
250	{
251	  /* The buffer doesn't have enough space, finish to build.  */
252	  pstr->cur_state = prev_st;
253	  break;
254	}
255
256      /* Write wide character and padding.  */
257      pstr->wcs[byte_idx++] = wc;
258      /* Write paddings.  */
259      for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
260	pstr->wcs[byte_idx++] = WEOF;
261    }
262  pstr->valid_len = byte_idx;
263  pstr->valid_raw_len = byte_idx;
264}
265
266/* Build wide character buffer PSTR->WCS like build_wcs_buffer,
267   but for REG_ICASE.  */
268
269static reg_errcode_t
270internal_function __attribute_warn_unused_result__
271build_wcs_upper_buffer (re_string_t *pstr)
272{
273  mbstate_t prev_st;
274  int src_idx, byte_idx, end_idx, remain_len;
275  size_t mbclen;
276#ifdef _LIBC
277  char buf[MB_LEN_MAX];
278  assert (MB_LEN_MAX >= pstr->mb_cur_max);
279#else
280  char buf[64];
281#endif
282
283  byte_idx = pstr->valid_len;
284  end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
285
286  /* The following optimization assumes that ASCII characters can be
287     mapped to wide characters with a simple cast.  */
288  if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
289    {
290      while (byte_idx < end_idx)
291	{
292	  wchar_t wc;
293
294	  if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
295	      && mbsinit (&pstr->cur_state))
296	    {
297	      /* In case of a singlebyte character.  */
298	      pstr->mbs[byte_idx]
299		= toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
300	      /* The next step uses the assumption that wchar_t is encoded
301		 ASCII-safe: all ASCII values can be converted like this.  */
302	      pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
303	      ++byte_idx;
304	      continue;
305	    }
306
307	  remain_len = end_idx - byte_idx;
308	  prev_st = pstr->cur_state;
309	  mbclen = __mbrtowc (&wc,
310			      ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
311			       + byte_idx), remain_len, &pstr->cur_state);
312	  if (BE (mbclen + 2 > 2, 1))
313	    {
314	      wchar_t wcu = wc;
315	      if (iswlower (wc))
316		{
317		  size_t mbcdlen;
318
319		  wcu = towupper (wc);
320		  mbcdlen = wcrtomb (buf, wcu, &prev_st);
321		  if (BE (mbclen == mbcdlen, 1))
322		    memcpy (pstr->mbs + byte_idx, buf, mbclen);
323		  else
324		    {
325		      src_idx = byte_idx;
326		      goto offsets_needed;
327		    }
328		}
329	      else
330		memcpy (pstr->mbs + byte_idx,
331			pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
332	      pstr->wcs[byte_idx++] = wcu;
333	      /* Write paddings.  */
334	      for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
335		pstr->wcs[byte_idx++] = WEOF;
336	    }
337	  else if (mbclen == (size_t) -1 || mbclen == 0
338		   || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
339	    {
340	      /* It is an invalid character, an incomplete character
341		 at the end of the string, or '\0'.  Just use the byte.  */
342	      int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
343	      pstr->mbs[byte_idx] = ch;
344	      /* And also cast it to wide char.  */
345	      pstr->wcs[byte_idx++] = (wchar_t) ch;
346	      if (BE (mbclen == (size_t) -1, 0))
347		pstr->cur_state = prev_st;
348	    }
349	  else
350	    {
351	      /* The buffer doesn't have enough space, finish to build.  */
352	      pstr->cur_state = prev_st;
353	      break;
354	    }
355	}
356      pstr->valid_len = byte_idx;
357      pstr->valid_raw_len = byte_idx;
358      return REG_NOERROR;
359    }
360  else
361    for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
362      {
363	wchar_t wc;
364	const char *p;
365      offsets_needed:
366	remain_len = end_idx - byte_idx;
367	prev_st = pstr->cur_state;
368	if (BE (pstr->trans != NULL, 0))
369	  {
370	    int i, ch;
371
372	    for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
373	      {
374		ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
375		buf[i] = pstr->trans[ch];
376	      }
377	    p = (const char *) buf;
378	  }
379	else
380	  p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
381	mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
382	if (BE (mbclen + 2 > 2, 1))
383	  {
384	    wchar_t wcu = wc;
385	    if (iswlower (wc))
386	      {
387		size_t mbcdlen;
388
389		wcu = towupper (wc);
390		mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
391		if (BE (mbclen == mbcdlen, 1))
392		  memcpy (pstr->mbs + byte_idx, buf, mbclen);
393		else if (mbcdlen != (size_t) -1)
394		  {
395		    size_t i;
396
397		    if (byte_idx + mbcdlen > pstr->bufs_len)
398		      {
399			pstr->cur_state = prev_st;
400			break;
401		      }
402
403		    if (pstr->offsets == NULL)
404		      {
405			pstr->offsets = re_malloc (int, pstr->bufs_len);
406
407			if (pstr->offsets == NULL)
408			  return REG_ESPACE;
409		      }
410		    if (!pstr->offsets_needed)
411		      {
412			for (i = 0; i < (size_t) byte_idx; ++i)
413			  pstr->offsets[i] = i;
414			pstr->offsets_needed = 1;
415		      }
416
417		    memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
418		    pstr->wcs[byte_idx] = wcu;
419		    pstr->offsets[byte_idx] = src_idx;
420		    for (i = 1; i < mbcdlen; ++i)
421		      {
422			pstr->offsets[byte_idx + i]
423			  = src_idx + (i < mbclen ? i : mbclen - 1);
424			pstr->wcs[byte_idx + i] = WEOF;
425		      }
426		    pstr->len += mbcdlen - mbclen;
427		    if (pstr->raw_stop > src_idx)
428		      pstr->stop += mbcdlen - mbclen;
429		    end_idx = (pstr->bufs_len > pstr->len)
430			      ? pstr->len : pstr->bufs_len;
431		    byte_idx += mbcdlen;
432		    src_idx += mbclen;
433		    continue;
434		  }
435		else
436		  memcpy (pstr->mbs + byte_idx, p, mbclen);
437	      }
438	    else
439	      memcpy (pstr->mbs + byte_idx, p, mbclen);
440
441	    if (BE (pstr->offsets_needed != 0, 0))
442	      {
443		size_t i;
444		for (i = 0; i < mbclen; ++i)
445		  pstr->offsets[byte_idx + i] = src_idx + i;
446	      }
447	    src_idx += mbclen;
448
449	    pstr->wcs[byte_idx++] = wcu;
450	    /* Write paddings.  */
451	    for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
452	      pstr->wcs[byte_idx++] = WEOF;
453	  }
454	else if (mbclen == (size_t) -1 || mbclen == 0
455		 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
456	  {
457	    /* It is an invalid character or '\0'.  Just use the byte.  */
458	    int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
459
460	    if (BE (pstr->trans != NULL, 0))
461	      ch = pstr->trans [ch];
462	    pstr->mbs[byte_idx] = ch;
463
464	    if (BE (pstr->offsets_needed != 0, 0))
465	      pstr->offsets[byte_idx] = src_idx;
466	    ++src_idx;
467
468	    /* And also cast it to wide char.  */
469	    pstr->wcs[byte_idx++] = (wchar_t) ch;
470	    if (BE (mbclen == (size_t) -1, 0))
471	      pstr->cur_state = prev_st;
472	  }
473	else
474	  {
475	    /* The buffer doesn't have enough space, finish to build.  */
476	    pstr->cur_state = prev_st;
477	    break;
478	  }
479      }
480  pstr->valid_len = byte_idx;
481  pstr->valid_raw_len = src_idx;
482  return REG_NOERROR;
483}
484
485/* Skip characters until the index becomes greater than NEW_RAW_IDX.
486   Return the index.  */
487
488static int
489internal_function
490re_string_skip_chars (re_string_t *pstr, int new_raw_idx, wint_t *last_wc)
491{
492  mbstate_t prev_st;
493  int rawbuf_idx;
494  size_t mbclen;
495  wint_t wc = WEOF;
496
497  /* Skip the characters which are not necessary to check.  */
498  for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
499       rawbuf_idx < new_raw_idx;)
500    {
501      wchar_t wc2;
502      int remain_len = pstr->raw_len - rawbuf_idx;
503      prev_st = pstr->cur_state;
504      mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
505			  remain_len, &pstr->cur_state);
506      if (BE ((ssize_t) mbclen <= 0, 0))
507	{
508	  /* We treat these cases as a single byte character.  */
509	  if (mbclen == 0 || remain_len == 0)
510	    wc = L'\0';
511	  else
512	    wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
513	  mbclen = 1;
514	  pstr->cur_state = prev_st;
515	}
516      else
517	wc = (wint_t) wc2;
518      /* Then proceed the next character.  */
519      rawbuf_idx += mbclen;
520    }
521  *last_wc = wc;
522  return rawbuf_idx;
523}
524#endif /* RE_ENABLE_I18N  */
525
526/* Build the buffer PSTR->MBS, and apply the translation if we need.
527   This function is used in case of REG_ICASE.  */
528
529static void
530internal_function
531build_upper_buffer (re_string_t *pstr)
532{
533  int char_idx, end_idx;
534  end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
535
536  for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
537    {
538      int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
539      if (BE (pstr->trans != NULL, 0))
540	ch = pstr->trans[ch];
541      if (islower (ch))
542	pstr->mbs[char_idx] = toupper (ch);
543      else
544	pstr->mbs[char_idx] = ch;
545    }
546  pstr->valid_len = char_idx;
547  pstr->valid_raw_len = char_idx;
548}
549
550/* Apply TRANS to the buffer in PSTR.  */
551
552static void
553internal_function
554re_string_translate_buffer (re_string_t *pstr)
555{
556  int buf_idx, end_idx;
557  end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
558
559  for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
560    {
561      int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
562      pstr->mbs[buf_idx] = pstr->trans[ch];
563    }
564
565  pstr->valid_len = buf_idx;
566  pstr->valid_raw_len = buf_idx;
567}
568
569/* This function re-construct the buffers.
570   Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
571   convert to upper case in case of REG_ICASE, apply translation.  */
572
573static reg_errcode_t
574internal_function __attribute_warn_unused_result__
575re_string_reconstruct (re_string_t *pstr, int idx, int eflags)
576{
577  int offset = idx - pstr->raw_mbs_idx;
578  if (BE (offset < 0, 0))
579    {
580      /* Reset buffer.  */
581#ifdef RE_ENABLE_I18N
582      if (pstr->mb_cur_max > 1)
583	memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
584#endif /* RE_ENABLE_I18N */
585      pstr->len = pstr->raw_len;
586      pstr->stop = pstr->raw_stop;
587      pstr->valid_len = 0;
588      pstr->raw_mbs_idx = 0;
589      pstr->valid_raw_len = 0;
590      pstr->offsets_needed = 0;
591      pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
592			   : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
593      if (!pstr->mbs_allocated)
594	pstr->mbs = (unsigned char *) pstr->raw_mbs;
595      offset = idx;
596    }
597
598  if (BE (offset != 0, 1))
599    {
600      /* Should the already checked characters be kept?  */
601      if (BE (offset < pstr->valid_raw_len, 1))
602	{
603	  /* Yes, move them to the front of the buffer.  */
604#ifdef RE_ENABLE_I18N
605	  if (BE (pstr->offsets_needed, 0))
606	    {
607	      int low = 0, high = pstr->valid_len, mid;
608	      do
609		{
610		  mid = (high + low) / 2;
611		  if (pstr->offsets[mid] > offset)
612		    high = mid;
613		  else if (pstr->offsets[mid] < offset)
614		    low = mid + 1;
615		  else
616		    break;
617		}
618	      while (low < high);
619	      if (pstr->offsets[mid] < offset)
620		++mid;
621	      pstr->tip_context = re_string_context_at (pstr, mid - 1,
622							eflags);
623	      /* This can be quite complicated, so handle specially
624		 only the common and easy case where the character with
625		 different length representation of lower and upper
626		 case is present at or after offset.  */
627	      if (pstr->valid_len > offset
628		  && mid == offset && pstr->offsets[mid] == offset)
629		{
630		  memmove (pstr->wcs, pstr->wcs + offset,
631			   (pstr->valid_len - offset) * sizeof (wint_t));
632		  memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
633		  pstr->valid_len -= offset;
634		  pstr->valid_raw_len -= offset;
635		  for (low = 0; low < pstr->valid_len; low++)
636		    pstr->offsets[low] = pstr->offsets[low + offset] - offset;
637		}
638	      else
639		{
640		  /* Otherwise, just find out how long the partial multibyte
641		     character at offset is and fill it with WEOF/255.  */
642		  pstr->len = pstr->raw_len - idx + offset;
643		  pstr->stop = pstr->raw_stop - idx + offset;
644		  pstr->offsets_needed = 0;
645		  while (mid > 0 && pstr->offsets[mid - 1] == offset)
646		    --mid;
647		  while (mid < pstr->valid_len)
648		    if (pstr->wcs[mid] != WEOF)
649		      break;
650		    else
651		      ++mid;
652		  if (mid == pstr->valid_len)
653		    pstr->valid_len = 0;
654		  else
655		    {
656		      pstr->valid_len = pstr->offsets[mid] - offset;
657		      if (pstr->valid_len)
658			{
659			  for (low = 0; low < pstr->valid_len; ++low)
660			    pstr->wcs[low] = WEOF;
661			  memset (pstr->mbs, 255, pstr->valid_len);
662			}
663		    }
664		  pstr->valid_raw_len = pstr->valid_len;
665		}
666	    }
667	  else
668#endif
669	    {
670	      pstr->tip_context = re_string_context_at (pstr, offset - 1,
671							eflags);
672#ifdef RE_ENABLE_I18N
673	      if (pstr->mb_cur_max > 1)
674		memmove (pstr->wcs, pstr->wcs + offset,
675			 (pstr->valid_len - offset) * sizeof (wint_t));
676#endif /* RE_ENABLE_I18N */
677	      if (BE (pstr->mbs_allocated, 0))
678		memmove (pstr->mbs, pstr->mbs + offset,
679			 pstr->valid_len - offset);
680	      pstr->valid_len -= offset;
681	      pstr->valid_raw_len -= offset;
682#if DEBUG
683	      assert (pstr->valid_len > 0);
684#endif
685	    }
686	}
687      else
688	{
689	  /* No, skip all characters until IDX.  */
690	  int prev_valid_len = pstr->valid_len;
691
692#ifdef RE_ENABLE_I18N
693	  if (BE (pstr->offsets_needed, 0))
694	    {
695	      pstr->len = pstr->raw_len - idx + offset;
696	      pstr->stop = pstr->raw_stop - idx + offset;
697	      pstr->offsets_needed = 0;
698	    }
699#endif
700	  pstr->valid_len = 0;
701#ifdef RE_ENABLE_I18N
702	  if (pstr->mb_cur_max > 1)
703	    {
704	      int wcs_idx;
705	      wint_t wc = WEOF;
706
707	      if (pstr->is_utf8)
708		{
709		  const unsigned char *raw, *p, *end;
710
711		  /* Special case UTF-8.  Multi-byte chars start with any
712		     byte other than 0x80 - 0xbf.  */
713		  raw = pstr->raw_mbs + pstr->raw_mbs_idx;
714		  end = raw + (offset - pstr->mb_cur_max);
715		  if (end < pstr->raw_mbs)
716		    end = pstr->raw_mbs;
717		  p = raw + offset - 1;
718#ifdef _LIBC
719		  /* We know the wchar_t encoding is UCS4, so for the simple
720		     case, ASCII characters, skip the conversion step.  */
721		  if (isascii (*p) && BE (pstr->trans == NULL, 1))
722		    {
723		      memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
724		      /* pstr->valid_len = 0; */
725		      wc = (wchar_t) *p;
726		    }
727		  else
728#endif
729		    for (; p >= end; --p)
730		      if ((*p & 0xc0) != 0x80)
731			{
732			  mbstate_t cur_state;
733			  wchar_t wc2;
734			  int mlen = raw + pstr->len - p;
735			  unsigned char buf[6];
736			  size_t mbclen;
737
738			  const unsigned char *pp = p;
739			  if (BE (pstr->trans != NULL, 0))
740			    {
741			      int i = mlen < 6 ? mlen : 6;
742			      while (--i >= 0)
743				buf[i] = pstr->trans[p[i]];
744			      pp = buf;
745			    }
746			  /* XXX Don't use mbrtowc, we know which conversion
747			     to use (UTF-8 -> UCS4).  */
748			  memset (&cur_state, 0, sizeof (cur_state));
749			  mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
750					      &cur_state);
751			  if (raw + offset - p <= mbclen
752			      && mbclen < (size_t) -2)
753			    {
754			      memset (&pstr->cur_state, '\0',
755				      sizeof (mbstate_t));
756			      pstr->valid_len = mbclen - (raw + offset - p);
757			      wc = wc2;
758			    }
759			  break;
760			}
761		}
762
763	      if (wc == WEOF)
764		pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
765	      if (wc == WEOF)
766		pstr->tip_context
767		  = re_string_context_at (pstr, prev_valid_len - 1, eflags);
768	      else
769		pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
770				      && IS_WIDE_WORD_CHAR (wc))
771				     ? CONTEXT_WORD
772				     : ((IS_WIDE_NEWLINE (wc)
773					 && pstr->newline_anchor)
774					? CONTEXT_NEWLINE : 0));
775	      if (BE (pstr->valid_len, 0))
776		{
777		  for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
778		    pstr->wcs[wcs_idx] = WEOF;
779		  if (pstr->mbs_allocated)
780		    memset (pstr->mbs, 255, pstr->valid_len);
781		}
782	      pstr->valid_raw_len = pstr->valid_len;
783	    }
784	  else
785#endif /* RE_ENABLE_I18N */
786	    {
787	      int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
788	      pstr->valid_raw_len = 0;
789	      if (pstr->trans)
790		c = pstr->trans[c];
791	      pstr->tip_context = (bitset_contain (pstr->word_char, c)
792				   ? CONTEXT_WORD
793				   : ((IS_NEWLINE (c) && pstr->newline_anchor)
794				      ? CONTEXT_NEWLINE : 0));
795	    }
796	}
797      if (!BE (pstr->mbs_allocated, 0))
798	pstr->mbs += offset;
799    }
800  pstr->raw_mbs_idx = idx;
801  pstr->len -= offset;
802  pstr->stop -= offset;
803
804  /* Then build the buffers.  */
805#ifdef RE_ENABLE_I18N
806  if (pstr->mb_cur_max > 1)
807    {
808      if (pstr->icase)
809	{
810	  reg_errcode_t ret = build_wcs_upper_buffer (pstr);
811	  if (BE (ret != REG_NOERROR, 0))
812	    return ret;
813	}
814      else
815	build_wcs_buffer (pstr);
816    }
817  else
818#endif /* RE_ENABLE_I18N */
819    if (BE (pstr->mbs_allocated, 0))
820      {
821	if (pstr->icase)
822	  build_upper_buffer (pstr);
823	else if (pstr->trans != NULL)
824	  re_string_translate_buffer (pstr);
825      }
826    else
827      pstr->valid_len = pstr->len;
828
829  pstr->cur_idx = 0;
830  return REG_NOERROR;
831}
832
833static unsigned char
834internal_function __attribute ((pure))
835re_string_peek_byte_case (const re_string_t *pstr, int idx)
836{
837  int ch, off;
838
839  /* Handle the common (easiest) cases first.  */
840  if (BE (!pstr->mbs_allocated, 1))
841    return re_string_peek_byte (pstr, idx);
842
843#ifdef RE_ENABLE_I18N
844  if (pstr->mb_cur_max > 1
845      && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
846    return re_string_peek_byte (pstr, idx);
847#endif
848
849  off = pstr->cur_idx + idx;
850#ifdef RE_ENABLE_I18N
851  if (pstr->offsets_needed)
852    off = pstr->offsets[off];
853#endif
854
855  ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
856
857#ifdef RE_ENABLE_I18N
858  /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
859     this function returns CAPITAL LETTER I instead of first byte of
860     DOTLESS SMALL LETTER I.  The latter would confuse the parser,
861     since peek_byte_case doesn't advance cur_idx in any way.  */
862  if (pstr->offsets_needed && !isascii (ch))
863    return re_string_peek_byte (pstr, idx);
864#endif
865
866  return ch;
867}
868
869static unsigned char
870internal_function
871re_string_fetch_byte_case (re_string_t *pstr)
872{
873  if (BE (!pstr->mbs_allocated, 1))
874    return re_string_fetch_byte (pstr);
875
876#ifdef RE_ENABLE_I18N
877  if (pstr->offsets_needed)
878    {
879      int off, ch;
880
881      /* For tr_TR.UTF-8 [[:islower:]] there is
882	 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
883	 in that case the whole multi-byte character and return
884	 the original letter.  On the other side, with
885	 [[: DOTLESS SMALL LETTER I return [[:I, as doing
886	 anything else would complicate things too much.  */
887
888      if (!re_string_first_byte (pstr, pstr->cur_idx))
889	return re_string_fetch_byte (pstr);
890
891      off = pstr->offsets[pstr->cur_idx];
892      ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
893
894      if (! isascii (ch))
895	return re_string_fetch_byte (pstr);
896
897      re_string_skip_bytes (pstr,
898			    re_string_char_size_at (pstr, pstr->cur_idx));
899      return ch;
900    }
901#endif
902
903  return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
904}
905
906static void
907internal_function
908re_string_destruct (re_string_t *pstr)
909{
910#ifdef RE_ENABLE_I18N
911  re_free (pstr->wcs);
912  re_free (pstr->offsets);
913#endif /* RE_ENABLE_I18N  */
914  if (pstr->mbs_allocated)
915    re_free (pstr->mbs);
916}
917
918/* Return the context at IDX in INPUT.  */
919
920static unsigned int
921internal_function
922re_string_context_at (const re_string_t *input, int idx, int eflags)
923{
924  int c;
925  if (BE (idx < 0, 0))
926    /* In this case, we use the value stored in input->tip_context,
927       since we can't know the character in input->mbs[-1] here.  */
928    return input->tip_context;
929  if (BE (idx == input->len, 0))
930    return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
931	    : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
932#ifdef RE_ENABLE_I18N
933  if (input->mb_cur_max > 1)
934    {
935      wint_t wc;
936      int wc_idx = idx;
937      while(input->wcs[wc_idx] == WEOF)
938	{
939#ifdef DEBUG
940	  /* It must not happen.  */
941	  assert (wc_idx >= 0);
942#endif
943	  --wc_idx;
944	  if (wc_idx < 0)
945	    return input->tip_context;
946	}
947      wc = input->wcs[wc_idx];
948      if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
949	return CONTEXT_WORD;
950      return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
951	      ? CONTEXT_NEWLINE : 0);
952    }
953  else
954#endif
955    {
956      c = re_string_byte_at (input, idx);
957      if (bitset_contain (input->word_char, c))
958	return CONTEXT_WORD;
959      return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
960    }
961}
962
963/* Functions for set operation.  */
964
965static reg_errcode_t
966internal_function __attribute_warn_unused_result__
967re_node_set_alloc (re_node_set *set, int size)
968{
969  set->alloc = size;
970  set->nelem = 0;
971  set->elems = re_malloc (int, size);
972  if (BE (set->elems == NULL, 0))
973    return REG_ESPACE;
974  return REG_NOERROR;
975}
976
977static reg_errcode_t
978internal_function __attribute_warn_unused_result__
979re_node_set_init_1 (re_node_set *set, int elem)
980{
981  set->alloc = 1;
982  set->nelem = 1;
983  set->elems = re_malloc (int, 1);
984  if (BE (set->elems == NULL, 0))
985    {
986      set->alloc = set->nelem = 0;
987      return REG_ESPACE;
988    }
989  set->elems[0] = elem;
990  return REG_NOERROR;
991}
992
993static reg_errcode_t
994internal_function __attribute_warn_unused_result__
995re_node_set_init_2 (re_node_set *set, int elem1, int elem2)
996{
997  set->alloc = 2;
998  set->elems = re_malloc (int, 2);
999  if (BE (set->elems == NULL, 0))
1000    return REG_ESPACE;
1001  if (elem1 == elem2)
1002    {
1003      set->nelem = 1;
1004      set->elems[0] = elem1;
1005    }
1006  else
1007    {
1008      set->nelem = 2;
1009      if (elem1 < elem2)
1010	{
1011	  set->elems[0] = elem1;
1012	  set->elems[1] = elem2;
1013	}
1014      else
1015	{
1016	  set->elems[0] = elem2;
1017	  set->elems[1] = elem1;
1018	}
1019    }
1020  return REG_NOERROR;
1021}
1022
1023static reg_errcode_t
1024internal_function __attribute_warn_unused_result__
1025re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1026{
1027  dest->nelem = src->nelem;
1028  if (src->nelem > 0)
1029    {
1030      dest->alloc = dest->nelem;
1031      dest->elems = re_malloc (int, dest->alloc);
1032      if (BE (dest->elems == NULL, 0))
1033	{
1034	  dest->alloc = dest->nelem = 0;
1035	  return REG_ESPACE;
1036	}
1037      memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1038    }
1039  else
1040    re_node_set_init_empty (dest);
1041  return REG_NOERROR;
1042}
1043
1044/* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1045   DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1046   Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
1047
1048static reg_errcode_t
1049internal_function __attribute_warn_unused_result__
1050re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1051			   const re_node_set *src2)
1052{
1053  int i1, i2, is, id, delta, sbase;
1054  if (src1->nelem == 0 || src2->nelem == 0)
1055    return REG_NOERROR;
1056
1057  /* We need dest->nelem + 2 * elems_in_intersection; this is a
1058     conservative estimate.  */
1059  if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1060    {
1061      int new_alloc = src1->nelem + src2->nelem + dest->alloc;
1062      int *new_elems = re_realloc (dest->elems, int, new_alloc);
1063      if (BE (new_elems == NULL, 0))
1064	return REG_ESPACE;
1065      dest->elems = new_elems;
1066      dest->alloc = new_alloc;
1067    }
1068
1069  /* Find the items in the intersection of SRC1 and SRC2, and copy
1070     into the top of DEST those that are not already in DEST itself.  */
1071  sbase = dest->nelem + src1->nelem + src2->nelem;
1072  i1 = src1->nelem - 1;
1073  i2 = src2->nelem - 1;
1074  id = dest->nelem - 1;
1075  for (;;)
1076    {
1077      if (src1->elems[i1] == src2->elems[i2])
1078	{
1079	  /* Try to find the item in DEST.  Maybe we could binary search?  */
1080	  while (id >= 0 && dest->elems[id] > src1->elems[i1])
1081	    --id;
1082
1083	  if (id < 0 || dest->elems[id] != src1->elems[i1])
1084	    dest->elems[--sbase] = src1->elems[i1];
1085
1086	  if (--i1 < 0 || --i2 < 0)
1087	    break;
1088	}
1089
1090      /* Lower the highest of the two items.  */
1091      else if (src1->elems[i1] < src2->elems[i2])
1092	{
1093	  if (--i2 < 0)
1094	    break;
1095	}
1096      else
1097	{
1098	  if (--i1 < 0)
1099	    break;
1100	}
1101    }
1102
1103  id = dest->nelem - 1;
1104  is = dest->nelem + src1->nelem + src2->nelem - 1;
1105  delta = is - sbase + 1;
1106
1107  /* Now copy.  When DELTA becomes zero, the remaining
1108     DEST elements are already in place; this is more or
1109     less the same loop that is in re_node_set_merge.  */
1110  dest->nelem += delta;
1111  if (delta > 0 && id >= 0)
1112    for (;;)
1113      {
1114	if (dest->elems[is] > dest->elems[id])
1115	  {
1116	    /* Copy from the top.  */
1117	    dest->elems[id + delta--] = dest->elems[is--];
1118	    if (delta == 0)
1119	      break;
1120	  }
1121	else
1122	  {
1123	    /* Slide from the bottom.  */
1124	    dest->elems[id + delta] = dest->elems[id];
1125	    if (--id < 0)
1126	      break;
1127	  }
1128      }
1129
1130  /* Copy remaining SRC elements.  */
1131  memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1132
1133  return REG_NOERROR;
1134}
1135
1136/* Calculate the union set of the sets SRC1 and SRC2. And store it to
1137   DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1138
1139static reg_errcode_t
1140internal_function __attribute_warn_unused_result__
1141re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1142			const re_node_set *src2)
1143{
1144  int i1, i2, id;
1145  if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1146    {
1147      dest->alloc = src1->nelem + src2->nelem;
1148      dest->elems = re_malloc (int, dest->alloc);
1149      if (BE (dest->elems == NULL, 0))
1150	return REG_ESPACE;
1151    }
1152  else
1153    {
1154      if (src1 != NULL && src1->nelem > 0)
1155	return re_node_set_init_copy (dest, src1);
1156      else if (src2 != NULL && src2->nelem > 0)
1157	return re_node_set_init_copy (dest, src2);
1158      else
1159	re_node_set_init_empty (dest);
1160      return REG_NOERROR;
1161    }
1162  for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1163    {
1164      if (src1->elems[i1] > src2->elems[i2])
1165	{
1166	  dest->elems[id++] = src2->elems[i2++];
1167	  continue;
1168	}
1169      if (src1->elems[i1] == src2->elems[i2])
1170	++i2;
1171      dest->elems[id++] = src1->elems[i1++];
1172    }
1173  if (i1 < src1->nelem)
1174    {
1175      memcpy (dest->elems + id, src1->elems + i1,
1176	     (src1->nelem - i1) * sizeof (int));
1177      id += src1->nelem - i1;
1178    }
1179  else if (i2 < src2->nelem)
1180    {
1181      memcpy (dest->elems + id, src2->elems + i2,
1182	     (src2->nelem - i2) * sizeof (int));
1183      id += src2->nelem - i2;
1184    }
1185  dest->nelem = id;
1186  return REG_NOERROR;
1187}
1188
1189/* Calculate the union set of the sets DEST and SRC. And store it to
1190   DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1191
1192static reg_errcode_t
1193internal_function __attribute_warn_unused_result__
1194re_node_set_merge (re_node_set *dest, const re_node_set *src)
1195{
1196  int is, id, sbase, delta;
1197  if (src == NULL || src->nelem == 0)
1198    return REG_NOERROR;
1199  if (dest->alloc < 2 * src->nelem + dest->nelem)
1200    {
1201      int new_alloc = 2 * (src->nelem + dest->alloc);
1202      int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1203      if (BE (new_buffer == NULL, 0))
1204	return REG_ESPACE;
1205      dest->elems = new_buffer;
1206      dest->alloc = new_alloc;
1207    }
1208
1209  if (BE (dest->nelem == 0, 0))
1210    {
1211      dest->nelem = src->nelem;
1212      memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1213      return REG_NOERROR;
1214    }
1215
1216  /* Copy into the top of DEST the items of SRC that are not
1217     found in DEST.  Maybe we could binary search in DEST?  */
1218  for (sbase = dest->nelem + 2 * src->nelem,
1219       is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1220    {
1221      if (dest->elems[id] == src->elems[is])
1222	is--, id--;
1223      else if (dest->elems[id] < src->elems[is])
1224	dest->elems[--sbase] = src->elems[is--];
1225      else /* if (dest->elems[id] > src->elems[is]) */
1226	--id;
1227    }
1228
1229  if (is >= 0)
1230    {
1231      /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1232      sbase -= is + 1;
1233      memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
1234    }
1235
1236  id = dest->nelem - 1;
1237  is = dest->nelem + 2 * src->nelem - 1;
1238  delta = is - sbase + 1;
1239  if (delta == 0)
1240    return REG_NOERROR;
1241
1242  /* Now copy.  When DELTA becomes zero, the remaining
1243     DEST elements are already in place.  */
1244  dest->nelem += delta;
1245  for (;;)
1246    {
1247      if (dest->elems[is] > dest->elems[id])
1248	{
1249	  /* Copy from the top.  */
1250	  dest->elems[id + delta--] = dest->elems[is--];
1251	  if (delta == 0)
1252	    break;
1253	}
1254      else
1255	{
1256	  /* Slide from the bottom.  */
1257	  dest->elems[id + delta] = dest->elems[id];
1258	  if (--id < 0)
1259	    {
1260	      /* Copy remaining SRC elements.  */
1261	      memcpy (dest->elems, dest->elems + sbase,
1262		      delta * sizeof (int));
1263	      break;
1264	    }
1265	}
1266    }
1267
1268  return REG_NOERROR;
1269}
1270
1271/* Insert the new element ELEM to the re_node_set* SET.
1272   SET should not already have ELEM.
1273   return -1 if an error is occured, return 1 otherwise.  */
1274
1275static int
1276internal_function __attribute_warn_unused_result__
1277re_node_set_insert (re_node_set *set, int elem)
1278{
1279  int idx;
1280  /* In case the set is empty.  */
1281  if (set->alloc == 0)
1282    {
1283      if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1284	return 1;
1285      else
1286	return -1;
1287    }
1288
1289  if (BE (set->nelem, 0) == 0)
1290    {
1291      /* We already guaranteed above that set->alloc != 0.  */
1292      set->elems[0] = elem;
1293      ++set->nelem;
1294      return 1;
1295    }
1296
1297  /* Realloc if we need.  */
1298  if (set->alloc == set->nelem)
1299    {
1300      int *new_elems;
1301      set->alloc = set->alloc * 2;
1302      new_elems = re_realloc (set->elems, int, set->alloc);
1303      if (BE (new_elems == NULL, 0))
1304	return -1;
1305      set->elems = new_elems;
1306    }
1307
1308  /* Move the elements which follows the new element.  Test the
1309     first element separately to skip a check in the inner loop.  */
1310  if (elem < set->elems[0])
1311    {
1312      idx = 0;
1313      for (idx = set->nelem; idx > 0; idx--)
1314	set->elems[idx] = set->elems[idx - 1];
1315    }
1316  else
1317    {
1318      for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1319	set->elems[idx] = set->elems[idx - 1];
1320    }
1321
1322  /* Insert the new element.  */
1323  set->elems[idx] = elem;
1324  ++set->nelem;
1325  return 1;
1326}
1327
1328/* Insert the new element ELEM to the re_node_set* SET.
1329   SET should not already have any element greater than or equal to ELEM.
1330   Return -1 if an error is occured, return 1 otherwise.  */
1331
1332static int
1333internal_function __attribute_warn_unused_result__
1334re_node_set_insert_last (re_node_set *set, int elem)
1335{
1336  /* Realloc if we need.  */
1337  if (set->alloc == set->nelem)
1338    {
1339      int *new_elems;
1340      set->alloc = (set->alloc + 1) * 2;
1341      new_elems = re_realloc (set->elems, int, set->alloc);
1342      if (BE (new_elems == NULL, 0))
1343	return -1;
1344      set->elems = new_elems;
1345    }
1346
1347  /* Insert the new element.  */
1348  set->elems[set->nelem++] = elem;
1349  return 1;
1350}
1351
1352/* Compare two node sets SET1 and SET2.
1353   return 1 if SET1 and SET2 are equivalent, return 0 otherwise.  */
1354
1355static int
1356internal_function __attribute ((pure))
1357re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1358{
1359  int i;
1360  if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1361    return 0;
1362  for (i = set1->nelem ; --i >= 0 ; )
1363    if (set1->elems[i] != set2->elems[i])
1364      return 0;
1365  return 1;
1366}
1367
1368/* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1369
1370static int
1371internal_function __attribute ((pure))
1372re_node_set_contains (const re_node_set *set, int elem)
1373{
1374  unsigned int idx, right, mid;
1375  if (set->nelem <= 0)
1376    return 0;
1377
1378  /* Binary search the element.  */
1379  idx = 0;
1380  right = set->nelem - 1;
1381  while (idx < right)
1382    {
1383      mid = (idx + right) / 2;
1384      if (set->elems[mid] < elem)
1385	idx = mid + 1;
1386      else
1387	right = mid;
1388    }
1389  return set->elems[idx] == elem ? idx + 1 : 0;
1390}
1391
1392static void
1393internal_function
1394re_node_set_remove_at (re_node_set *set, int idx)
1395{
1396  if (idx < 0 || idx >= set->nelem)
1397    return;
1398  --set->nelem;
1399  for (; idx < set->nelem; idx++)
1400    set->elems[idx] = set->elems[idx + 1];
1401}
1402
1403
1404/* Add the token TOKEN to dfa->nodes, and return the index of the token.
1405   Or return -1, if an error will be occured.  */
1406
1407static int
1408internal_function
1409re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1410{
1411  int type = token.type;
1412  if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1413    {
1414      size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1415      int *new_nexts, *new_indices;
1416      re_node_set *new_edests, *new_eclosures;
1417      re_token_t *new_nodes;
1418
1419      /* Avoid overflows in realloc.  */
1420      const size_t max_object_size = MAX (sizeof (re_token_t),
1421					  MAX (sizeof (re_node_set),
1422					       sizeof (int)));
1423      if (BE (SIZE_MAX / max_object_size < new_nodes_alloc, 0))
1424	return -1;
1425
1426      new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1427      if (BE (new_nodes == NULL, 0))
1428	return -1;
1429      dfa->nodes = new_nodes;
1430      new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1431      new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1432      new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1433      new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1434      if (BE (new_nexts == NULL || new_indices == NULL
1435	      || new_edests == NULL || new_eclosures == NULL, 0))
1436	return -1;
1437      dfa->nexts = new_nexts;
1438      dfa->org_indices = new_indices;
1439      dfa->edests = new_edests;
1440      dfa->eclosures = new_eclosures;
1441      dfa->nodes_alloc = new_nodes_alloc;
1442    }
1443  dfa->nodes[dfa->nodes_len] = token;
1444  dfa->nodes[dfa->nodes_len].constraint = 0;
1445#ifdef RE_ENABLE_I18N
1446  dfa->nodes[dfa->nodes_len].accept_mb =
1447    (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1448#endif
1449  dfa->nexts[dfa->nodes_len] = -1;
1450  re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1451  re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1452  return dfa->nodes_len++;
1453}
1454
1455static inline unsigned int
1456internal_function
1457calc_state_hash (const re_node_set *nodes, unsigned int context)
1458{
1459  unsigned int hash = nodes->nelem + context;
1460  int i;
1461  for (i = 0 ; i < nodes->nelem ; i++)
1462    hash += nodes->elems[i];
1463  return hash;
1464}
1465
1466/* Search for the state whose node_set is equivalent to NODES.
1467   Return the pointer to the state, if we found it in the DFA.
1468   Otherwise create the new one and return it.  In case of an error
1469   return NULL and set the error code in ERR.
1470   Note: - We assume NULL as the invalid state, then it is possible that
1471	   return value is NULL and ERR is REG_NOERROR.
1472	 - We never return non-NULL value in case of any errors, it is for
1473	   optimization.  */
1474
1475static re_dfastate_t *
1476internal_function __attribute_warn_unused_result__
1477re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1478		  const re_node_set *nodes)
1479{
1480  unsigned int hash;
1481  re_dfastate_t *new_state;
1482  struct re_state_table_entry *spot;
1483  int i;
1484  if (BE (nodes->nelem == 0, 0))
1485    {
1486      *err = REG_NOERROR;
1487      return NULL;
1488    }
1489  hash = calc_state_hash (nodes, 0);
1490  spot = dfa->state_table + (hash & dfa->state_hash_mask);
1491
1492  for (i = 0 ; i < spot->num ; i++)
1493    {
1494      re_dfastate_t *state = spot->array[i];
1495      if (hash != state->hash)
1496	continue;
1497      if (re_node_set_compare (&state->nodes, nodes))
1498	return state;
1499    }
1500
1501  /* There are no appropriate state in the dfa, create the new one.  */
1502  new_state = create_ci_newstate (dfa, nodes, hash);
1503  if (BE (new_state == NULL, 0))
1504    *err = REG_ESPACE;
1505
1506  return new_state;
1507}
1508
1509/* Search for the state whose node_set is equivalent to NODES and
1510   whose context is equivalent to CONTEXT.
1511   Return the pointer to the state, if we found it in the DFA.
1512   Otherwise create the new one and return it.  In case of an error
1513   return NULL and set the error code in ERR.
1514   Note: - We assume NULL as the invalid state, then it is possible that
1515	   return value is NULL and ERR is REG_NOERROR.
1516	 - We never return non-NULL value in case of any errors, it is for
1517	   optimization.  */
1518
1519static re_dfastate_t *
1520internal_function __attribute_warn_unused_result__
1521re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1522			  const re_node_set *nodes, unsigned int context)
1523{
1524  unsigned int hash;
1525  re_dfastate_t *new_state;
1526  struct re_state_table_entry *spot;
1527  int i;
1528  if (nodes->nelem == 0)
1529    {
1530      *err = REG_NOERROR;
1531      return NULL;
1532    }
1533  hash = calc_state_hash (nodes, context);
1534  spot = dfa->state_table + (hash & dfa->state_hash_mask);
1535
1536  for (i = 0 ; i < spot->num ; i++)
1537    {
1538      re_dfastate_t *state = spot->array[i];
1539      if (state->hash == hash
1540	  && state->context == context
1541	  && re_node_set_compare (state->entrance_nodes, nodes))
1542	return state;
1543    }
1544  /* There are no appropriate state in `dfa', create the new one.  */
1545  new_state = create_cd_newstate (dfa, nodes, context, hash);
1546  if (BE (new_state == NULL, 0))
1547    *err = REG_ESPACE;
1548
1549  return new_state;
1550}
1551
1552/* Finish initialization of the new state NEWSTATE, and using its hash value
1553   HASH put in the appropriate bucket of DFA's state table.  Return value
1554   indicates the error code if failed.  */
1555
1556static reg_errcode_t
1557__attribute_warn_unused_result__
1558register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1559		unsigned int hash)
1560{
1561  struct re_state_table_entry *spot;
1562  reg_errcode_t err;
1563  int i;
1564
1565  newstate->hash = hash;
1566  err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1567  if (BE (err != REG_NOERROR, 0))
1568    return REG_ESPACE;
1569  for (i = 0; i < newstate->nodes.nelem; i++)
1570    {
1571      int elem = newstate->nodes.elems[i];
1572      if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1573	if (re_node_set_insert_last (&newstate->non_eps_nodes, elem) < 0)
1574	  return REG_ESPACE;
1575    }
1576
1577  spot = dfa->state_table + (hash & dfa->state_hash_mask);
1578  if (BE (spot->alloc <= spot->num, 0))
1579    {
1580      int new_alloc = 2 * spot->num + 2;
1581      re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1582					      new_alloc);
1583      if (BE (new_array == NULL, 0))
1584	return REG_ESPACE;
1585      spot->array = new_array;
1586      spot->alloc = new_alloc;
1587    }
1588  spot->array[spot->num++] = newstate;
1589  return REG_NOERROR;
1590}
1591
1592static void
1593free_state (re_dfastate_t *state)
1594{
1595  re_node_set_free (&state->non_eps_nodes);
1596  re_node_set_free (&state->inveclosure);
1597  if (state->entrance_nodes != &state->nodes)
1598    {
1599      re_node_set_free (state->entrance_nodes);
1600      re_free (state->entrance_nodes);
1601    }
1602  re_node_set_free (&state->nodes);
1603  re_free (state->word_trtable);
1604  re_free (state->trtable);
1605  re_free (state);
1606}
1607
1608/* Create the new state which is independ of contexts.
1609   Return the new state if succeeded, otherwise return NULL.  */
1610
1611static re_dfastate_t *
1612internal_function __attribute_warn_unused_result__
1613create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1614		    unsigned int hash)
1615{
1616  int i;
1617  reg_errcode_t err;
1618  re_dfastate_t *newstate;
1619
1620  newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1621  if (BE (newstate == NULL, 0))
1622    return NULL;
1623  err = re_node_set_init_copy (&newstate->nodes, nodes);
1624  if (BE (err != REG_NOERROR, 0))
1625    {
1626      re_free (newstate);
1627      return NULL;
1628    }
1629
1630  newstate->entrance_nodes = &newstate->nodes;
1631  for (i = 0 ; i < nodes->nelem ; i++)
1632    {
1633      re_token_t *node = dfa->nodes + nodes->elems[i];
1634      re_token_type_t type = node->type;
1635      if (type == CHARACTER && !node->constraint)
1636	continue;
1637#ifdef RE_ENABLE_I18N
1638      newstate->accept_mb |= node->accept_mb;
1639#endif /* RE_ENABLE_I18N */
1640
1641      /* If the state has the halt node, the state is a halt state.  */
1642      if (type == END_OF_RE)
1643	newstate->halt = 1;
1644      else if (type == OP_BACK_REF)
1645	newstate->has_backref = 1;
1646      else if (type == ANCHOR || node->constraint)
1647	newstate->has_constraint = 1;
1648    }
1649  err = register_state (dfa, newstate, hash);
1650  if (BE (err != REG_NOERROR, 0))
1651    {
1652      free_state (newstate);
1653      newstate = NULL;
1654    }
1655  return newstate;
1656}
1657
1658/* Create the new state which is depend on the context CONTEXT.
1659   Return the new state if succeeded, otherwise return NULL.  */
1660
1661static re_dfastate_t *
1662internal_function __attribute_warn_unused_result__
1663create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1664		    unsigned int context, unsigned int hash)
1665{
1666  int i, nctx_nodes = 0;
1667  reg_errcode_t err;
1668  re_dfastate_t *newstate;
1669
1670  newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1671  if (BE (newstate == NULL, 0))
1672    return NULL;
1673  err = re_node_set_init_copy (&newstate->nodes, nodes);
1674  if (BE (err != REG_NOERROR, 0))
1675    {
1676      re_free (newstate);
1677      return NULL;
1678    }
1679
1680  newstate->context = context;
1681  newstate->entrance_nodes = &newstate->nodes;
1682
1683  for (i = 0 ; i < nodes->nelem ; i++)
1684    {
1685      re_token_t *node = dfa->nodes + nodes->elems[i];
1686      re_token_type_t type = node->type;
1687      unsigned int constraint = node->constraint;
1688
1689      if (type == CHARACTER && !constraint)
1690	continue;
1691#ifdef RE_ENABLE_I18N
1692      newstate->accept_mb |= node->accept_mb;
1693#endif /* RE_ENABLE_I18N */
1694
1695      /* If the state has the halt node, the state is a halt state.  */
1696      if (type == END_OF_RE)
1697	newstate->halt = 1;
1698      else if (type == OP_BACK_REF)
1699	newstate->has_backref = 1;
1700
1701      if (constraint)
1702	{
1703	  if (newstate->entrance_nodes == &newstate->nodes)
1704	    {
1705	      newstate->entrance_nodes = re_malloc (re_node_set, 1);
1706	      if (BE (newstate->entrance_nodes == NULL, 0))
1707		{
1708		  free_state (newstate);
1709		  return NULL;
1710		}
1711	      if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1712		  != REG_NOERROR)
1713		return NULL;
1714	      nctx_nodes = 0;
1715	      newstate->has_constraint = 1;
1716	    }
1717
1718	  if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1719	    {
1720	      re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1721	      ++nctx_nodes;
1722	    }
1723	}
1724    }
1725  err = register_state (dfa, newstate, hash);
1726  if (BE (err != REG_NOERROR, 0))
1727    {
1728      free_state (newstate);
1729      newstate = NULL;
1730    }
1731  return  newstate;
1732}
1733