1/* search.c - searching subroutines using dfa, kwset and regex for grep.
2   Copyright 1992, 1998, 2000 Free Software Foundation, Inc.
3
4   This program is free software; you can redistribute it and/or modify
5   it under the terms of the GNU General Public License as published by
6   the Free Software Foundation; either version 2, or (at your option)
7   any later version.
8
9   This program is distributed in the hope that it will be useful,
10   but WITHOUT ANY WARRANTY; without even the implied warranty of
11   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12   GNU General Public License for more details.
13
14   You should have received a copy of the GNU General Public License
15   along with this program; if not, write to the Free Software
16   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
17   02111-1307, USA.  */
18
19/* Written August 1992 by Mike Haertel. */
20
21/* $FreeBSD$ */
22
23#ifndef _GNU_SOURCE
24# define _GNU_SOURCE 1
25#endif
26#ifdef HAVE_CONFIG_H
27# include <config.h>
28#endif
29#include <assert.h>
30#include <sys/types.h>
31#if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC
32/* We can handle multibyte string.  */
33# define MBS_SUPPORT
34# include <wchar.h>
35# include <wctype.h>
36#endif
37
38#include "system.h"
39#include "grep.h"
40#include "regex.h"
41#include "dfa.h"
42#include "kwset.h"
43#include "error.h"
44#include "xalloc.h"
45#ifdef HAVE_LIBPCRE
46# include <pcre.h>
47#endif
48#ifdef HAVE_LANGINFO_CODESET
49# include <langinfo.h>
50#endif
51
52#define NCHAR (UCHAR_MAX + 1)
53
54/* For -w, we also consider _ to be word constituent.  */
55#define WCHAR(C) (ISALNUM(C) || (C) == '_')
56
57/* DFA compiled regexp. */
58static struct dfa dfa;
59
60/* The Regex compiled patterns.  */
61static struct patterns
62{
63  /* Regex compiled regexp. */
64  struct re_pattern_buffer regexbuf;
65  struct re_registers regs; /* This is here on account of a BRAIN-DEAD
66			       Q@#%!# library interface in regex.c.  */
67} patterns0;
68
69struct patterns *patterns;
70size_t pcount;
71
72/* KWset compiled pattern.  For Ecompile and Gcompile, we compile
73   a list of strings, at least one of which is known to occur in
74   any string matching the regexp. */
75static kwset_t kwset;
76
77/* Number of compiled fixed strings known to exactly match the regexp.
78   If kwsexec returns < kwset_exact_matches, then we don't need to
79   call the regexp matcher at all. */
80static int kwset_exact_matches;
81
82/* UTF-8 encoding allows some optimizations that we can't otherwise
83   assume in a multibyte encoding. */
84static int using_utf8;
85
86static void kwsinit PARAMS ((void));
87static void kwsmusts PARAMS ((void));
88static void Gcompile PARAMS ((char const *, size_t));
89static void Ecompile PARAMS ((char const *, size_t));
90static size_t EGexecute PARAMS ((char const *, size_t, size_t *, int ));
91static void Fcompile PARAMS ((char const *, size_t));
92static size_t Fexecute PARAMS ((char const *, size_t, size_t *, int));
93static void Pcompile PARAMS ((char const *, size_t ));
94static size_t Pexecute PARAMS ((char const *, size_t, size_t *, int));
95
96void
97check_utf8 (void)
98{
99#ifdef HAVE_LANGINFO_CODESET
100  if (strcmp (nl_langinfo (CODESET), "UTF-8") == 0)
101    using_utf8 = 1;
102#endif
103}
104
105void
106dfaerror (char const *mesg)
107{
108  error (2, 0, mesg);
109}
110
111static void
112kwsinit (void)
113{
114  static char trans[NCHAR];
115  size_t i;
116
117  if (match_icase)
118    for (i = 0; i < NCHAR; ++i)
119      trans[i] = TOLOWER (i);
120
121  if (!(kwset = kwsalloc (match_icase ? trans : (char *) 0)))
122    error (2, 0, _("memory exhausted"));
123}
124
125/* If the DFA turns out to have some set of fixed strings one of
126   which must occur in the match, then we build a kwset matcher
127   to find those strings, and thus quickly filter out impossible
128   matches. */
129static void
130kwsmusts (void)
131{
132  struct dfamust const *dm;
133  char const *err;
134
135  if (dfa.musts)
136    {
137      kwsinit ();
138      /* First, we compile in the substrings known to be exact
139	 matches.  The kwset matcher will return the index
140	 of the matching string that it chooses. */
141      for (dm = dfa.musts; dm; dm = dm->next)
142	{
143	  if (!dm->exact)
144	    continue;
145	  ++kwset_exact_matches;
146	  if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != 0)
147	    error (2, 0, err);
148	}
149      /* Now, we compile the substrings that will require
150	 the use of the regexp matcher.  */
151      for (dm = dfa.musts; dm; dm = dm->next)
152	{
153	  if (dm->exact)
154	    continue;
155	  if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != 0)
156	    error (2, 0, err);
157	}
158      if ((err = kwsprep (kwset)) != 0)
159	error (2, 0, err);
160    }
161}
162
163static void
164Gcompile (char const *pattern, size_t size)
165{
166  const char *err;
167  char const *sep;
168  size_t total = size;
169  char const *motif = pattern;
170
171  check_utf8 ();
172  re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE | (match_icase ? RE_ICASE : 0));
173  dfasyntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte);
174
175  /* For GNU regex compiler we have to pass the patterns separately to detect
176     errors like "[\nallo\n]\n".  The patterns here are "[", "allo" and "]"
177     GNU regex should have raise a syntax error.  The same for backref, where
178     the backref should have been local to each pattern.  */
179  do
180    {
181      size_t len;
182      sep = memchr (motif, '\n', total);
183      if (sep)
184	{
185	  len = sep - motif;
186	  sep++;
187	  total -= (len + 1);
188	}
189      else
190	{
191	  len = total;
192	  total = 0;
193	}
194
195      patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns));
196      if (patterns == NULL)
197	error (2, errno, _("memory exhausted"));
198
199      patterns[pcount] = patterns0;
200
201      if ((err = re_compile_pattern (motif, len,
202				    &(patterns[pcount].regexbuf))) != 0)
203	error (2, 0, err);
204      pcount++;
205
206      motif = sep;
207    } while (sep && total != 0);
208
209  /* In the match_words and match_lines cases, we use a different pattern
210     for the DFA matcher that will quickly throw out cases that won't work.
211     Then if DFA succeeds we do some hairy stuff using the regex matcher
212     to decide whether the match should really count. */
213  if (match_words || match_lines)
214    {
215      /* In the whole-word case, we use the pattern:
216	 \(^\|[^[:alnum:]_]\)\(userpattern\)\([^[:alnum:]_]|$\).
217	 In the whole-line case, we use the pattern:
218	 ^\(userpattern\)$.  */
219
220      static char const line_beg[] = "^\\(";
221      static char const line_end[] = "\\)$";
222      static char const word_beg[] = "\\(^\\|[^[:alnum:]_]\\)\\(";
223      static char const word_end[] = "\\)\\([^[:alnum:]_]\\|$\\)";
224      char *n = xmalloc (sizeof word_beg - 1 + size + sizeof word_end);
225      size_t i;
226      strcpy (n, match_lines ? line_beg : word_beg);
227      i = strlen (n);
228      memcpy (n + i, pattern, size);
229      i += size;
230      strcpy (n + i, match_lines ? line_end : word_end);
231      i += strlen (n + i);
232      pattern = n;
233      size = i;
234    }
235
236  dfacomp (pattern, size, &dfa, 1);
237  kwsmusts ();
238}
239
240static void
241Ecompile (char const *pattern, size_t size)
242{
243  const char *err;
244  const char *sep;
245  size_t total = size;
246  char const *motif = pattern;
247
248  check_utf8 ();
249  if (strcmp (matcher, "awk") == 0)
250    {
251      re_set_syntax (RE_SYNTAX_AWK | (match_icase ? RE_ICASE : 0));
252      dfasyntax (RE_SYNTAX_AWK, match_icase, eolbyte);
253    }
254  else
255    {
256      re_set_syntax (RE_SYNTAX_POSIX_EGREP | (match_icase ? RE_ICASE : 0));
257      dfasyntax (RE_SYNTAX_POSIX_EGREP, match_icase, eolbyte);
258    }
259
260  /* For GNU regex compiler we have to pass the patterns separately to detect
261     errors like "[\nallo\n]\n".  The patterns here are "[", "allo" and "]"
262     GNU regex should have raise a syntax error.  The same for backref, where
263     the backref should have been local to each pattern.  */
264  do
265    {
266      size_t len;
267      sep = memchr (motif, '\n', total);
268      if (sep)
269	{
270	  len = sep - motif;
271	  sep++;
272	  total -= (len + 1);
273	}
274      else
275	{
276	  len = total;
277	  total = 0;
278	}
279
280      patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns));
281      if (patterns == NULL)
282	error (2, errno, _("memory exhausted"));
283      patterns[pcount] = patterns0;
284
285      if ((err = re_compile_pattern (motif, len,
286				    &(patterns[pcount].regexbuf))) != 0)
287	error (2, 0, err);
288      pcount++;
289
290      motif = sep;
291    } while (sep && total != 0);
292
293  /* In the match_words and match_lines cases, we use a different pattern
294     for the DFA matcher that will quickly throw out cases that won't work.
295     Then if DFA succeeds we do some hairy stuff using the regex matcher
296     to decide whether the match should really count. */
297  if (match_words || match_lines)
298    {
299      /* In the whole-word case, we use the pattern:
300	 (^|[^[:alnum:]_])(userpattern)([^[:alnum:]_]|$).
301	 In the whole-line case, we use the pattern:
302	 ^(userpattern)$.  */
303
304      static char const line_beg[] = "^(";
305      static char const line_end[] = ")$";
306      static char const word_beg[] = "(^|[^[:alnum:]_])(";
307      static char const word_end[] = ")([^[:alnum:]_]|$)";
308      char *n = xmalloc (sizeof word_beg - 1 + size + sizeof word_end);
309      size_t i;
310      strcpy (n, match_lines ? line_beg : word_beg);
311      i = strlen(n);
312      memcpy (n + i, pattern, size);
313      i += size;
314      strcpy (n + i, match_lines ? line_end : word_end);
315      i += strlen (n + i);
316      pattern = n;
317      size = i;
318    }
319
320  dfacomp (pattern, size, &dfa, 1);
321  kwsmusts ();
322}
323
324static size_t
325EGexecute (char const *buf, size_t size, size_t *match_size, int exact)
326{
327  register char const *buflim, *beg, *end;
328  char eol = eolbyte;
329  int backref;
330  ptrdiff_t start, len;
331  struct kwsmatch kwsm;
332  size_t i, ret_val;
333  static int use_dfa;
334  static int use_dfa_checked = 0;
335#ifdef MBS_SUPPORT
336  const char *last_char = NULL;
337  int mb_cur_max = MB_CUR_MAX;
338  mbstate_t mbs;
339  memset (&mbs, '\0', sizeof (mbstate_t));
340#endif /* MBS_SUPPORT */
341
342  if (!use_dfa_checked)
343    {
344      char *grep_use_dfa = getenv ("GREP_USE_DFA");
345      if (!grep_use_dfa)
346	{
347#ifdef MBS_SUPPORT
348	  /* Turn off DFA when processing multibyte input. */
349	  use_dfa = (MB_CUR_MAX == 1);
350#else
351	  use_dfa = 1;
352#endif /* MBS_SUPPORT */
353	}
354      else
355	{
356	  use_dfa = atoi (grep_use_dfa);
357	}
358
359      use_dfa_checked = 1;
360    }
361
362  buflim = buf + size;
363
364  for (beg = end = buf; end < buflim; beg = end)
365    {
366      if (!exact)
367	{
368	  if (kwset)
369	    {
370	      /* Find a possible match using the KWset matcher. */
371#ifdef MBS_SUPPORT
372	      size_t bytes_left = 0;
373#endif /* MBS_SUPPORT */
374	      size_t offset;
375#ifdef MBS_SUPPORT
376	      /* kwsexec doesn't work with match_icase and multibyte input. */
377	      if (match_icase && mb_cur_max > 1)
378		/* Avoid kwset */
379		offset = 0;
380	      else
381#endif /* MBS_SUPPORT */
382	      offset = kwsexec (kwset, beg, buflim - beg, &kwsm);
383	      if (offset == (size_t) -1)
384	        goto failure;
385#ifdef MBS_SUPPORT
386	      if (mb_cur_max > 1 && !using_utf8)
387		{
388		  bytes_left = offset;
389		  while (bytes_left)
390		    {
391		      size_t mlen = mbrlen (beg, bytes_left, &mbs);
392
393		      last_char = beg;
394		      if (mlen == (size_t) -1 || mlen == 0)
395			{
396			  /* Incomplete character: treat as single-byte. */
397			  memset (&mbs, '\0', sizeof (mbstate_t));
398			  beg++;
399			  bytes_left--;
400			  continue;
401			}
402
403		      if (mlen == (size_t) -2)
404			/* Offset points inside multibyte character:
405			 * no good. */
406			break;
407
408		      beg += mlen;
409		      bytes_left -= mlen;
410		    }
411		}
412	      else
413#endif /* MBS_SUPPORT */
414	      beg += offset;
415	      /* Narrow down to the line containing the candidate, and
416		 run it through DFA. */
417	      end = memchr(beg, eol, buflim - beg);
418	      end++;
419#ifdef MBS_SUPPORT
420	      if (mb_cur_max > 1 && bytes_left)
421		continue;
422#endif /* MBS_SUPPORT */
423	      while (beg > buf && beg[-1] != eol)
424		--beg;
425	      if (
426#ifdef MBS_SUPPORT
427		  !(match_icase && mb_cur_max > 1) &&
428#endif /* MBS_SUPPORT */
429		  (kwsm.index < kwset_exact_matches))
430		goto success_in_beg_and_end;
431	      if (use_dfa &&
432		  dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1)
433		continue;
434	    }
435	  else
436	    {
437	      /* No good fixed strings; start with DFA. */
438#ifdef MBS_SUPPORT
439	      size_t bytes_left = 0;
440#endif /* MBS_SUPPORT */
441	      size_t offset = 0;
442	      if (use_dfa)
443		offset = dfaexec (&dfa, beg, buflim - beg, &backref);
444	      if (offset == (size_t) -1)
445		break;
446	      /* Narrow down to the line we've found. */
447#ifdef MBS_SUPPORT
448	      if (mb_cur_max > 1 && !using_utf8)
449		{
450		  bytes_left = offset;
451		  while (bytes_left)
452		    {
453		      size_t mlen = mbrlen (beg, bytes_left, &mbs);
454
455		      last_char = beg;
456		      if (mlen == (size_t) -1 || mlen == 0)
457			{
458			  /* Incomplete character: treat as single-byte. */
459			  memset (&mbs, '\0', sizeof (mbstate_t));
460			  beg++;
461			  bytes_left--;
462			  continue;
463			}
464
465		      if (mlen == (size_t) -2)
466			/* Offset points inside multibyte character:
467			 * no good. */
468			break;
469
470		      beg += mlen;
471		      bytes_left -= mlen;
472		    }
473		}
474	      else
475#endif /* MBS_SUPPORT */
476	      beg += offset;
477	      end = memchr (beg, eol, buflim - beg);
478	      end++;
479#ifdef MBS_SUPPORT
480	      if (mb_cur_max > 1 && bytes_left)
481		continue;
482#endif /* MBS_SUPPORT */
483	      while (beg > buf && beg[-1] != eol)
484		--beg;
485	    }
486	  /* Successful, no backreferences encountered! */
487	  if (use_dfa && !backref)
488	    goto success_in_beg_and_end;
489	}
490      else
491	end = beg + size;
492
493      /* If we've made it to this point, this means DFA has seen
494	 a probable match, and we need to run it through Regex. */
495      for (i = 0; i < pcount; i++)
496	{
497	  patterns[i].regexbuf.not_eol = 0;
498	  if (0 <= (start = re_search (&(patterns[i].regexbuf), beg,
499				       end - beg - 1, 0,
500				       end - beg - 1, &(patterns[i].regs))))
501	    {
502	      len = patterns[i].regs.end[0] - start;
503	      if (exact && !match_words)
504	        goto success_in_start_and_len;
505	      if ((!match_lines && !match_words)
506		  || (match_lines && len == end - beg - 1))
507		goto success_in_beg_and_end;
508	      /* If -w, check if the match aligns with word boundaries.
509		 We do this iteratively because:
510		 (a) the line may contain more than one occurence of the
511		 pattern, and
512		 (b) Several alternatives in the pattern might be valid at a
513		 given point, and we may need to consider a shorter one to
514		 find a word boundary.  */
515	      if (match_words)
516		while (start >= 0)
517		  {
518		    int lword_match = 0;
519		    if (start == 0)
520		      lword_match = 1;
521		    else
522		      {
523			assert (start > 0);
524#ifdef MBS_SUPPORT
525			if (mb_cur_max > 1)
526			  {
527			    const char *s;
528			    size_t mr;
529			    wchar_t pwc;
530
531			    /* Locate the start of the multibyte character
532			       before the match position (== beg + start).  */
533			    if (using_utf8)
534			      {
535				/* UTF-8 is a special case: scan backwards
536				   until we find a 7-bit character or a
537				   lead byte.  */
538				s = beg + start - 1;
539				while (s > buf
540				       && (unsigned char) *s >= 0x80
541				       && (unsigned char) *s <= 0xbf)
542				  --s;
543			      }
544			    else
545			      {
546				/* Scan forwards to find the start of the
547				   last complete character before the
548				   match position.  */
549				size_t bytes_left = start - 1;
550				s = beg;
551				while (bytes_left > 0)
552				  {
553				    mr = mbrlen (s, bytes_left, &mbs);
554				    if (mr == (size_t) -1 || mr == 0)
555				      {
556					memset (&mbs, '\0', sizeof (mbs));
557					s++;
558					bytes_left--;
559					continue;
560				      }
561				    if (mr == (size_t) -2)
562				      {
563					memset (&mbs, '\0', sizeof (mbs));
564					break;
565				      }
566				    s += mr;
567				    bytes_left -= mr;
568				  }
569			      }
570			    mr = mbrtowc (&pwc, s, beg + start - s, &mbs);
571			    if (mr == (size_t) -2 || mr == (size_t) -1 ||
572				mr == 0)
573			      {
574				memset (&mbs, '\0', sizeof (mbstate_t));
575				lword_match = 1;
576			      }
577			    else if (!(iswalnum (pwc) || pwc == L'_')
578				     && mr == beg + start - s)
579			      lword_match = 1;
580			  }
581			else
582#endif /* MBS_SUPPORT */
583			if (!WCHAR ((unsigned char) beg[start - 1]))
584			  lword_match = 1;
585		      }
586
587		    if (lword_match)
588		      {
589			int rword_match = 0;
590			if (start + len == end - beg - 1)
591			  rword_match = 1;
592			else
593			  {
594#ifdef MBS_SUPPORT
595			    if (mb_cur_max > 1)
596			      {
597				wchar_t nwc;
598				int mr;
599
600				mr = mbtowc (&nwc, beg + start + len,
601					     end - beg - start - len - 1);
602				if (mr <= 0)
603				  {
604				    memset (&mbs, '\0', sizeof (mbstate_t));
605				    rword_match = 1;
606				  }
607				else if (!iswalnum (nwc) && nwc != L'_')
608				  rword_match = 1;
609			      }
610			    else
611#endif /* MBS_SUPPORT */
612			    if (!WCHAR ((unsigned char) beg[start + len]))
613			      rword_match = 1;
614			  }
615
616			if (rword_match)
617			  {
618			    if (!exact)
619			      /* Returns the whole line. */
620			      goto success_in_beg_and_end;
621			    else
622			      /* Returns just this word match. */
623			      goto success_in_start_and_len;
624			  }
625		      }
626		    if (len > 0)
627		      {
628			/* Try a shorter length anchored at the same place. */
629			--len;
630			patterns[i].regexbuf.not_eol = 1;
631			len = re_match (&(patterns[i].regexbuf), beg,
632					start + len, start,
633					&(patterns[i].regs));
634		      }
635		    if (len <= 0)
636		      {
637			/* Try looking further on. */
638			if (start == end - beg - 1)
639			  break;
640			++start;
641			patterns[i].regexbuf.not_eol = 0;
642			start = re_search (&(patterns[i].regexbuf), beg,
643					   end - beg - 1,
644					   start, end - beg - 1 - start,
645					   &(patterns[i].regs));
646			len = patterns[i].regs.end[0] - start;
647		      }
648		  }
649	    }
650	} /* for Regex patterns.  */
651    } /* for (beg = end ..) */
652
653 failure:
654  return (size_t) -1;
655
656 success_in_beg_and_end:
657  len = end - beg;
658  start = beg - buf;
659  /* FALLTHROUGH */
660
661 success_in_start_and_len:
662  *match_size = len;
663  return start;
664}
665
666#ifdef MBS_SUPPORT
667static int f_i_multibyte; /* whether we're using the new -Fi MB method */
668static struct
669{
670  wchar_t **patterns;
671  size_t count, maxlen;
672  unsigned char *match;
673} Fimb;
674#endif
675
676static void
677Fcompile (char const *pattern, size_t size)
678{
679  int mb_cur_max = MB_CUR_MAX;
680  char const *beg, *lim, *err;
681
682  check_utf8 ();
683#ifdef MBS_SUPPORT
684  /* Support -F -i for UTF-8 input. */
685  if (match_icase && mb_cur_max > 1)
686    {
687      mbstate_t mbs;
688      wchar_t *wcpattern = xmalloc ((size + 1) * sizeof (wchar_t));
689      const char *patternend = pattern;
690      size_t wcsize;
691      kwset_t fimb_kwset = NULL;
692      char *starts = NULL;
693      wchar_t *wcbeg, *wclim;
694      size_t allocated = 0;
695
696      memset (&mbs, '\0', sizeof (mbs));
697# ifdef __GNU_LIBRARY__
698      wcsize = mbsnrtowcs (wcpattern, &patternend, size, size, &mbs);
699      if (patternend != pattern + size)
700	wcsize = (size_t) -1;
701# else
702      {
703	char *patterncopy = xmalloc (size + 1);
704
705	memcpy (patterncopy, pattern, size);
706	patterncopy[size] = '\0';
707	patternend = patterncopy;
708	wcsize = mbsrtowcs (wcpattern, &patternend, size, &mbs);
709	if (patternend != patterncopy + size)
710	  wcsize = (size_t) -1;
711	free (patterncopy);
712      }
713# endif
714      if (wcsize + 2 <= 2)
715	{
716fimb_fail:
717	  free (wcpattern);
718	  free (starts);
719	  if (fimb_kwset)
720	    kwsfree (fimb_kwset);
721	  free (Fimb.patterns);
722	  Fimb.patterns = NULL;
723	}
724      else
725	{
726	  if (!(fimb_kwset = kwsalloc (NULL)))
727	    error (2, 0, _("memory exhausted"));
728
729	  starts = xmalloc (mb_cur_max * 3);
730	  wcbeg = wcpattern;
731	  do
732	    {
733	      int i;
734	      size_t wclen;
735
736	      if (Fimb.count >= allocated)
737		{
738		  if (allocated == 0)
739		    allocated = 128;
740		  else
741		    allocated *= 2;
742		  Fimb.patterns = xrealloc (Fimb.patterns,
743					    sizeof (wchar_t *) * allocated);
744		}
745	      Fimb.patterns[Fimb.count++] = wcbeg;
746	      for (wclim = wcbeg;
747		   wclim < wcpattern + wcsize && *wclim != L'\n'; ++wclim)
748		*wclim = towlower (*wclim);
749	      *wclim = L'\0';
750	      wclen = wclim - wcbeg;
751	      if (wclen > Fimb.maxlen)
752		Fimb.maxlen = wclen;
753	      if (wclen > 3)
754		wclen = 3;
755	      if (wclen == 0)
756		{
757		  if ((err = kwsincr (fimb_kwset, "", 0)) != 0)
758		    error (2, 0, err);
759		}
760	      else
761		for (i = 0; i < (1 << wclen); i++)
762		  {
763		    char *p = starts;
764		    int j, k;
765
766		    for (j = 0; j < wclen; ++j)
767		      {
768			wchar_t wc = wcbeg[j];
769			if (i & (1 << j))
770			  {
771			    wc = towupper (wc);
772			    if (wc == wcbeg[j])
773			      continue;
774			  }
775			k = wctomb (p, wc);
776			if (k <= 0)
777			  goto fimb_fail;
778			p += k;
779		      }
780		    if ((err = kwsincr (fimb_kwset, starts, p - starts)) != 0)
781		      error (2, 0, err);
782		  }
783	      if (wclim < wcpattern + wcsize)
784		++wclim;
785	      wcbeg = wclim;
786	    }
787	  while (wcbeg < wcpattern + wcsize);
788	  f_i_multibyte = 1;
789	  kwset = fimb_kwset;
790	  free (starts);
791	  Fimb.match = xmalloc (Fimb.count);
792	  if ((err = kwsprep (kwset)) != 0)
793	    error (2, 0, err);
794	  return;
795	}
796    }
797#endif /* MBS_SUPPORT */
798
799
800  kwsinit ();
801  beg = pattern;
802  do
803    {
804      for (lim = beg; lim < pattern + size && *lim != '\n'; ++lim)
805	;
806      if ((err = kwsincr (kwset, beg, lim - beg)) != 0)
807	error (2, 0, err);
808      if (lim < pattern + size)
809	++lim;
810      beg = lim;
811    }
812  while (beg < pattern + size);
813
814  if ((err = kwsprep (kwset)) != 0)
815    error (2, 0, err);
816}
817
818#ifdef MBS_SUPPORT
819static int
820Fimbexec (const char *buf, size_t size, size_t *plen, int exact)
821{
822  size_t len, letter, i;
823  int ret = -1;
824  mbstate_t mbs;
825  wchar_t wc;
826  int patterns_left;
827
828  assert (match_icase && f_i_multibyte == 1);
829  assert (MB_CUR_MAX > 1);
830
831  memset (&mbs, '\0', sizeof (mbs));
832  memset (Fimb.match, '\1', Fimb.count);
833  letter = len = 0;
834  patterns_left = 1;
835  while (patterns_left && len <= size)
836    {
837      size_t c;
838
839      patterns_left = 0;
840      if (len < size)
841	{
842	  c = mbrtowc (&wc, buf + len, size - len, &mbs);
843	  if (c + 2 <= 2)
844	    return ret;
845
846	  wc = towlower (wc);
847	}
848      else
849	{
850	  c = 1;
851	  wc = L'\0';
852	}
853
854      for (i = 0; i < Fimb.count; i++)
855	{
856	  if (Fimb.match[i])
857	    {
858	      if (Fimb.patterns[i][letter] == L'\0')
859		{
860		  /* Found a match. */
861		  *plen = len;
862		  if (!exact && !match_words)
863		    return 0;
864		  else
865		    {
866		      /* For -w or exact look for longest match.  */
867		      ret = 0;
868		      Fimb.match[i] = '\0';
869		      continue;
870		    }
871		}
872
873	      if (Fimb.patterns[i][letter] == wc)
874		patterns_left = 1;
875	      else
876		Fimb.match[i] = '\0';
877	    }
878	}
879
880      len += c;
881      letter++;
882    }
883
884  return ret;
885}
886#endif /* MBS_SUPPORT */
887
888static size_t
889Fexecute (char const *buf, size_t size, size_t *match_size, int exact)
890{
891  register char const *beg, *try, *end;
892  register size_t len;
893  char eol = eolbyte;
894  struct kwsmatch kwsmatch;
895  size_t ret_val;
896#ifdef MBS_SUPPORT
897  int mb_cur_max = MB_CUR_MAX;
898  mbstate_t mbs;
899  memset (&mbs, '\0', sizeof (mbstate_t));
900  const char *last_char = NULL;
901#endif /* MBS_SUPPORT */
902
903  for (beg = buf; beg <= buf + size; ++beg)
904    {
905      size_t offset;
906      offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch);
907
908      if (offset == (size_t) -1)
909	goto failure;
910#ifdef MBS_SUPPORT
911      if (mb_cur_max > 1 && !using_utf8)
912	{
913	  size_t bytes_left = offset;
914	  while (bytes_left)
915	    {
916	      size_t mlen = mbrlen (beg, bytes_left, &mbs);
917
918	      last_char = beg;
919	      if (mlen == (size_t) -1 || mlen == 0)
920		{
921		  /* Incomplete character: treat as single-byte. */
922		  memset (&mbs, '\0', sizeof (mbstate_t));
923		  beg++;
924		  bytes_left--;
925		  continue;
926		}
927
928	      if (mlen == (size_t) -2)
929		/* Offset points inside multibyte character: no good. */
930		break;
931
932	      beg += mlen;
933	      bytes_left -= mlen;
934	    }
935
936	  if (bytes_left)
937	    continue;
938	}
939      else
940#endif /* MBS_SUPPORT */
941      beg += offset;
942#ifdef MBS_SUPPORT
943      /* For f_i_multibyte, the string at beg now matches first 3 chars of
944	 one of the search strings (less if there are shorter search strings).
945	 See if this is a real match.  */
946      if (f_i_multibyte
947	  && Fimbexec (beg, buf + size - beg, &kwsmatch.size[0], exact))
948	goto next_char;
949#endif /* MBS_SUPPORT */
950      len = kwsmatch.size[0];
951      if (exact && !match_words)
952	goto success_in_beg_and_len;
953      if (match_lines)
954	{
955	  if (beg > buf && beg[-1] != eol)
956	    goto next_char;
957	  if (beg + len < buf + size && beg[len] != eol)
958	    goto next_char;
959	  goto success;
960	}
961      else if (match_words)
962	{
963	  while (1)
964	    {
965	      int word_match = 0;
966	      if (beg > buf)
967		{
968#ifdef MBS_SUPPORT
969		  if (mb_cur_max > 1)
970		    {
971		      const char *s;
972		      int mr;
973		      wchar_t pwc;
974
975		      if (using_utf8)
976			{
977			  s = beg - 1;
978			  while (s > buf
979				 && (unsigned char) *s >= 0x80
980				 && (unsigned char) *s <= 0xbf)
981			    --s;
982			}
983		      else
984			s = last_char;
985		      mr = mbtowc (&pwc, s, beg - s);
986		      if (mr <= 0)
987			memset (&mbs, '\0', sizeof (mbstate_t));
988		      else if ((iswalnum (pwc) || pwc == L'_')
989			       && mr == (int) (beg - s))
990			goto next_char;
991		    }
992		  else
993#endif /* MBS_SUPPORT */
994		  if (WCHAR ((unsigned char) beg[-1]))
995		    goto next_char;
996		}
997#ifdef MBS_SUPPORT
998	      if (mb_cur_max > 1)
999		{
1000		  wchar_t nwc;
1001		  int mr;
1002
1003		  mr = mbtowc (&nwc, beg + len, buf + size - beg - len);
1004		  if (mr <= 0)
1005		    {
1006		      memset (&mbs, '\0', sizeof (mbstate_t));
1007		      word_match = 1;
1008		    }
1009		  else if (!iswalnum (nwc) && nwc != L'_')
1010		    word_match = 1;
1011		}
1012	      else
1013#endif /* MBS_SUPPORT */
1014		if (beg + len >= buf + size || !WCHAR ((unsigned char) beg[len]))
1015		  word_match = 1;
1016	      if (word_match)
1017		{
1018		  if (!exact)
1019		    /* Returns the whole line now we know there's a word match. */
1020		    goto success;
1021		  else
1022		    /* Returns just this word match. */
1023		    goto success_in_beg_and_len;
1024		}
1025	      if (len > 0)
1026		{
1027		  /* Try a shorter length anchored at the same place. */
1028		  --len;
1029		  offset = kwsexec (kwset, beg, len, &kwsmatch);
1030
1031		  if (offset == -1)
1032		    goto next_char; /* Try a different anchor. */
1033#ifdef MBS_SUPPORT
1034		  if (mb_cur_max > 1 && !using_utf8)
1035		    {
1036		      size_t bytes_left = offset;
1037		      while (bytes_left)
1038			{
1039			  size_t mlen = mbrlen (beg, bytes_left, &mbs);
1040
1041			  last_char = beg;
1042			  if (mlen == (size_t) -1 || mlen == 0)
1043			    {
1044			      /* Incomplete character: treat as single-byte. */
1045			      memset (&mbs, '\0', sizeof (mbstate_t));
1046			      beg++;
1047			      bytes_left--;
1048			      continue;
1049			    }
1050
1051			  if (mlen == (size_t) -2)
1052			    {
1053			      /* Offset points inside multibyte character:
1054			       * no good. */
1055			      break;
1056			    }
1057
1058			  beg += mlen;
1059			  bytes_left -= mlen;
1060			}
1061
1062		      if (bytes_left)
1063			{
1064			  memset (&mbs, '\0', sizeof (mbstate_t));
1065			  goto next_char; /* Try a different anchor. */
1066			}
1067		    }
1068		  else
1069#endif /* MBS_SUPPORT */
1070		  beg += offset;
1071#ifdef MBS_SUPPORT
1072		  /* The string at beg now matches first 3 chars of one of
1073		     the search strings (less if there are shorter search
1074		     strings).  See if this is a real match.  */
1075		  if (f_i_multibyte
1076		      && Fimbexec (beg, len - offset, &kwsmatch.size[0],
1077				   exact))
1078		    goto next_char;
1079#endif /* MBS_SUPPORT */
1080		  len = kwsmatch.size[0];
1081		}
1082	    }
1083	}
1084      else
1085	goto success;
1086next_char:;
1087#ifdef MBS_SUPPORT
1088      /* Advance to next character.  For MB_CUR_MAX == 1 case this is handled
1089	 by ++beg above.  */
1090      if (mb_cur_max > 1)
1091	{
1092	  if (using_utf8)
1093	    {
1094	      unsigned char c = *beg;
1095	      if (c >= 0xc2)
1096		{
1097		  if (c < 0xe0)
1098		    ++beg;
1099		  else if (c < 0xf0)
1100		    beg += 2;
1101		  else if (c < 0xf8)
1102		    beg += 3;
1103		  else if (c < 0xfc)
1104		    beg += 4;
1105		  else if (c < 0xfe)
1106		    beg += 5;
1107		}
1108	    }
1109	  else
1110	    {
1111	      size_t l = mbrlen (beg, buf + size - beg, &mbs);
1112
1113	      last_char = beg;
1114	      if (l + 2 >= 2)
1115		beg += l - 1;
1116	      else
1117		memset (&mbs, '\0', sizeof (mbstate_t));
1118	    }
1119	}
1120#endif /* MBS_SUPPORT */
1121    }
1122
1123 failure:
1124  return -1;
1125
1126 success:
1127#ifdef MBS_SUPPORT
1128  if (mb_cur_max > 1 && !using_utf8)
1129    {
1130      end = beg + len;
1131      while (end < buf + size)
1132	{
1133	  size_t mlen = mbrlen (end, buf + size - end, &mbs);
1134	  if (mlen == (size_t) -1 || mlen == (size_t) -2 || mlen == 0)
1135	    {
1136	      memset (&mbs, '\0', sizeof (mbstate_t));
1137	      mlen = 1;
1138	    }
1139	  if (mlen == 1 && *end == eol)
1140	    break;
1141
1142	  end += mlen;
1143	}
1144    }
1145  else
1146#endif /* MBS_SUPPORT */
1147  end = memchr (beg + len, eol, (buf + size) - (beg + len));
1148
1149  end++;
1150  while (buf < beg && beg[-1] != eol)
1151    --beg;
1152  len = end - beg;
1153  /* FALLTHROUGH */
1154
1155 success_in_beg_and_len:
1156  *match_size = len;
1157  return beg - buf;
1158}
1159
1160#if HAVE_LIBPCRE
1161/* Compiled internal form of a Perl regular expression.  */
1162static pcre *cre;
1163
1164/* Additional information about the pattern.  */
1165static pcre_extra *extra;
1166#endif
1167
1168static void
1169Pcompile (char const *pattern, size_t size)
1170{
1171#if !HAVE_LIBPCRE
1172  error (2, 0, _("The -P option is not supported"));
1173#else
1174  int e;
1175  char const *ep;
1176  char *re = xmalloc (4 * size + 7);
1177  int flags = PCRE_MULTILINE | (match_icase ? PCRE_CASELESS : 0);
1178  char const *patlim = pattern + size;
1179  char *n = re;
1180  char const *p;
1181  char const *pnul;
1182
1183  /* FIXME: Remove this restriction.  */
1184  if (eolbyte != '\n')
1185    error (2, 0, _("The -P and -z options cannot be combined"));
1186
1187  *n = '\0';
1188  if (match_lines)
1189    strcpy (n, "^(");
1190  if (match_words)
1191    strcpy (n, "\\b(");
1192  n += strlen (n);
1193
1194  /* The PCRE interface doesn't allow NUL bytes in the pattern, so
1195     replace each NUL byte in the pattern with the four characters
1196     "\000", removing a preceding backslash if there are an odd
1197     number of backslashes before the NUL.
1198
1199     FIXME: This method does not work with some multibyte character
1200     encodings, notably Shift-JIS, where a multibyte character can end
1201     in a backslash byte.  */
1202  for (p = pattern; (pnul = memchr (p, '\0', patlim - p)); p = pnul + 1)
1203    {
1204      memcpy (n, p, pnul - p);
1205      n += pnul - p;
1206      for (p = pnul; pattern < p && p[-1] == '\\'; p--)
1207	continue;
1208      n -= (pnul - p) & 1;
1209      strcpy (n, "\\000");
1210      n += 4;
1211    }
1212
1213  memcpy (n, p, patlim - p);
1214  n += patlim - p;
1215  *n = '\0';
1216  if (match_words)
1217    strcpy (n, ")\\b");
1218  if (match_lines)
1219    strcpy (n, ")$");
1220
1221  cre = pcre_compile (re, flags, &ep, &e, pcre_maketables ());
1222  if (!cre)
1223    error (2, 0, ep);
1224
1225  extra = pcre_study (cre, 0, &ep);
1226  if (ep)
1227    error (2, 0, ep);
1228
1229  free (re);
1230#endif
1231}
1232
1233static size_t
1234Pexecute (char const *buf, size_t size, size_t *match_size, int exact)
1235{
1236#if !HAVE_LIBPCRE
1237  abort ();
1238  return -1;
1239#else
1240  /* This array must have at least two elements; everything after that
1241     is just for performance improvement in pcre_exec.  */
1242  int sub[300];
1243
1244  int e = pcre_exec (cre, extra, buf, size, 0, 0,
1245		     sub, sizeof sub / sizeof *sub);
1246
1247  if (e <= 0)
1248    {
1249      switch (e)
1250	{
1251	case PCRE_ERROR_NOMATCH:
1252	  return -1;
1253
1254	case PCRE_ERROR_NOMEMORY:
1255	  error (2, 0, _("Memory exhausted"));
1256
1257	default:
1258	  abort ();
1259	}
1260    }
1261  else
1262    {
1263      /* Narrow down to the line we've found.  */
1264      char const *beg = buf + sub[0];
1265      char const *end = buf + sub[1];
1266      char const *buflim = buf + size;
1267      char eol = eolbyte;
1268      if (!exact)
1269	{
1270	  end = memchr (end, eol, buflim - end);
1271	  end++;
1272	  while (buf < beg && beg[-1] != eol)
1273	    --beg;
1274	}
1275
1276      *match_size = end - beg;
1277      return beg - buf;
1278    }
1279#endif
1280}
1281
1282struct matcher const matchers[] = {
1283  { "default", Gcompile, EGexecute },
1284  { "grep", Gcompile, EGexecute },
1285  { "egrep", Ecompile, EGexecute },
1286  { "awk", Ecompile, EGexecute },
1287  { "fgrep", Fcompile, Fexecute },
1288  { "perl", Pcompile, Pexecute },
1289  { "", 0, 0 },
1290};
1291