/* search.c - searching subroutines using dfa, kwset and regex for grep.
   Copyright 1992, 1998, 2000 Free Software Foundation, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
   02111-1307, USA.  */

/* Written August 1992 by Mike Haertel. */

/* $FreeBSD$ */
2253451Speter
23146205Stjr#ifndef _GNU_SOURCE
24146205Stjr# define _GNU_SOURCE 1
25146205Stjr#endif
2653479Sobrien#ifdef HAVE_CONFIG_H
2753479Sobrien# include <config.h>
2853479Sobrien#endif
29146205Stjr#include <assert.h>
3053451Speter#include <sys/types.h>
31131557Stjr#if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC
32131557Stjr/* We can handle multibyte string.  */
33131557Stjr# define MBS_SUPPORT
34131557Stjr# include <wchar.h>
35131557Stjr# include <wctype.h>
36131557Stjr#endif
37131557Stjr
3853479Sobrien#include "system.h"
3953479Sobrien#include "grep.h"
4053479Sobrien#include "regex.h"
4153451Speter#include "dfa.h"
4253451Speter#include "kwset.h"
43131557Stjr#include "error.h"
44131557Stjr#include "xalloc.h"
45131557Stjr#ifdef HAVE_LIBPCRE
46131557Stjr# include <pcre.h>
47131557Stjr#endif
48146205Stjr#ifdef HAVE_LANGINFO_CODESET
49146205Stjr# include <langinfo.h>
50146205Stjr#endif
5153451Speter
/* Number of distinct values an unsigned char can hold; used below as
   the size of the case-folding table handed to the kwset code.  */
#define NCHAR (UCHAR_MAX + 1)

/* Word-constituent test used for -w: an alphanumeric character or an
   underscore.  */
#define WCHAR(C) (ISALNUM (C) || (C) == '_')
5653451Speter
5753451Speter/* DFA compiled regexp. */
5853451Speterstatic struct dfa dfa;
5953451Speter
60131557Stjr/* The Regex compiled patterns.  */
61131557Stjrstatic struct patterns
62131557Stjr{
63131557Stjr  /* Regex compiled regexp. */
64131557Stjr  struct re_pattern_buffer regexbuf;
65131557Stjr  struct re_registers regs; /* This is here on account of a BRAIN-DEAD
66131557Stjr			       Q@#%!# library interface in regex.c.  */
67131557Stjr} patterns0;
6853451Speter
69131557Stjrstruct patterns *patterns;
70131557Stjrsize_t pcount;
71131557Stjr
7253451Speter/* KWset compiled pattern.  For Ecompile and Gcompile, we compile
7353451Speter   a list of strings, at least one of which is known to occur in
7453451Speter   any string matching the regexp. */
7553451Speterstatic kwset_t kwset;
7653451Speter
77131557Stjr/* Number of compiled fixed strings known to exactly match the regexp.
78131557Stjr   If kwsexec returns < kwset_exact_matches, then we don't need to
7953451Speter   call the regexp matcher at all. */
80131557Stjrstatic int kwset_exact_matches;
8153451Speter
82146205Stjr/* UTF-8 encoding allows some optimizations that we can't otherwise
83146205Stjr   assume in a multibyte encoding. */
84146205Stjrstatic int using_utf8;
85146205Stjr
86131557Stjrstatic void kwsinit PARAMS ((void));
87131557Stjrstatic void kwsmusts PARAMS ((void));
88131557Stjrstatic void Gcompile PARAMS ((char const *, size_t));
89131557Stjrstatic void Ecompile PARAMS ((char const *, size_t));
90146199Stjrstatic size_t EGexecute PARAMS ((char const *, size_t, size_t *, int ));
91131557Stjrstatic void Fcompile PARAMS ((char const *, size_t));
92146199Stjrstatic size_t Fexecute PARAMS ((char const *, size_t, size_t *, int));
93131557Stjrstatic void Pcompile PARAMS ((char const *, size_t ));
94146199Stjrstatic size_t Pexecute PARAMS ((char const *, size_t, size_t *, int));
95131557Stjr
/* Set USING_UTF8 when the current locale's codeset is reported as
   "UTF-8".  Does nothing when nl_langinfo (CODESET) is unavailable;
   the flag is never cleared here.  */
void
check_utf8 (void)
{
#ifdef HAVE_LANGINFO_CODESET
  char const *codeset = nl_langinfo (CODESET);

  if (!strcmp (codeset, "UTF-8"))
    using_utf8 = 1;
#endif
}
104146205Stjr
/* Error callback for the DFA code: report MESG through error () with
   exit status 2.  MESG is passed as an argument to a literal "%s"
   format rather than as the format itself, so a '%' inside the
   message is printed verbatim instead of being interpreted as a
   conversion specification.  */
void
dfaerror (char const *mesg)
{
  error (2, 0, "%s", mesg);
}
11053451Speter
11153451Speterstatic void
11256920Srukwsinit (void)
11353451Speter{
11453451Speter  static char trans[NCHAR];
115250823Spfg  size_t i;
11653451Speter
11753451Speter  if (match_icase)
11853451Speter    for (i = 0; i < NCHAR; ++i)
119131557Stjr      trans[i] = TOLOWER (i);
12053451Speter
121131557Stjr  if (!(kwset = kwsalloc (match_icase ? trans : (char *) 0)))
122131557Stjr    error (2, 0, _("memory exhausted"));
12353451Speter}
12453451Speter
12553451Speter/* If the DFA turns out to have some set of fixed strings one of
12653451Speter   which must occur in the match, then we build a kwset matcher
12753451Speter   to find those strings, and thus quickly filter out impossible
12853451Speter   matches. */
12953451Speterstatic void
13056920Srukwsmusts (void)
13153451Speter{
132131557Stjr  struct dfamust const *dm;
133131557Stjr  char const *err;
13453451Speter
13553451Speter  if (dfa.musts)
13653451Speter    {
137131557Stjr      kwsinit ();
13853451Speter      /* First, we compile in the substrings known to be exact
13953451Speter	 matches.  The kwset matcher will return the index
14053451Speter	 of the matching string that it chooses. */
14153451Speter      for (dm = dfa.musts; dm; dm = dm->next)
14253451Speter	{
14353451Speter	  if (!dm->exact)
14453451Speter	    continue;
145131557Stjr	  ++kwset_exact_matches;
146131557Stjr	  if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != 0)
147131557Stjr	    error (2, 0, err);
14853451Speter	}
14953451Speter      /* Now, we compile the substrings that will require
15053451Speter	 the use of the regexp matcher.  */
15153451Speter      for (dm = dfa.musts; dm; dm = dm->next)
15253451Speter	{
15353451Speter	  if (dm->exact)
15453451Speter	    continue;
155131557Stjr	  if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != 0)
156131557Stjr	    error (2, 0, err);
15753451Speter	}
158131557Stjr      if ((err = kwsprep (kwset)) != 0)
159131557Stjr	error (2, 0, err);
16053451Speter    }
16153451Speter}
16253451Speter
16353451Speterstatic void
164131557StjrGcompile (char const *pattern, size_t size)
16553451Speter{
16653479Sobrien  const char *err;
167131557Stjr  char const *sep;
168131557Stjr  size_t total = size;
169131557Stjr  char const *motif = pattern;
17053451Speter
171146205Stjr  check_utf8 ();
172146820Stjr  re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE | (match_icase ? RE_ICASE : 0));
173131557Stjr  dfasyntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte);
17453451Speter
175131557Stjr  /* For GNU regex compiler we have to pass the patterns separately to detect
176131557Stjr     errors like "[\nallo\n]\n".  The patterns here are "[", "allo" and "]"
177131557Stjr     GNU regex should have raise a syntax error.  The same for backref, where
178131557Stjr     the backref should have been local to each pattern.  */
179131557Stjr  do
180131557Stjr    {
181131557Stjr      size_t len;
182131557Stjr      sep = memchr (motif, '\n', total);
183131557Stjr      if (sep)
184131557Stjr	{
185131557Stjr	  len = sep - motif;
186131557Stjr	  sep++;
187131557Stjr	  total -= (len + 1);
188131557Stjr	}
189131557Stjr      else
190131557Stjr	{
191131557Stjr	  len = total;
192131557Stjr	  total = 0;
193131557Stjr	}
19453451Speter
195131557Stjr      patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns));
196131557Stjr      if (patterns == NULL)
197131557Stjr	error (2, errno, _("memory exhausted"));
198131557Stjr
199131557Stjr      patterns[pcount] = patterns0;
200131557Stjr
201131557Stjr      if ((err = re_compile_pattern (motif, len,
202131557Stjr				    &(patterns[pcount].regexbuf))) != 0)
203131557Stjr	error (2, 0, err);
204131557Stjr      pcount++;
205131557Stjr
206131557Stjr      motif = sep;
207131557Stjr    } while (sep && total != 0);
208131557Stjr
20953451Speter  /* In the match_words and match_lines cases, we use a different pattern
21053451Speter     for the DFA matcher that will quickly throw out cases that won't work.
21153451Speter     Then if DFA succeeds we do some hairy stuff using the regex matcher
21253451Speter     to decide whether the match should really count. */
21353451Speter  if (match_words || match_lines)
21453451Speter    {
21553451Speter      /* In the whole-word case, we use the pattern:
216131557Stjr	 \(^\|[^[:alnum:]_]\)\(userpattern\)\([^[:alnum:]_]|$\).
21753451Speter	 In the whole-line case, we use the pattern:
218131557Stjr	 ^\(userpattern\)$.  */
21953451Speter
220131557Stjr      static char const line_beg[] = "^\\(";
221131557Stjr      static char const line_end[] = "\\)$";
222131557Stjr      static char const word_beg[] = "\\(^\\|[^[:alnum:]_]\\)\\(";
223131557Stjr      static char const word_end[] = "\\)\\([^[:alnum:]_]\\|$\\)";
224131564Stjr      char *n = xmalloc (sizeof word_beg - 1 + size + sizeof word_end);
225131557Stjr      size_t i;
226131557Stjr      strcpy (n, match_lines ? line_beg : word_beg);
227131557Stjr      i = strlen (n);
228131557Stjr      memcpy (n + i, pattern, size);
22953451Speter      i += size;
230131557Stjr      strcpy (n + i, match_lines ? line_end : word_end);
231131557Stjr      i += strlen (n + i);
232131557Stjr      pattern = n;
233131557Stjr      size = i;
23453451Speter    }
23553451Speter
236131557Stjr  dfacomp (pattern, size, &dfa, 1);
237131557Stjr  kwsmusts ();
23853451Speter}
23953451Speter
24053451Speterstatic void
241131557StjrEcompile (char const *pattern, size_t size)
24253451Speter{
24353479Sobrien  const char *err;
244131557Stjr  const char *sep;
245131557Stjr  size_t total = size;
246131557Stjr  char const *motif = pattern;
24753451Speter
248146205Stjr  check_utf8 ();
249131557Stjr  if (strcmp (matcher, "awk") == 0)
25053451Speter    {
251146820Stjr      re_set_syntax (RE_SYNTAX_AWK | (match_icase ? RE_ICASE : 0));
252131557Stjr      dfasyntax (RE_SYNTAX_AWK, match_icase, eolbyte);
25353479Sobrien    }
25453451Speter  else
25553451Speter    {
256146820Stjr      re_set_syntax (RE_SYNTAX_POSIX_EGREP | (match_icase ? RE_ICASE : 0));
25755379Sobrien      dfasyntax (RE_SYNTAX_POSIX_EGREP, match_icase, eolbyte);
25853451Speter    }
25953451Speter
260131557Stjr  /* For GNU regex compiler we have to pass the patterns separately to detect
261131557Stjr     errors like "[\nallo\n]\n".  The patterns here are "[", "allo" and "]"
262131557Stjr     GNU regex should have raise a syntax error.  The same for backref, where
263131557Stjr     the backref should have been local to each pattern.  */
264131557Stjr  do
265131557Stjr    {
266131557Stjr      size_t len;
267131557Stjr      sep = memchr (motif, '\n', total);
268131557Stjr      if (sep)
269131557Stjr	{
270131557Stjr	  len = sep - motif;
271131557Stjr	  sep++;
272131557Stjr	  total -= (len + 1);
273131557Stjr	}
274131557Stjr      else
275131557Stjr	{
276131557Stjr	  len = total;
277131557Stjr	  total = 0;
278131557Stjr	}
27953451Speter
280131557Stjr      patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns));
281131557Stjr      if (patterns == NULL)
282131557Stjr	error (2, errno, _("memory exhausted"));
283131557Stjr      patterns[pcount] = patterns0;
284131557Stjr
285131557Stjr      if ((err = re_compile_pattern (motif, len,
286131557Stjr				    &(patterns[pcount].regexbuf))) != 0)
287131557Stjr	error (2, 0, err);
288131557Stjr      pcount++;
289131557Stjr
290131557Stjr      motif = sep;
291131557Stjr    } while (sep && total != 0);
292131557Stjr
29353451Speter  /* In the match_words and match_lines cases, we use a different pattern
29453451Speter     for the DFA matcher that will quickly throw out cases that won't work.
29553451Speter     Then if DFA succeeds we do some hairy stuff using the regex matcher
29653451Speter     to decide whether the match should really count. */
29753451Speter  if (match_words || match_lines)
29853451Speter    {
29953451Speter      /* In the whole-word case, we use the pattern:
300131557Stjr	 (^|[^[:alnum:]_])(userpattern)([^[:alnum:]_]|$).
30153451Speter	 In the whole-line case, we use the pattern:
302131557Stjr	 ^(userpattern)$.  */
30353451Speter
304131557Stjr      static char const line_beg[] = "^(";
305131557Stjr      static char const line_end[] = ")$";
306131557Stjr      static char const word_beg[] = "(^|[^[:alnum:]_])(";
307131557Stjr      static char const word_end[] = ")([^[:alnum:]_]|$)";
308131564Stjr      char *n = xmalloc (sizeof word_beg - 1 + size + sizeof word_end);
309131557Stjr      size_t i;
310131557Stjr      strcpy (n, match_lines ? line_beg : word_beg);
31153451Speter      i = strlen(n);
312131557Stjr      memcpy (n + i, pattern, size);
31353451Speter      i += size;
314131557Stjr      strcpy (n + i, match_lines ? line_end : word_end);
315131557Stjr      i += strlen (n + i);
316131557Stjr      pattern = n;
317131557Stjr      size = i;
31853451Speter    }
31953451Speter
320131557Stjr  dfacomp (pattern, size, &dfa, 1);
321131557Stjr  kwsmusts ();
32253451Speter}
32353451Speter
324131557Stjrstatic size_t
325146199StjrEGexecute (char const *buf, size_t size, size_t *match_size, int exact)
32653451Speter{
327131557Stjr  register char const *buflim, *beg, *end;
32855379Sobrien  char eol = eolbyte;
329250823Spfg  int backref;
330250823Spfg  ptrdiff_t start, len;
33153451Speter  struct kwsmatch kwsm;
332131564Stjr  size_t i, ret_val;
333146206Stjr  static int use_dfa;
334146206Stjr  static int use_dfa_checked = 0;
335131557Stjr#ifdef MBS_SUPPORT
336146207Stjr  const char *last_char = NULL;
337146205Stjr  int mb_cur_max = MB_CUR_MAX;
338146205Stjr  mbstate_t mbs;
339146205Stjr  memset (&mbs, '\0', sizeof (mbstate_t));
340131557Stjr#endif /* MBS_SUPPORT */
34153451Speter
342146206Stjr  if (!use_dfa_checked)
343146206Stjr    {
344146206Stjr      char *grep_use_dfa = getenv ("GREP_USE_DFA");
345146206Stjr      if (!grep_use_dfa)
346146206Stjr	{
347146206Stjr#ifdef MBS_SUPPORT
348146206Stjr	  /* Turn off DFA when processing multibyte input. */
349146206Stjr	  use_dfa = (MB_CUR_MAX == 1);
350146206Stjr#else
351146206Stjr	  use_dfa = 1;
352146206Stjr#endif /* MBS_SUPPORT */
353146206Stjr	}
354146206Stjr      else
355146206Stjr	{
356146206Stjr	  use_dfa = atoi (grep_use_dfa);
357146206Stjr	}
358146206Stjr
359146206Stjr      use_dfa_checked = 1;
360146206Stjr    }
361146206Stjr
36253451Speter  buflim = buf + size;
36353451Speter
364131557Stjr  for (beg = end = buf; end < buflim; beg = end)
36553451Speter    {
366131557Stjr      if (!exact)
36753451Speter	{
368131557Stjr	  if (kwset)
36953451Speter	    {
370131557Stjr	      /* Find a possible match using the KWset matcher. */
371146205Stjr#ifdef MBS_SUPPORT
372146205Stjr	      size_t bytes_left = 0;
373146205Stjr#endif /* MBS_SUPPORT */
374146205Stjr	      size_t offset;
375146205Stjr#ifdef MBS_SUPPORT
376146205Stjr	      /* kwsexec doesn't work with match_icase and multibyte input. */
377146205Stjr	      if (match_icase && mb_cur_max > 1)
378146205Stjr		/* Avoid kwset */
379146205Stjr		offset = 0;
380146205Stjr	      else
381146205Stjr#endif /* MBS_SUPPORT */
382146205Stjr	      offset = kwsexec (kwset, beg, buflim - beg, &kwsm);
383131557Stjr	      if (offset == (size_t) -1)
384131563Stjr	        goto failure;
385146205Stjr#ifdef MBS_SUPPORT
386146205Stjr	      if (mb_cur_max > 1 && !using_utf8)
387146205Stjr		{
388146205Stjr		  bytes_left = offset;
389146205Stjr		  while (bytes_left)
390146205Stjr		    {
391146205Stjr		      size_t mlen = mbrlen (beg, bytes_left, &mbs);
392146207Stjr
393146207Stjr		      last_char = beg;
394146205Stjr		      if (mlen == (size_t) -1 || mlen == 0)
395146205Stjr			{
396146205Stjr			  /* Incomplete character: treat as single-byte. */
397146205Stjr			  memset (&mbs, '\0', sizeof (mbstate_t));
398146205Stjr			  beg++;
399146205Stjr			  bytes_left--;
400146205Stjr			  continue;
401146205Stjr			}
402146205Stjr
403146205Stjr		      if (mlen == (size_t) -2)
404146205Stjr			/* Offset points inside multibyte character:
405146205Stjr			 * no good. */
406146205Stjr			break;
407146205Stjr
408146205Stjr		      beg += mlen;
409146205Stjr		      bytes_left -= mlen;
410146205Stjr		    }
411146205Stjr		}
412146205Stjr	      else
413146205Stjr#endif /* MBS_SUPPORT */
414131557Stjr	      beg += offset;
415131557Stjr	      /* Narrow down to the line containing the candidate, and
416131557Stjr		 run it through DFA. */
417131557Stjr	      end = memchr(beg, eol, buflim - beg);
418131557Stjr	      end++;
419131557Stjr#ifdef MBS_SUPPORT
420146205Stjr	      if (mb_cur_max > 1 && bytes_left)
421131557Stjr		continue;
422146205Stjr#endif /* MBS_SUPPORT */
423131557Stjr	      while (beg > buf && beg[-1] != eol)
424131557Stjr		--beg;
425146205Stjr	      if (
426146205Stjr#ifdef MBS_SUPPORT
427146205Stjr		  !(match_icase && mb_cur_max > 1) &&
428146205Stjr#endif /* MBS_SUPPORT */
429146205Stjr		  (kwsm.index < kwset_exact_matches))
430131563Stjr		goto success_in_beg_and_end;
431146206Stjr	      if (use_dfa &&
432146206Stjr		  dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1)
433131557Stjr		continue;
43453451Speter	    }
435131557Stjr	  else
436131557Stjr	    {
437131557Stjr	      /* No good fixed strings; start with DFA. */
438146205Stjr#ifdef MBS_SUPPORT
439146205Stjr	      size_t bytes_left = 0;
440146205Stjr#endif /* MBS_SUPPORT */
441146206Stjr	      size_t offset = 0;
442146206Stjr	      if (use_dfa)
443146206Stjr		offset = dfaexec (&dfa, beg, buflim - beg, &backref);
444131557Stjr	      if (offset == (size_t) -1)
445131557Stjr		break;
446131557Stjr	      /* Narrow down to the line we've found. */
447146205Stjr#ifdef MBS_SUPPORT
448146205Stjr	      if (mb_cur_max > 1 && !using_utf8)
449146205Stjr		{
450146205Stjr		  bytes_left = offset;
451146205Stjr		  while (bytes_left)
452146205Stjr		    {
453146205Stjr		      size_t mlen = mbrlen (beg, bytes_left, &mbs);
454146207Stjr
455146207Stjr		      last_char = beg;
456146205Stjr		      if (mlen == (size_t) -1 || mlen == 0)
457146205Stjr			{
458146205Stjr			  /* Incomplete character: treat as single-byte. */
459146205Stjr			  memset (&mbs, '\0', sizeof (mbstate_t));
460146205Stjr			  beg++;
461146205Stjr			  bytes_left--;
462146205Stjr			  continue;
463146205Stjr			}
464146205Stjr
465146205Stjr		      if (mlen == (size_t) -2)
466146205Stjr			/* Offset points inside multibyte character:
467146205Stjr			 * no good. */
468146205Stjr			break;
469146205Stjr
470146205Stjr		      beg += mlen;
471146205Stjr		      bytes_left -= mlen;
472146205Stjr		    }
473146205Stjr		}
474146205Stjr	      else
475146205Stjr#endif /* MBS_SUPPORT */
476131557Stjr	      beg += offset;
477131557Stjr	      end = memchr (beg, eol, buflim - beg);
478131557Stjr	      end++;
479146205Stjr#ifdef MBS_SUPPORT
480146205Stjr	      if (mb_cur_max > 1 && bytes_left)
481146205Stjr		continue;
482146205Stjr#endif /* MBS_SUPPORT */
483131557Stjr	      while (beg > buf && beg[-1] != eol)
484131557Stjr		--beg;
485131557Stjr	    }
486131557Stjr	  /* Successful, no backreferences encountered! */
487146206Stjr	  if (use_dfa && !backref)
488131563Stjr	    goto success_in_beg_and_end;
48953451Speter	}
49053451Speter      else
491131557Stjr	end = beg + size;
492131557Stjr
49353451Speter      /* If we've made it to this point, this means DFA has seen
49453451Speter	 a probable match, and we need to run it through Regex. */
495131557Stjr      for (i = 0; i < pcount; i++)
49653451Speter	{
497131557Stjr	  patterns[i].regexbuf.not_eol = 0;
498131557Stjr	  if (0 <= (start = re_search (&(patterns[i].regexbuf), beg,
499131557Stjr				       end - beg - 1, 0,
500131557Stjr				       end - beg - 1, &(patterns[i].regs))))
501131557Stjr	    {
502131557Stjr	      len = patterns[i].regs.end[0] - start;
503131563Stjr	      if (exact && !match_words)
504131563Stjr	        goto success_in_start_and_len;
505131557Stjr	      if ((!match_lines && !match_words)
506131557Stjr		  || (match_lines && len == end - beg - 1))
507131563Stjr		goto success_in_beg_and_end;
508131557Stjr	      /* If -w, check if the match aligns with word boundaries.
509131557Stjr		 We do this iteratively because:
510131557Stjr		 (a) the line may contain more than one occurence of the
511131557Stjr		 pattern, and
512131557Stjr		 (b) Several alternatives in the pattern might be valid at a
513131557Stjr		 given point, and we may need to consider a shorter one to
514131557Stjr		 find a word boundary.  */
515131557Stjr	      if (match_words)
516131557Stjr		while (start >= 0)
51753451Speter		  {
518146207Stjr		    int lword_match = 0;
519146207Stjr		    if (start == 0)
520146207Stjr		      lword_match = 1;
521146207Stjr		    else
522146207Stjr		      {
523146207Stjr			assert (start > 0);
524146207Stjr#ifdef MBS_SUPPORT
525146207Stjr			if (mb_cur_max > 1)
526146207Stjr			  {
527146207Stjr			    const char *s;
528155829Stjr			    size_t mr;
529146207Stjr			    wchar_t pwc;
530146207Stjr
531155829Stjr			    /* Locate the start of the multibyte character
532155829Stjr			       before the match position (== beg + start).  */
533146207Stjr			    if (using_utf8)
534146207Stjr			      {
535155829Stjr				/* UTF-8 is a special case: scan backwards
536155829Stjr				   until we find a 7-bit character or a
537155829Stjr				   lead byte.  */
538146207Stjr				s = beg + start - 1;
539146207Stjr				while (s > buf
540146207Stjr				       && (unsigned char) *s >= 0x80
541146207Stjr				       && (unsigned char) *s <= 0xbf)
542146207Stjr				  --s;
543146207Stjr			      }
544146207Stjr			    else
545146207Stjr			      {
546155829Stjr				/* Scan forwards to find the start of the
547155829Stjr				   last complete character before the
548155829Stjr				   match position.  */
549155829Stjr				size_t bytes_left = start - 1;
550155829Stjr				s = beg;
551155829Stjr				while (bytes_left > 0)
552155829Stjr				  {
553155829Stjr				    mr = mbrlen (s, bytes_left, &mbs);
554155829Stjr				    if (mr == (size_t) -1 || mr == 0)
555155829Stjr				      {
556155829Stjr					memset (&mbs, '\0', sizeof (mbs));
557155829Stjr					s++;
558155829Stjr					bytes_left--;
559155829Stjr					continue;
560155829Stjr				      }
561155829Stjr				    if (mr == (size_t) -2)
562155829Stjr				      {
563155829Stjr					memset (&mbs, '\0', sizeof (mbs));
564155829Stjr					break;
565155829Stjr				      }
566155829Stjr				    s += mr;
567155829Stjr				    bytes_left -= mr;
568155829Stjr				  }
569155829Stjr			      }
570155829Stjr			    mr = mbrtowc (&pwc, s, beg + start - s, &mbs);
571155829Stjr			    if (mr == (size_t) -2 || mr == (size_t) -1 ||
572155829Stjr				mr == 0)
573155829Stjr			      {
574146207Stjr				memset (&mbs, '\0', sizeof (mbstate_t));
575146207Stjr				lword_match = 1;
576146207Stjr			      }
577146207Stjr			    else if (!(iswalnum (pwc) || pwc == L'_')
578155829Stjr				     && mr == beg + start - s)
579146207Stjr			      lword_match = 1;
580146207Stjr			  }
581146207Stjr			else
582146207Stjr#endif /* MBS_SUPPORT */
583146207Stjr			if (!WCHAR ((unsigned char) beg[start - 1]))
584146207Stjr			  lword_match = 1;
585146207Stjr		      }
586146207Stjr
587146207Stjr		    if (lword_match)
588146207Stjr		      {
589146207Stjr			int rword_match = 0;
590146207Stjr			if (start + len == end - beg - 1)
591146207Stjr			  rword_match = 1;
592146207Stjr			else
593146207Stjr			  {
594146207Stjr#ifdef MBS_SUPPORT
595146207Stjr			    if (mb_cur_max > 1)
596146207Stjr			      {
597146207Stjr				wchar_t nwc;
598146207Stjr				int mr;
599146207Stjr
600146207Stjr				mr = mbtowc (&nwc, beg + start + len,
601146207Stjr					     end - beg - start - len - 1);
602146207Stjr				if (mr <= 0)
603146207Stjr				  {
604146207Stjr				    memset (&mbs, '\0', sizeof (mbstate_t));
605146207Stjr				    rword_match = 1;
606146207Stjr				  }
607146207Stjr				else if (!iswalnum (nwc) && nwc != L'_')
608146207Stjr				  rword_match = 1;
609146207Stjr			      }
610146207Stjr			    else
611146207Stjr#endif /* MBS_SUPPORT */
612146207Stjr			    if (!WCHAR ((unsigned char) beg[start + len]))
613146207Stjr			      rword_match = 1;
614146207Stjr			  }
615146207Stjr
616146207Stjr			if (rword_match)
617146207Stjr			  {
618146207Stjr			    if (!exact)
619146207Stjr			      /* Returns the whole line. */
620146207Stjr			      goto success_in_beg_and_end;
621146207Stjr			    else
622146207Stjr			      /* Returns just this word match. */
623146207Stjr			      goto success_in_start_and_len;
624146207Stjr			  }
625146207Stjr		      }
626131557Stjr		    if (len > 0)
627131557Stjr		      {
628131557Stjr			/* Try a shorter length anchored at the same place. */
629131557Stjr			--len;
630131557Stjr			patterns[i].regexbuf.not_eol = 1;
631131557Stjr			len = re_match (&(patterns[i].regexbuf), beg,
632131557Stjr					start + len, start,
633131557Stjr					&(patterns[i].regs));
634131557Stjr		      }
635131557Stjr		    if (len <= 0)
636131557Stjr		      {
637131557Stjr			/* Try looking further on. */
638131557Stjr			if (start == end - beg - 1)
639131557Stjr			  break;
640131557Stjr			++start;
641131557Stjr			patterns[i].regexbuf.not_eol = 0;
642131557Stjr			start = re_search (&(patterns[i].regexbuf), beg,
643131557Stjr					   end - beg - 1,
644131557Stjr					   start, end - beg - 1 - start,
645131557Stjr					   &(patterns[i].regs));
646131557Stjr			len = patterns[i].regs.end[0] - start;
647131557Stjr		      }
64853451Speter		  }
649131557Stjr	    }
650131557Stjr	} /* for Regex patterns.  */
651131557Stjr    } /* for (beg = end ..) */
652131563Stjr
653131563Stjr failure:
654131557Stjr  return (size_t) -1;
65553451Speter
656131563Stjr success_in_beg_and_end:
657131563Stjr  len = end - beg;
658131563Stjr  start = beg - buf;
659131563Stjr  /* FALLTHROUGH */
660131563Stjr
661131563Stjr success_in_start_and_len:
662131563Stjr  *match_size = len;
663131563Stjr  return start;
66453451Speter}
66553451Speter
#ifdef MBS_SUPPORT
/* Nonzero once Fcompile has set up the multibyte case-insensitive
   matcher used for -F -i.  */
static int f_i_multibyte;

/* State of that matcher, filled in by Fcompile and read by Fimbexec.  */
static struct
{
  /* The fixed-string patterns as lower-cased, null-terminated wide
     strings.  */
  wchar_t **patterns;
  /* Number of entries in PATTERNS.  */
  size_t count;
  /* Length, in wide characters, of the longest pattern.  */
  size_t maxlen;
  /* COUNT bytes of scratch space; Fimbexec keeps one "still a
     candidate" flag per pattern here.  */
  unsigned char *match;
} Fimb;
#endif
675146205Stjr
67653451Speterstatic void
677131557StjrFcompile (char const *pattern, size_t size)
67853451Speter{
679146205Stjr  int mb_cur_max = MB_CUR_MAX;
680131557Stjr  char const *beg, *lim, *err;
68153451Speter
682146205Stjr  check_utf8 ();
683146205Stjr#ifdef MBS_SUPPORT
684146205Stjr  /* Support -F -i for UTF-8 input. */
685146205Stjr  if (match_icase && mb_cur_max > 1)
686146205Stjr    {
687146205Stjr      mbstate_t mbs;
688146205Stjr      wchar_t *wcpattern = xmalloc ((size + 1) * sizeof (wchar_t));
689146205Stjr      const char *patternend = pattern;
690146205Stjr      size_t wcsize;
691146205Stjr      kwset_t fimb_kwset = NULL;
692146205Stjr      char *starts = NULL;
693146205Stjr      wchar_t *wcbeg, *wclim;
694146205Stjr      size_t allocated = 0;
695146205Stjr
696146205Stjr      memset (&mbs, '\0', sizeof (mbs));
697146205Stjr# ifdef __GNU_LIBRARY__
698146205Stjr      wcsize = mbsnrtowcs (wcpattern, &patternend, size, size, &mbs);
699146205Stjr      if (patternend != pattern + size)
700146205Stjr	wcsize = (size_t) -1;
701146205Stjr# else
702146205Stjr      {
703146205Stjr	char *patterncopy = xmalloc (size + 1);
704146205Stjr
705146205Stjr	memcpy (patterncopy, pattern, size);
706146205Stjr	patterncopy[size] = '\0';
707146205Stjr	patternend = patterncopy;
708146205Stjr	wcsize = mbsrtowcs (wcpattern, &patternend, size, &mbs);
709146205Stjr	if (patternend != patterncopy + size)
710146205Stjr	  wcsize = (size_t) -1;
711146205Stjr	free (patterncopy);
712146205Stjr      }
713146205Stjr# endif
714146205Stjr      if (wcsize + 2 <= 2)
715146205Stjr	{
716146205Stjrfimb_fail:
717146205Stjr	  free (wcpattern);
718146205Stjr	  free (starts);
719146205Stjr	  if (fimb_kwset)
720146205Stjr	    kwsfree (fimb_kwset);
721146205Stjr	  free (Fimb.patterns);
722146205Stjr	  Fimb.patterns = NULL;
723146205Stjr	}
724146205Stjr      else
725146205Stjr	{
726146205Stjr	  if (!(fimb_kwset = kwsalloc (NULL)))
727146205Stjr	    error (2, 0, _("memory exhausted"));
728146205Stjr
729146205Stjr	  starts = xmalloc (mb_cur_max * 3);
730146205Stjr	  wcbeg = wcpattern;
731146205Stjr	  do
732146205Stjr	    {
733146205Stjr	      int i;
734146205Stjr	      size_t wclen;
735146205Stjr
736146205Stjr	      if (Fimb.count >= allocated)
737146205Stjr		{
738146205Stjr		  if (allocated == 0)
739146205Stjr		    allocated = 128;
740146205Stjr		  else
741146205Stjr		    allocated *= 2;
742146205Stjr		  Fimb.patterns = xrealloc (Fimb.patterns,
743146205Stjr					    sizeof (wchar_t *) * allocated);
744146205Stjr		}
745146205Stjr	      Fimb.patterns[Fimb.count++] = wcbeg;
746146205Stjr	      for (wclim = wcbeg;
747146205Stjr		   wclim < wcpattern + wcsize && *wclim != L'\n'; ++wclim)
748146205Stjr		*wclim = towlower (*wclim);
749146205Stjr	      *wclim = L'\0';
750146205Stjr	      wclen = wclim - wcbeg;
751146205Stjr	      if (wclen > Fimb.maxlen)
752146205Stjr		Fimb.maxlen = wclen;
753146205Stjr	      if (wclen > 3)
754146205Stjr		wclen = 3;
755146205Stjr	      if (wclen == 0)
756146205Stjr		{
757146205Stjr		  if ((err = kwsincr (fimb_kwset, "", 0)) != 0)
758146205Stjr		    error (2, 0, err);
759146205Stjr		}
760146205Stjr	      else
761146205Stjr		for (i = 0; i < (1 << wclen); i++)
762146205Stjr		  {
763146205Stjr		    char *p = starts;
764146205Stjr		    int j, k;
765146205Stjr
766146205Stjr		    for (j = 0; j < wclen; ++j)
767146205Stjr		      {
768146205Stjr			wchar_t wc = wcbeg[j];
769146205Stjr			if (i & (1 << j))
770146205Stjr			  {
771146205Stjr			    wc = towupper (wc);
772146205Stjr			    if (wc == wcbeg[j])
773146205Stjr			      continue;
774146205Stjr			  }
775146205Stjr			k = wctomb (p, wc);
776146205Stjr			if (k <= 0)
777146205Stjr			  goto fimb_fail;
778146205Stjr			p += k;
779146205Stjr		      }
780146205Stjr		    if ((err = kwsincr (fimb_kwset, starts, p - starts)) != 0)
781146205Stjr		      error (2, 0, err);
782146205Stjr		  }
783146205Stjr	      if (wclim < wcpattern + wcsize)
784146205Stjr		++wclim;
785146205Stjr	      wcbeg = wclim;
786146205Stjr	    }
787146205Stjr	  while (wcbeg < wcpattern + wcsize);
788146205Stjr	  f_i_multibyte = 1;
789146205Stjr	  kwset = fimb_kwset;
790146205Stjr	  free (starts);
791146205Stjr	  Fimb.match = xmalloc (Fimb.count);
792146205Stjr	  if ((err = kwsprep (kwset)) != 0)
793146205Stjr	    error (2, 0, err);
794146205Stjr	  return;
795146205Stjr	}
796146205Stjr    }
797146205Stjr#endif /* MBS_SUPPORT */
798146205Stjr
799146205Stjr
800131557Stjr  kwsinit ();
80153451Speter  beg = pattern;
80253451Speter  do
80353451Speter    {
80453451Speter      for (lim = beg; lim < pattern + size && *lim != '\n'; ++lim)
80553451Speter	;
806131557Stjr      if ((err = kwsincr (kwset, beg, lim - beg)) != 0)
807131557Stjr	error (2, 0, err);
80853451Speter      if (lim < pattern + size)
80953451Speter	++lim;
81053451Speter      beg = lim;
81153451Speter    }
81253451Speter  while (beg < pattern + size);
81353451Speter
814131557Stjr  if ((err = kwsprep (kwset)) != 0)
815131557Stjr    error (2, 0, err);
81653451Speter}
81753451Speter
#ifdef MBS_SUPPORT
/* Check whether any pattern in Fimb.patterns matches, ignoring case,
   at the very start of BUF (SIZE bytes).

   BUF is decoded one multibyte character at a time; each character is
   lower-cased and compared against the corresponding character of
   every pattern that is still a candidate.  When a pattern's
   terminator is reached, *PLEN is set to the number of bytes of BUF
   consumed so far.  Without EXACT and without match_words the first
   such pattern ends the search; otherwise the scan continues so that
   *PLEN ends up holding the length of the longest match.

   Returns 0 if some pattern matched and -1 otherwise.  Decoding stops,
   returning whatever has been found so far, at a null character or at
   an invalid or incomplete sequence in BUF.  */
static int
Fimbexec (const char *buf, size_t size, size_t *plen, int exact)
{
  size_t pos = 0;		/* bytes of BUF consumed */
  size_t nth = 0;		/* index of the current character */
  int found = -1;
  int alive = 1;		/* some pattern is still a candidate */
  mbstate_t state;

  assert (match_icase && f_i_multibyte == 1);
  assert (MB_CUR_MAX > 1);

  memset (&state, '\0', sizeof (state));
  /* Every pattern starts out as a candidate.  */
  memset (Fimb.match, '\1', Fimb.count);

  for (; alive && pos <= size; nth++)
    {
      /* At the end of BUF a null character is compared, which lets
	 patterns that end exactly there be recognized.  */
      wchar_t wc = L'\0';
      size_t nbytes = 1;
      size_t k;

      alive = 0;
      if (pos < size)
	{
	  nbytes = mbrtowc (&wc, buf + pos, size - pos, &state);
	  if (nbytes == 0
	      || nbytes == (size_t) -1 || nbytes == (size_t) -2)
	    return found;

	  wc = towlower (wc);
	}

      for (k = 0; k < Fimb.count; k++)
	{
	  wchar_t expected;

	  if (!Fimb.match[k])
	    continue;

	  expected = Fimb.patterns[k][nth];
	  if (expected == L'\0')
	    {
	      /* Found a match. */
	      *plen = pos;
	      if (!exact && !match_words)
		return 0;

	      /* For -w or exact look for longest match.  */
	      found = 0;
	      Fimb.match[k] = '\0';
	    }
	  else if (expected == wc)
	    alive = 1;
	  else
	    Fimb.match[k] = '\0';
	}

      pos += nbytes;
    }

  return found;
}
#endif /* MBS_SUPPORT */
887146205Stjr
888131557Stjrstatic size_t
889146199StjrFexecute (char const *buf, size_t size, size_t *match_size, int exact)
89053451Speter{
891131557Stjr  register char const *beg, *try, *end;
89253451Speter  register size_t len;
89355379Sobrien  char eol = eolbyte;
89453451Speter  struct kwsmatch kwsmatch;
895131564Stjr  size_t ret_val;
896131557Stjr#ifdef MBS_SUPPORT
897146205Stjr  int mb_cur_max = MB_CUR_MAX;
898146205Stjr  mbstate_t mbs;
899146205Stjr  memset (&mbs, '\0', sizeof (mbstate_t));
900146205Stjr  const char *last_char = NULL;
901131557Stjr#endif /* MBS_SUPPORT */
90253451Speter
90353451Speter  for (beg = buf; beg <= buf + size; ++beg)
90453451Speter    {
905146205Stjr      size_t offset;
906146205Stjr      offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch);
907146205Stjr
908131557Stjr      if (offset == (size_t) -1)
909131563Stjr	goto failure;
910131557Stjr#ifdef MBS_SUPPORT
911146205Stjr      if (mb_cur_max > 1 && !using_utf8)
912146205Stjr	{
913146205Stjr	  size_t bytes_left = offset;
914146205Stjr	  while (bytes_left)
915146205Stjr	    {
916146205Stjr	      size_t mlen = mbrlen (beg, bytes_left, &mbs);
917146205Stjr
918146205Stjr	      last_char = beg;
919146205Stjr	      if (mlen == (size_t) -1 || mlen == 0)
920146205Stjr		{
921146205Stjr		  /* Incomplete character: treat as single-byte. */
922146205Stjr		  memset (&mbs, '\0', sizeof (mbstate_t));
923146205Stjr		  beg++;
924146205Stjr		  bytes_left--;
925146205Stjr		  continue;
926146205Stjr		}
927146205Stjr
928146205Stjr	      if (mlen == (size_t) -2)
929146205Stjr		/* Offset points inside multibyte character: no good. */
930146205Stjr		break;
931146205Stjr
932146205Stjr	      beg += mlen;
933146205Stjr	      bytes_left -= mlen;
934146205Stjr	    }
935146205Stjr
936146205Stjr	  if (bytes_left)
937146205Stjr	    continue;
938146205Stjr	}
939146205Stjr      else
940131557Stjr#endif /* MBS_SUPPORT */
941131557Stjr      beg += offset;
942146205Stjr#ifdef MBS_SUPPORT
943146205Stjr      /* For f_i_multibyte, the string at beg now matches first 3 chars of
944146205Stjr	 one of the search strings (less if there are shorter search strings).
945146205Stjr	 See if this is a real match.  */
946146205Stjr      if (f_i_multibyte
947146205Stjr	  && Fimbexec (beg, buf + size - beg, &kwsmatch.size[0], exact))
948146205Stjr	goto next_char;
949146205Stjr#endif /* MBS_SUPPORT */
95053451Speter      len = kwsmatch.size[0];
951131563Stjr      if (exact && !match_words)
952131563Stjr	goto success_in_beg_and_len;
95353451Speter      if (match_lines)
95453451Speter	{
95555379Sobrien	  if (beg > buf && beg[-1] != eol)
956146205Stjr	    goto next_char;
95755379Sobrien	  if (beg + len < buf + size && beg[len] != eol)
958146205Stjr	    goto next_char;
95953451Speter	  goto success;
96053451Speter	}
96153451Speter      else if (match_words)
962146205Stjr	{
963146205Stjr	  while (1)
964146205Stjr	    {
965146205Stjr	      int word_match = 0;
966146205Stjr	      if (beg > buf)
967146205Stjr		{
968146201Stjr#ifdef MBS_SUPPORT
969146205Stjr		  if (mb_cur_max > 1)
970146205Stjr		    {
971146205Stjr		      const char *s;
972146205Stjr		      int mr;
973146205Stjr		      wchar_t pwc;
974146205Stjr
975146205Stjr		      if (using_utf8)
976146205Stjr			{
977146205Stjr			  s = beg - 1;
978146205Stjr			  while (s > buf
979146205Stjr				 && (unsigned char) *s >= 0x80
980146205Stjr				 && (unsigned char) *s <= 0xbf)
981146205Stjr			    --s;
982146205Stjr			}
983146205Stjr		      else
984146205Stjr			s = last_char;
985146205Stjr		      mr = mbtowc (&pwc, s, beg - s);
986146205Stjr		      if (mr <= 0)
987146205Stjr			memset (&mbs, '\0', sizeof (mbstate_t));
988146205Stjr		      else if ((iswalnum (pwc) || pwc == L'_')
989146205Stjr			       && mr == (int) (beg - s))
990146205Stjr			goto next_char;
991146205Stjr		    }
992146205Stjr		  else
993146201Stjr#endif /* MBS_SUPPORT */
994151647Stjr		  if (WCHAR ((unsigned char) beg[-1]))
995146205Stjr		    goto next_char;
996146205Stjr		}
997146205Stjr#ifdef MBS_SUPPORT
998146205Stjr	      if (mb_cur_max > 1)
999146205Stjr		{
1000146205Stjr		  wchar_t nwc;
1001146205Stjr		  int mr;
1002146205Stjr
1003146205Stjr		  mr = mbtowc (&nwc, beg + len, buf + size - beg - len);
1004146205Stjr		  if (mr <= 0)
1005146205Stjr		    {
1006146205Stjr		      memset (&mbs, '\0', sizeof (mbstate_t));
1007146205Stjr		      word_match = 1;
1008146205Stjr		    }
1009146205Stjr		  else if (!iswalnum (nwc) && nwc != L'_')
1010146205Stjr		    word_match = 1;
1011146205Stjr		}
1012146205Stjr	      else
1013146205Stjr#endif /* MBS_SUPPORT */
1014151647Stjr		if (beg + len >= buf + size || !WCHAR ((unsigned char) beg[len]))
1015146205Stjr		  word_match = 1;
1016146205Stjr	      if (word_match)
1017146205Stjr		{
1018146205Stjr		  if (!exact)
1019146205Stjr		    /* Returns the whole line now we know there's a word match. */
1020146205Stjr		    goto success;
1021146205Stjr		  else
1022146205Stjr		    /* Returns just this word match. */
1023146205Stjr		    goto success_in_beg_and_len;
1024146205Stjr		}
1025146205Stjr	      if (len > 0)
1026146205Stjr		{
1027146205Stjr		  /* Try a shorter length anchored at the same place. */
1028146205Stjr		  --len;
1029146205Stjr		  offset = kwsexec (kwset, beg, len, &kwsmatch);
1030146205Stjr
1031146205Stjr		  if (offset == -1)
1032146205Stjr		    goto next_char; /* Try a different anchor. */
1033146205Stjr#ifdef MBS_SUPPORT
1034146205Stjr		  if (mb_cur_max > 1 && !using_utf8)
1035146205Stjr		    {
1036146205Stjr		      size_t bytes_left = offset;
1037146205Stjr		      while (bytes_left)
1038146205Stjr			{
1039146205Stjr			  size_t mlen = mbrlen (beg, bytes_left, &mbs);
1040146205Stjr
1041146205Stjr			  last_char = beg;
1042146205Stjr			  if (mlen == (size_t) -1 || mlen == 0)
1043146205Stjr			    {
1044146205Stjr			      /* Incomplete character: treat as single-byte. */
1045146205Stjr			      memset (&mbs, '\0', sizeof (mbstate_t));
1046146205Stjr			      beg++;
1047146205Stjr			      bytes_left--;
1048146205Stjr			      continue;
1049146205Stjr			    }
1050146205Stjr
1051146205Stjr			  if (mlen == (size_t) -2)
1052146205Stjr			    {
1053146205Stjr			      /* Offset points inside multibyte character:
1054146205Stjr			       * no good. */
1055146205Stjr			      break;
1056146205Stjr			    }
1057146205Stjr
1058146205Stjr			  beg += mlen;
1059146205Stjr			  bytes_left -= mlen;
1060146205Stjr			}
1061146205Stjr
1062146205Stjr		      if (bytes_left)
1063146205Stjr			{
1064146205Stjr			  memset (&mbs, '\0', sizeof (mbstate_t));
1065146205Stjr			  goto next_char; /* Try a different anchor. */
1066146205Stjr			}
1067146205Stjr		    }
1068146205Stjr		  else
1069146205Stjr#endif /* MBS_SUPPORT */
1070146205Stjr		  beg += offset;
1071146205Stjr#ifdef MBS_SUPPORT
1072146205Stjr		  /* The string at beg now matches first 3 chars of one of
1073146205Stjr		     the search strings (less if there are shorter search
1074146205Stjr		     strings).  See if this is a real match.  */
1075146205Stjr		  if (f_i_multibyte
1076146205Stjr		      && Fimbexec (beg, len - offset, &kwsmatch.size[0],
1077146205Stjr				   exact))
1078146205Stjr		    goto next_char;
1079146205Stjr#endif /* MBS_SUPPORT */
1080146205Stjr		  len = kwsmatch.size[0];
1081146205Stjr		}
1082146205Stjr	    }
1083146205Stjr	}
108453451Speter      else
108553451Speter	goto success;
1086146205Stjrnext_char:;
1087146205Stjr#ifdef MBS_SUPPORT
1088146205Stjr      /* Advance to next character.  For MB_CUR_MAX == 1 case this is handled
1089146205Stjr	 by ++beg above.  */
1090146205Stjr      if (mb_cur_max > 1)
1091146205Stjr	{
1092146205Stjr	  if (using_utf8)
1093146205Stjr	    {
1094146205Stjr	      unsigned char c = *beg;
1095146205Stjr	      if (c >= 0xc2)
1096146205Stjr		{
1097146205Stjr		  if (c < 0xe0)
1098146205Stjr		    ++beg;
1099146205Stjr		  else if (c < 0xf0)
1100146205Stjr		    beg += 2;
1101146205Stjr		  else if (c < 0xf8)
1102146205Stjr		    beg += 3;
1103146205Stjr		  else if (c < 0xfc)
1104146205Stjr		    beg += 4;
1105146205Stjr		  else if (c < 0xfe)
1106146205Stjr		    beg += 5;
1107146205Stjr		}
1108146205Stjr	    }
1109146205Stjr	  else
1110146205Stjr	    {
1111146205Stjr	      size_t l = mbrlen (beg, buf + size - beg, &mbs);
1112146205Stjr
1113146205Stjr	      last_char = beg;
1114146205Stjr	      if (l + 2 >= 2)
1115146205Stjr		beg += l - 1;
1116146205Stjr	      else
1117146205Stjr		memset (&mbs, '\0', sizeof (mbstate_t));
1118146205Stjr	    }
1119146205Stjr	}
1120146205Stjr#endif /* MBS_SUPPORT */
112153451Speter    }
112253451Speter
1123131563Stjr failure:
1124146205Stjr  return -1;
1125146205Stjr
1126146205Stjr success:
1127131557Stjr#ifdef MBS_SUPPORT
1128146205Stjr  if (mb_cur_max > 1 && !using_utf8)
1129131564Stjr    {
1130146205Stjr      end = beg + len;
1131146205Stjr      while (end < buf + size)
1132146205Stjr	{
1133146205Stjr	  size_t mlen = mbrlen (end, buf + size - end, &mbs);
1134146205Stjr	  if (mlen == (size_t) -1 || mlen == (size_t) -2 || mlen == 0)
1135146205Stjr	    {
1136146205Stjr	      memset (&mbs, '\0', sizeof (mbstate_t));
1137146205Stjr	      mlen = 1;
1138146205Stjr	    }
1139146205Stjr	  if (mlen == 1 && *end == eol)
1140146205Stjr	    break;
1141146205Stjr
1142146205Stjr	  end += mlen;
1143146205Stjr	}
1144131564Stjr    }
1145146205Stjr  else
1146131557Stjr#endif /* MBS_SUPPORT */
1147146205Stjr  end = memchr (beg + len, eol, (buf + size) - (beg + len));
114853451Speter
1149131557Stjr  end++;
1150131557Stjr  while (buf < beg && beg[-1] != eol)
115153451Speter    --beg;
1152131563Stjr  len = end - beg;
1153131563Stjr  /* FALLTHROUGH */
1154131563Stjr
1155131563Stjr success_in_beg_and_len:
1156131563Stjr  *match_size = len;
1157131557Stjr  return beg - buf;
115853451Speter}
1159131557Stjr
#if HAVE_LIBPCRE
/* Compiled internal form of a Perl regular expression.  */
static pcre *cre;

/* Additional information about the pattern.  */
static pcre_extra *extra;
#endif

/* Compile PATTERN (SIZE bytes, possibly containing NUL bytes) as a
   Perl regular expression and leave the result in CRE and EXTRA for
   Pexecute.

   The pattern is wrapped in "^(" ... ")$" for -x (match_lines) or in
   "\b(" ... ")\b" for -w (match_words).  When both are set, -x takes
   precedence, so the opening and closing halves always belong to the
   same wrapper.

   Any failure is reported through error () with status 2.  Without
   PCRE support the function only reports that -P is unavailable.  */
static void
Pcompile (char const *pattern, size_t size)
{
#if !HAVE_LIBPCRE
  error (2, 0, _("The -P option is not supported"));
#else
  int e;
  char const *ep;
  /* Worst case: every pattern byte is a NUL expanding to four
     characters, plus a three-character prefix, a three-character
     suffix and the terminating NUL.  */
  char *re = xmalloc (4 * size + 7);
  int flags = PCRE_MULTILINE | (match_icase ? PCRE_CASELESS : 0);
  char const *patlim = pattern + size;
  char *n = re;
  char const *p;
  char const *pnul;

  /* FIXME: Remove this restriction.  */
  if (eolbyte != '\n')
    error (2, 0, _("The -P and -z options cannot be combined"));

  *n = '\0';
  if (match_lines)
    strcpy (n, "^(");
  else if (match_words)
    strcpy (n, "\\b(");
  n += strlen (n);

  /* The PCRE interface doesn't allow NUL bytes in the pattern, so
     replace each NUL byte in the pattern with the four characters
     "\000", removing a preceding backslash if there are an odd
     number of backslashes before the NUL.

     FIXME: This method does not work with some multibyte character
     encodings, notably Shift-JIS, where a multibyte character can end
     in a backslash byte.  */
  for (p = pattern; (pnul = memchr (p, '\0', patlim - p)); p = pnul + 1)
    {
      memcpy (n, p, pnul - p);
      n += pnul - p;
      /* Move P back over the run of backslashes that precedes the NUL.  */
      for (p = pnul; pattern < p && p[-1] == '\\'; p--)
        continue;
      /* An odd run means the last backslash escapes the NUL; drop it.  */
      n -= (pnul - p) & 1;
      strcpy (n, "\\000");
      n += 4;
    }

  /* Copy whatever follows the last NUL (or the entire pattern).  */
  memcpy (n, p, patlim - p);
  n += patlim - p;
  *n = '\0';
  if (match_lines)
    strcpy (n, ")$");
  else if (match_words)
    strcpy (n, ")\\b");

  cre = pcre_compile (re, flags, &ep, &e, pcre_maketables ());
  if (!cre)
    /* EP is a message from the library: never use it as the format.  */
    error (2, 0, "%s", ep);

  extra = pcre_study (cre, 0, &ep);
  if (ep)
    error (2, 0, "%s", ep);

  free (re);
#endif
}
1232131557Stjr
/* Run the expression compiled by Pcompile over BUF (SIZE bytes).

   Return the offset from BUF of the reported region and store its
   length in *MATCH_SIZE, or return (size_t) -1 if there is no match.
   When EXACT is nonzero the region is the matched text itself;
   otherwise it is widened to the whole line containing the match,
   including the line terminator if one follows.

   Without PCRE support this function aborts.  */
static size_t
Pexecute (char const *buf, size_t size, size_t *match_size, int exact)
{
#if !HAVE_LIBPCRE
  abort ();
  return -1;
#else
  /* This array must have at least two elements; everything after that
     is just for performance improvement in pcre_exec.  */
  int sub[300];

  int e = pcre_exec (cre, extra, buf, size, 0, 0,
                     sub, sizeof sub / sizeof *sub);

  /* NOTE(review): a return of 0 is treated as a fatal error below;
     check the pcre_exec documentation for what 0 signifies.  */
  if (e <= 0)
    {
      switch (e)
        {
        case PCRE_ERROR_NOMATCH:
          return -1;

        case PCRE_ERROR_NOMEMORY:
          /* error () is presumably fatal with a nonzero status, so the
             fall-through below should be unreachable.  */
          error (2, 0, _("Memory exhausted"));
          /* FALLTHROUGH */

        default:
          abort ();
        }
    }
  else
    {
      /* Narrow down to the line we've found.  */
      char const *beg = buf + sub[0];
      char const *end = buf + sub[1];
      char const *buflim = buf + size;
      char eol = eolbyte;
      if (!exact)
        {
          /* The match may already reach the end of the buffer (for
             instance when the pattern consumed the final terminator);
             memchr then finds nothing, so stop at the buffer limit
             instead of incrementing a null pointer.  */
          char const *term = memchr (end, eol, buflim - end);

          end = term ? term + 1 : buflim;
          while (buf < beg && beg[-1] != eol)
            --beg;
        }

      *match_size = end - beg;
      return beg - buf;
    }
#endif
}
1281131557Stjr
1282131557Stjrstruct matcher const matchers[] = {
1283131557Stjr  { "default", Gcompile, EGexecute },
1284131557Stjr  { "grep", Gcompile, EGexecute },
1285131557Stjr  { "egrep", Ecompile, EGexecute },
1286131557Stjr  { "awk", Ecompile, EGexecute },
1287131557Stjr  { "fgrep", Fcompile, Fexecute },
1288131557Stjr  { "perl", Pcompile, Pexecute },
1289131557Stjr  { "", 0, 0 },
1290131557Stjr};
1291