153451Speter/* search.c - searching subroutines using dfa, kwset and regex for grep. 2126435Sache Copyright 1992, 1998, 2000 Free Software Foundation, Inc. 353451Speter 453451Speter This program is free software; you can redistribute it and/or modify 553451Speter it under the terms of the GNU General Public License as published by 653451Speter the Free Software Foundation; either version 2, or (at your option) 753451Speter any later version. 853451Speter 953451Speter This program is distributed in the hope that it will be useful, 1053451Speter but WITHOUT ANY WARRANTY; without even the implied warranty of 1153451Speter MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 1253451Speter GNU General Public License for more details. 1353451Speter 1453451Speter You should have received a copy of the GNU General Public License 1553451Speter along with this program; if not, write to the Free Software 1653479Sobrien Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 1753479Sobrien 02111-1307, USA. */ 1853451Speter 1953479Sobrien/* Written August 1992 by Mike Haertel. */ 2053451Speter 2153479Sobrien/* $FreeBSD$ */ 2253451Speter 23146205Stjr#ifndef _GNU_SOURCE 24146205Stjr# define _GNU_SOURCE 1 25146205Stjr#endif 2653479Sobrien#ifdef HAVE_CONFIG_H 2753479Sobrien# include <config.h> 2853479Sobrien#endif 29146205Stjr#include <assert.h> 3053451Speter#include <sys/types.h> 31131557Stjr#if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC 32131557Stjr/* We can handle multibyte string. */ 33131557Stjr# define MBS_SUPPORT 34131557Stjr# include <wchar.h> 35131557Stjr# include <wctype.h> 36131557Stjr#endif 37131557Stjr 3853479Sobrien#include "system.h" 3953479Sobrien#include "grep.h" 4053479Sobrien#include "regex.h" 4153451Speter#include "dfa.h" 4253451Speter#include "kwset.h" 43131557Stjr#include "error.h" 44131557Stjr#include "xalloc.h" 45131557Stjr#ifdef HAVE_LIBPCRE 46131557Stjr# include <pcre.h> 47131557Stjr#endif 48146205Stjr#ifdef HAVE_LANGINFO_CODESET 49146205Stjr# include <langinfo.h> 50146205Stjr#endif 5153451Speter 5253451Speter#define NCHAR (UCHAR_MAX + 1) 5353451Speter 5453451Speter/* For -w, we also consider _ to be word constituent. */ 5553451Speter#define WCHAR(C) (ISALNUM(C) || (C) == '_') 5653451Speter 5753451Speter/* DFA compiled regexp. */ 5853451Speterstatic struct dfa dfa; 5953451Speter 60131557Stjr/* The Regex compiled patterns. */ 61131557Stjrstatic struct patterns 62131557Stjr{ 63131557Stjr /* Regex compiled regexp. */ 64131557Stjr struct re_pattern_buffer regexbuf; 65131557Stjr struct re_registers regs; /* This is here on account of a BRAIN-DEAD 66131557Stjr Q@#%!# library interface in regex.c. */ 67131557Stjr} patterns0; 6853451Speter 69131557Stjrstruct patterns *patterns; 70131557Stjrsize_t pcount; 71131557Stjr 7253451Speter/* KWset compiled pattern. For Ecompile and Gcompile, we compile 7353451Speter a list of strings, at least one of which is known to occur in 7453451Speter any string matching the regexp. */ 7553451Speterstatic kwset_t kwset; 7653451Speter 77131557Stjr/* Number of compiled fixed strings known to exactly match the regexp. 78131557Stjr If kwsexec returns < kwset_exact_matches, then we don't need to 7953451Speter call the regexp matcher at all. */ 80131557Stjrstatic int kwset_exact_matches; 8153451Speter 82146205Stjr/* UTF-8 encoding allows some optimizations that we can't otherwise 83146205Stjr assume in a multibyte encoding. */ 84146205Stjrstatic int using_utf8; 85146205Stjr 86131557Stjrstatic void kwsinit PARAMS ((void)); 87131557Stjrstatic void kwsmusts PARAMS ((void)); 88131557Stjrstatic void Gcompile PARAMS ((char const *, size_t)); 89131557Stjrstatic void Ecompile PARAMS ((char const *, size_t)); 90146199Stjrstatic size_t EGexecute PARAMS ((char const *, size_t, size_t *, int )); 91131557Stjrstatic void Fcompile PARAMS ((char const *, size_t)); 92146199Stjrstatic size_t Fexecute PARAMS ((char const *, size_t, size_t *, int)); 93131557Stjrstatic void Pcompile PARAMS ((char const *, size_t )); 94146199Stjrstatic size_t Pexecute PARAMS ((char const *, size_t, size_t *, int)); 95131557Stjr 9653451Spetervoid 97146205Stjrcheck_utf8 (void) 98146205Stjr{ 99146205Stjr#ifdef HAVE_LANGINFO_CODESET 100146205Stjr if (strcmp (nl_langinfo (CODESET), "UTF-8") == 0) 101146205Stjr using_utf8 = 1; 102146205Stjr#endif 103146205Stjr} 104146205Stjr 105146205Stjrvoid 10656920Srudfaerror (char const *mesg) 10753451Speter{ 108131557Stjr error (2, 0, mesg); 10953451Speter} 11053451Speter 11153451Speterstatic void 11256920Srukwsinit (void) 11353451Speter{ 11453451Speter static char trans[NCHAR]; 115250823Spfg size_t i; 11653451Speter 11753451Speter if (match_icase) 11853451Speter for (i = 0; i < NCHAR; ++i) 119131557Stjr trans[i] = TOLOWER (i); 12053451Speter 121131557Stjr if (!(kwset = kwsalloc (match_icase ? trans : (char *) 0))) 122131557Stjr error (2, 0, _("memory exhausted")); 12353451Speter} 12453451Speter 12553451Speter/* If the DFA turns out to have some set of fixed strings one of 12653451Speter which must occur in the match, then we build a kwset matcher 12753451Speter to find those strings, and thus quickly filter out impossible 12853451Speter matches. */ 12953451Speterstatic void 13056920Srukwsmusts (void) 13153451Speter{ 132131557Stjr struct dfamust const *dm; 133131557Stjr char const *err; 13453451Speter 13553451Speter if (dfa.musts) 13653451Speter { 137131557Stjr kwsinit (); 13853451Speter /* First, we compile in the substrings known to be exact 13953451Speter matches. The kwset matcher will return the index 14053451Speter of the matching string that it chooses. */ 14153451Speter for (dm = dfa.musts; dm; dm = dm->next) 14253451Speter { 14353451Speter if (!dm->exact) 14453451Speter continue; 145131557Stjr ++kwset_exact_matches; 146131557Stjr if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != 0) 147131557Stjr error (2, 0, err); 14853451Speter } 14953451Speter /* Now, we compile the substrings that will require 15053451Speter the use of the regexp matcher. */ 15153451Speter for (dm = dfa.musts; dm; dm = dm->next) 15253451Speter { 15353451Speter if (dm->exact) 15453451Speter continue; 155131557Stjr if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != 0) 156131557Stjr error (2, 0, err); 15753451Speter } 158131557Stjr if ((err = kwsprep (kwset)) != 0) 159131557Stjr error (2, 0, err); 16053451Speter } 16153451Speter} 16253451Speter 16353451Speterstatic void 164131557StjrGcompile (char const *pattern, size_t size) 16553451Speter{ 16653479Sobrien const char *err; 167131557Stjr char const *sep; 168131557Stjr size_t total = size; 169131557Stjr char const *motif = pattern; 17053451Speter 171146205Stjr check_utf8 (); 172146820Stjr re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE | (match_icase ? RE_ICASE : 0)); 173131557Stjr dfasyntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte); 17453451Speter 175131557Stjr /* For GNU regex compiler we have to pass the patterns separately to detect 176131557Stjr errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]" 177131557Stjr GNU regex should have raise a syntax error. The same for backref, where 178131557Stjr the backref should have been local to each pattern. */ 179131557Stjr do 180131557Stjr { 181131557Stjr size_t len; 182131557Stjr sep = memchr (motif, '\n', total); 183131557Stjr if (sep) 184131557Stjr { 185131557Stjr len = sep - motif; 186131557Stjr sep++; 187131557Stjr total -= (len + 1); 188131557Stjr } 189131557Stjr else 190131557Stjr { 191131557Stjr len = total; 192131557Stjr total = 0; 193131557Stjr } 19453451Speter 195131557Stjr patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns)); 196131557Stjr if (patterns == NULL) 197131557Stjr error (2, errno, _("memory exhausted")); 198131557Stjr 199131557Stjr patterns[pcount] = patterns0; 200131557Stjr 201131557Stjr if ((err = re_compile_pattern (motif, len, 202131557Stjr &(patterns[pcount].regexbuf))) != 0) 203131557Stjr error (2, 0, err); 204131557Stjr pcount++; 205131557Stjr 206131557Stjr motif = sep; 207131557Stjr } while (sep && total != 0); 208131557Stjr 20953451Speter /* In the match_words and match_lines cases, we use a different pattern 21053451Speter for the DFA matcher that will quickly throw out cases that won't work. 21153451Speter Then if DFA succeeds we do some hairy stuff using the regex matcher 21253451Speter to decide whether the match should really count. */ 21353451Speter if (match_words || match_lines) 21453451Speter { 21553451Speter /* In the whole-word case, we use the pattern: 216131557Stjr \(^\|[^[:alnum:]_]\)\(userpattern\)\([^[:alnum:]_]|$\). 21753451Speter In the whole-line case, we use the pattern: 218131557Stjr ^\(userpattern\)$. */ 21953451Speter 220131557Stjr static char const line_beg[] = "^\\("; 221131557Stjr static char const line_end[] = "\\)$"; 222131557Stjr static char const word_beg[] = "\\(^\\|[^[:alnum:]_]\\)\\("; 223131557Stjr static char const word_end[] = "\\)\\([^[:alnum:]_]\\|$\\)"; 224131564Stjr char *n = xmalloc (sizeof word_beg - 1 + size + sizeof word_end); 225131557Stjr size_t i; 226131557Stjr strcpy (n, match_lines ? line_beg : word_beg); 227131557Stjr i = strlen (n); 228131557Stjr memcpy (n + i, pattern, size); 22953451Speter i += size; 230131557Stjr strcpy (n + i, match_lines ? line_end : word_end); 231131557Stjr i += strlen (n + i); 232131557Stjr pattern = n; 233131557Stjr size = i; 23453451Speter } 23553451Speter 236131557Stjr dfacomp (pattern, size, &dfa, 1); 237131557Stjr kwsmusts (); 23853451Speter} 23953451Speter 24053451Speterstatic void 241131557StjrEcompile (char const *pattern, size_t size) 24253451Speter{ 24353479Sobrien const char *err; 244131557Stjr const char *sep; 245131557Stjr size_t total = size; 246131557Stjr char const *motif = pattern; 24753451Speter 248146205Stjr check_utf8 (); 249131557Stjr if (strcmp (matcher, "awk") == 0) 25053451Speter { 251146820Stjr re_set_syntax (RE_SYNTAX_AWK | (match_icase ? RE_ICASE : 0)); 252131557Stjr dfasyntax (RE_SYNTAX_AWK, match_icase, eolbyte); 25353479Sobrien } 25453451Speter else 25553451Speter { 256146820Stjr re_set_syntax (RE_SYNTAX_POSIX_EGREP | (match_icase ? RE_ICASE : 0)); 25755379Sobrien dfasyntax (RE_SYNTAX_POSIX_EGREP, match_icase, eolbyte); 25853451Speter } 25953451Speter 260131557Stjr /* For GNU regex compiler we have to pass the patterns separately to detect 261131557Stjr errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]" 262131557Stjr GNU regex should have raise a syntax error. The same for backref, where 263131557Stjr the backref should have been local to each pattern. */ 264131557Stjr do 265131557Stjr { 266131557Stjr size_t len; 267131557Stjr sep = memchr (motif, '\n', total); 268131557Stjr if (sep) 269131557Stjr { 270131557Stjr len = sep - motif; 271131557Stjr sep++; 272131557Stjr total -= (len + 1); 273131557Stjr } 274131557Stjr else 275131557Stjr { 276131557Stjr len = total; 277131557Stjr total = 0; 278131557Stjr } 27953451Speter 280131557Stjr patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns)); 281131557Stjr if (patterns == NULL) 282131557Stjr error (2, errno, _("memory exhausted")); 283131557Stjr patterns[pcount] = patterns0; 284131557Stjr 285131557Stjr if ((err = re_compile_pattern (motif, len, 286131557Stjr &(patterns[pcount].regexbuf))) != 0) 287131557Stjr error (2, 0, err); 288131557Stjr pcount++; 289131557Stjr 290131557Stjr motif = sep; 291131557Stjr } while (sep && total != 0); 292131557Stjr 29353451Speter /* In the match_words and match_lines cases, we use a different pattern 29453451Speter for the DFA matcher that will quickly throw out cases that won't work. 29553451Speter Then if DFA succeeds we do some hairy stuff using the regex matcher 29653451Speter to decide whether the match should really count. */ 29753451Speter if (match_words || match_lines) 29853451Speter { 29953451Speter /* In the whole-word case, we use the pattern: 300131557Stjr (^|[^[:alnum:]_])(userpattern)([^[:alnum:]_]|$). 30153451Speter In the whole-line case, we use the pattern: 302131557Stjr ^(userpattern)$. */ 30353451Speter 304131557Stjr static char const line_beg[] = "^("; 305131557Stjr static char const line_end[] = ")$"; 306131557Stjr static char const word_beg[] = "(^|[^[:alnum:]_])("; 307131557Stjr static char const word_end[] = ")([^[:alnum:]_]|$)"; 308131564Stjr char *n = xmalloc (sizeof word_beg - 1 + size + sizeof word_end); 309131557Stjr size_t i; 310131557Stjr strcpy (n, match_lines ? line_beg : word_beg); 31153451Speter i = strlen(n); 312131557Stjr memcpy (n + i, pattern, size); 31353451Speter i += size; 314131557Stjr strcpy (n + i, match_lines ? line_end : word_end); 315131557Stjr i += strlen (n + i); 316131557Stjr pattern = n; 317131557Stjr size = i; 31853451Speter } 31953451Speter 320131557Stjr dfacomp (pattern, size, &dfa, 1); 321131557Stjr kwsmusts (); 32253451Speter} 32353451Speter 324131557Stjrstatic size_t 325146199StjrEGexecute (char const *buf, size_t size, size_t *match_size, int exact) 32653451Speter{ 327131557Stjr register char const *buflim, *beg, *end; 32855379Sobrien char eol = eolbyte; 329250823Spfg int backref; 330250823Spfg ptrdiff_t start, len; 33153451Speter struct kwsmatch kwsm; 332131564Stjr size_t i, ret_val; 333146206Stjr static int use_dfa; 334146206Stjr static int use_dfa_checked = 0; 335131557Stjr#ifdef MBS_SUPPORT 336146207Stjr const char *last_char = NULL; 337146205Stjr int mb_cur_max = MB_CUR_MAX; 338146205Stjr mbstate_t mbs; 339146205Stjr memset (&mbs, '\0', sizeof (mbstate_t)); 340131557Stjr#endif /* MBS_SUPPORT */ 34153451Speter 342146206Stjr if (!use_dfa_checked) 343146206Stjr { 344146206Stjr char *grep_use_dfa = getenv ("GREP_USE_DFA"); 345146206Stjr if (!grep_use_dfa) 346146206Stjr { 347146206Stjr#ifdef MBS_SUPPORT 348146206Stjr /* Turn off DFA when processing multibyte input. */ 349146206Stjr use_dfa = (MB_CUR_MAX == 1); 350146206Stjr#else 351146206Stjr use_dfa = 1; 352146206Stjr#endif /* MBS_SUPPORT */ 353146206Stjr } 354146206Stjr else 355146206Stjr { 356146206Stjr use_dfa = atoi (grep_use_dfa); 357146206Stjr } 358146206Stjr 359146206Stjr use_dfa_checked = 1; 360146206Stjr } 361146206Stjr 36253451Speter buflim = buf + size; 36353451Speter 364131557Stjr for (beg = end = buf; end < buflim; beg = end) 36553451Speter { 366131557Stjr if (!exact) 36753451Speter { 368131557Stjr if (kwset) 36953451Speter { 370131557Stjr /* Find a possible match using the KWset matcher. */ 371146205Stjr#ifdef MBS_SUPPORT 372146205Stjr size_t bytes_left = 0; 373146205Stjr#endif /* MBS_SUPPORT */ 374146205Stjr size_t offset; 375146205Stjr#ifdef MBS_SUPPORT 376146205Stjr /* kwsexec doesn't work with match_icase and multibyte input. */ 377146205Stjr if (match_icase && mb_cur_max > 1) 378146205Stjr /* Avoid kwset */ 379146205Stjr offset = 0; 380146205Stjr else 381146205Stjr#endif /* MBS_SUPPORT */ 382146205Stjr offset = kwsexec (kwset, beg, buflim - beg, &kwsm); 383131557Stjr if (offset == (size_t) -1) 384131563Stjr goto failure; 385146205Stjr#ifdef MBS_SUPPORT 386146205Stjr if (mb_cur_max > 1 && !using_utf8) 387146205Stjr { 388146205Stjr bytes_left = offset; 389146205Stjr while (bytes_left) 390146205Stjr { 391146205Stjr size_t mlen = mbrlen (beg, bytes_left, &mbs); 392146207Stjr 393146207Stjr last_char = beg; 394146205Stjr if (mlen == (size_t) -1 || mlen == 0) 395146205Stjr { 396146205Stjr /* Incomplete character: treat as single-byte. */ 397146205Stjr memset (&mbs, '\0', sizeof (mbstate_t)); 398146205Stjr beg++; 399146205Stjr bytes_left--; 400146205Stjr continue; 401146205Stjr } 402146205Stjr 403146205Stjr if (mlen == (size_t) -2) 404146205Stjr /* Offset points inside multibyte character: 405146205Stjr * no good. */ 406146205Stjr break; 407146205Stjr 408146205Stjr beg += mlen; 409146205Stjr bytes_left -= mlen; 410146205Stjr } 411146205Stjr } 412146205Stjr else 413146205Stjr#endif /* MBS_SUPPORT */ 414131557Stjr beg += offset; 415131557Stjr /* Narrow down to the line containing the candidate, and 416131557Stjr run it through DFA. */ 417131557Stjr end = memchr(beg, eol, buflim - beg); 418131557Stjr end++; 419131557Stjr#ifdef MBS_SUPPORT 420146205Stjr if (mb_cur_max > 1 && bytes_left) 421131557Stjr continue; 422146205Stjr#endif /* MBS_SUPPORT */ 423131557Stjr while (beg > buf && beg[-1] != eol) 424131557Stjr --beg; 425146205Stjr if ( 426146205Stjr#ifdef MBS_SUPPORT 427146205Stjr !(match_icase && mb_cur_max > 1) && 428146205Stjr#endif /* MBS_SUPPORT */ 429146205Stjr (kwsm.index < kwset_exact_matches)) 430131563Stjr goto success_in_beg_and_end; 431146206Stjr if (use_dfa && 432146206Stjr dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1) 433131557Stjr continue; 43453451Speter } 435131557Stjr else 436131557Stjr { 437131557Stjr /* No good fixed strings; start with DFA. */ 438146205Stjr#ifdef MBS_SUPPORT 439146205Stjr size_t bytes_left = 0; 440146205Stjr#endif /* MBS_SUPPORT */ 441146206Stjr size_t offset = 0; 442146206Stjr if (use_dfa) 443146206Stjr offset = dfaexec (&dfa, beg, buflim - beg, &backref); 444131557Stjr if (offset == (size_t) -1) 445131557Stjr break; 446131557Stjr /* Narrow down to the line we've found. */ 447146205Stjr#ifdef MBS_SUPPORT 448146205Stjr if (mb_cur_max > 1 && !using_utf8) 449146205Stjr { 450146205Stjr bytes_left = offset; 451146205Stjr while (bytes_left) 452146205Stjr { 453146205Stjr size_t mlen = mbrlen (beg, bytes_left, &mbs); 454146207Stjr 455146207Stjr last_char = beg; 456146205Stjr if (mlen == (size_t) -1 || mlen == 0) 457146205Stjr { 458146205Stjr /* Incomplete character: treat as single-byte. */ 459146205Stjr memset (&mbs, '\0', sizeof (mbstate_t)); 460146205Stjr beg++; 461146205Stjr bytes_left--; 462146205Stjr continue; 463146205Stjr } 464146205Stjr 465146205Stjr if (mlen == (size_t) -2) 466146205Stjr /* Offset points inside multibyte character: 467146205Stjr * no good. */ 468146205Stjr break; 469146205Stjr 470146205Stjr beg += mlen; 471146205Stjr bytes_left -= mlen; 472146205Stjr } 473146205Stjr } 474146205Stjr else 475146205Stjr#endif /* MBS_SUPPORT */ 476131557Stjr beg += offset; 477131557Stjr end = memchr (beg, eol, buflim - beg); 478131557Stjr end++; 479146205Stjr#ifdef MBS_SUPPORT 480146205Stjr if (mb_cur_max > 1 && bytes_left) 481146205Stjr continue; 482146205Stjr#endif /* MBS_SUPPORT */ 483131557Stjr while (beg > buf && beg[-1] != eol) 484131557Stjr --beg; 485131557Stjr } 486131557Stjr /* Successful, no backreferences encountered! */ 487146206Stjr if (use_dfa && !backref) 488131563Stjr goto success_in_beg_and_end; 48953451Speter } 49053451Speter else 491131557Stjr end = beg + size; 492131557Stjr 49353451Speter /* If we've made it to this point, this means DFA has seen 49453451Speter a probable match, and we need to run it through Regex. */ 495131557Stjr for (i = 0; i < pcount; i++) 49653451Speter { 497131557Stjr patterns[i].regexbuf.not_eol = 0; 498131557Stjr if (0 <= (start = re_search (&(patterns[i].regexbuf), beg, 499131557Stjr end - beg - 1, 0, 500131557Stjr end - beg - 1, &(patterns[i].regs)))) 501131557Stjr { 502131557Stjr len = patterns[i].regs.end[0] - start; 503131563Stjr if (exact && !match_words) 504131563Stjr goto success_in_start_and_len; 505131557Stjr if ((!match_lines && !match_words) 506131557Stjr || (match_lines && len == end - beg - 1)) 507131563Stjr goto success_in_beg_and_end; 508131557Stjr /* If -w, check if the match aligns with word boundaries. 509131557Stjr We do this iteratively because: 510131557Stjr (a) the line may contain more than one occurence of the 511131557Stjr pattern, and 512131557Stjr (b) Several alternatives in the pattern might be valid at a 513131557Stjr given point, and we may need to consider a shorter one to 514131557Stjr find a word boundary. */ 515131557Stjr if (match_words) 516131557Stjr while (start >= 0) 51753451Speter { 518146207Stjr int lword_match = 0; 519146207Stjr if (start == 0) 520146207Stjr lword_match = 1; 521146207Stjr else 522146207Stjr { 523146207Stjr assert (start > 0); 524146207Stjr#ifdef MBS_SUPPORT 525146207Stjr if (mb_cur_max > 1) 526146207Stjr { 527146207Stjr const char *s; 528155829Stjr size_t mr; 529146207Stjr wchar_t pwc; 530146207Stjr 531155829Stjr /* Locate the start of the multibyte character 532155829Stjr before the match position (== beg + start). */ 533146207Stjr if (using_utf8) 534146207Stjr { 535155829Stjr /* UTF-8 is a special case: scan backwards 536155829Stjr until we find a 7-bit character or a 537155829Stjr lead byte. */ 538146207Stjr s = beg + start - 1; 539146207Stjr while (s > buf 540146207Stjr && (unsigned char) *s >= 0x80 541146207Stjr && (unsigned char) *s <= 0xbf) 542146207Stjr --s; 543146207Stjr } 544146207Stjr else 545146207Stjr { 546155829Stjr /* Scan forwards to find the start of the 547155829Stjr last complete character before the 548155829Stjr match position. */ 549155829Stjr size_t bytes_left = start - 1; 550155829Stjr s = beg; 551155829Stjr while (bytes_left > 0) 552155829Stjr { 553155829Stjr mr = mbrlen (s, bytes_left, &mbs); 554155829Stjr if (mr == (size_t) -1 || mr == 0) 555155829Stjr { 556155829Stjr memset (&mbs, '\0', sizeof (mbs)); 557155829Stjr s++; 558155829Stjr bytes_left--; 559155829Stjr continue; 560155829Stjr } 561155829Stjr if (mr == (size_t) -2) 562155829Stjr { 563155829Stjr memset (&mbs, '\0', sizeof (mbs)); 564155829Stjr break; 565155829Stjr } 566155829Stjr s += mr; 567155829Stjr bytes_left -= mr; 568155829Stjr } 569155829Stjr } 570155829Stjr mr = mbrtowc (&pwc, s, beg + start - s, &mbs); 571155829Stjr if (mr == (size_t) -2 || mr == (size_t) -1 || 572155829Stjr mr == 0) 573155829Stjr { 574146207Stjr memset (&mbs, '\0', sizeof (mbstate_t)); 575146207Stjr lword_match = 1; 576146207Stjr } 577146207Stjr else if (!(iswalnum (pwc) || pwc == L'_') 578155829Stjr && mr == beg + start - s) 579146207Stjr lword_match = 1; 580146207Stjr } 581146207Stjr else 582146207Stjr#endif /* MBS_SUPPORT */ 583146207Stjr if (!WCHAR ((unsigned char) beg[start - 1])) 584146207Stjr lword_match = 1; 585146207Stjr } 586146207Stjr 587146207Stjr if (lword_match) 588146207Stjr { 589146207Stjr int rword_match = 0; 590146207Stjr if (start + len == end - beg - 1) 591146207Stjr rword_match = 1; 592146207Stjr else 593146207Stjr { 594146207Stjr#ifdef MBS_SUPPORT 595146207Stjr if (mb_cur_max > 1) 596146207Stjr { 597146207Stjr wchar_t nwc; 598146207Stjr int mr; 599146207Stjr 600146207Stjr mr = mbtowc (&nwc, beg + start + len, 601146207Stjr end - beg - start - len - 1); 602146207Stjr if (mr <= 0) 603146207Stjr { 604146207Stjr memset (&mbs, '\0', sizeof (mbstate_t)); 605146207Stjr rword_match = 1; 606146207Stjr } 607146207Stjr else if (!iswalnum (nwc) && nwc != L'_') 608146207Stjr rword_match = 1; 609146207Stjr } 610146207Stjr else 611146207Stjr#endif /* MBS_SUPPORT */ 612146207Stjr if (!WCHAR ((unsigned char) beg[start + len])) 613146207Stjr rword_match = 1; 614146207Stjr } 615146207Stjr 616146207Stjr if (rword_match) 617146207Stjr { 618146207Stjr if (!exact) 619146207Stjr /* Returns the whole line. */ 620146207Stjr goto success_in_beg_and_end; 621146207Stjr else 622146207Stjr /* Returns just this word match. */ 623146207Stjr goto success_in_start_and_len; 624146207Stjr } 625146207Stjr } 626131557Stjr if (len > 0) 627131557Stjr { 628131557Stjr /* Try a shorter length anchored at the same place. */ 629131557Stjr --len; 630131557Stjr patterns[i].regexbuf.not_eol = 1; 631131557Stjr len = re_match (&(patterns[i].regexbuf), beg, 632131557Stjr start + len, start, 633131557Stjr &(patterns[i].regs)); 634131557Stjr } 635131557Stjr if (len <= 0) 636131557Stjr { 637131557Stjr /* Try looking further on. */ 638131557Stjr if (start == end - beg - 1) 639131557Stjr break; 640131557Stjr ++start; 641131557Stjr patterns[i].regexbuf.not_eol = 0; 642131557Stjr start = re_search (&(patterns[i].regexbuf), beg, 643131557Stjr end - beg - 1, 644131557Stjr start, end - beg - 1 - start, 645131557Stjr &(patterns[i].regs)); 646131557Stjr len = patterns[i].regs.end[0] - start; 647131557Stjr } 64853451Speter } 649131557Stjr } 650131557Stjr } /* for Regex patterns. */ 651131557Stjr } /* for (beg = end ..) */ 652131563Stjr 653131563Stjr failure: 654131557Stjr return (size_t) -1; 65553451Speter 656131563Stjr success_in_beg_and_end: 657131563Stjr len = end - beg; 658131563Stjr start = beg - buf; 659131563Stjr /* FALLTHROUGH */ 660131563Stjr 661131563Stjr success_in_start_and_len: 662131563Stjr *match_size = len; 663131563Stjr return start; 66453451Speter} 66553451Speter 666146205Stjr#ifdef MBS_SUPPORT 667146205Stjrstatic int f_i_multibyte; /* whether we're using the new -Fi MB method */ 668146205Stjrstatic struct 669146205Stjr{ 670146205Stjr wchar_t **patterns; 671146205Stjr size_t count, maxlen; 672146205Stjr unsigned char *match; 673146205Stjr} Fimb; 674146205Stjr#endif 675146205Stjr 67653451Speterstatic void 677131557StjrFcompile (char const *pattern, size_t size) 67853451Speter{ 679146205Stjr int mb_cur_max = MB_CUR_MAX; 680131557Stjr char const *beg, *lim, *err; 68153451Speter 682146205Stjr check_utf8 (); 683146205Stjr#ifdef MBS_SUPPORT 684146205Stjr /* Support -F -i for UTF-8 input. */ 685146205Stjr if (match_icase && mb_cur_max > 1) 686146205Stjr { 687146205Stjr mbstate_t mbs; 688146205Stjr wchar_t *wcpattern = xmalloc ((size + 1) * sizeof (wchar_t)); 689146205Stjr const char *patternend = pattern; 690146205Stjr size_t wcsize; 691146205Stjr kwset_t fimb_kwset = NULL; 692146205Stjr char *starts = NULL; 693146205Stjr wchar_t *wcbeg, *wclim; 694146205Stjr size_t allocated = 0; 695146205Stjr 696146205Stjr memset (&mbs, '\0', sizeof (mbs)); 697146205Stjr# ifdef __GNU_LIBRARY__ 698146205Stjr wcsize = mbsnrtowcs (wcpattern, &patternend, size, size, &mbs); 699146205Stjr if (patternend != pattern + size) 700146205Stjr wcsize = (size_t) -1; 701146205Stjr# else 702146205Stjr { 703146205Stjr char *patterncopy = xmalloc (size + 1); 704146205Stjr 705146205Stjr memcpy (patterncopy, pattern, size); 706146205Stjr patterncopy[size] = '\0'; 707146205Stjr patternend = patterncopy; 708146205Stjr wcsize = mbsrtowcs (wcpattern, &patternend, size, &mbs); 709146205Stjr if (patternend != patterncopy + size) 710146205Stjr wcsize = (size_t) -1; 711146205Stjr free (patterncopy); 712146205Stjr } 713146205Stjr# endif 714146205Stjr if (wcsize + 2 <= 2) 715146205Stjr { 716146205Stjrfimb_fail: 717146205Stjr free (wcpattern); 718146205Stjr free (starts); 719146205Stjr if (fimb_kwset) 720146205Stjr kwsfree (fimb_kwset); 721146205Stjr free (Fimb.patterns); 722146205Stjr Fimb.patterns = NULL; 723146205Stjr } 724146205Stjr else 725146205Stjr { 726146205Stjr if (!(fimb_kwset = kwsalloc (NULL))) 727146205Stjr error (2, 0, _("memory exhausted")); 728146205Stjr 729146205Stjr starts = xmalloc (mb_cur_max * 3); 730146205Stjr wcbeg = wcpattern; 731146205Stjr do 732146205Stjr { 733146205Stjr int i; 734146205Stjr size_t wclen; 735146205Stjr 736146205Stjr if (Fimb.count >= allocated) 737146205Stjr { 738146205Stjr if (allocated == 0) 739146205Stjr allocated = 128; 740146205Stjr else 741146205Stjr allocated *= 2; 742146205Stjr Fimb.patterns = xrealloc (Fimb.patterns, 743146205Stjr sizeof (wchar_t *) * allocated); 744146205Stjr } 745146205Stjr Fimb.patterns[Fimb.count++] = wcbeg; 746146205Stjr for (wclim = wcbeg; 747146205Stjr wclim < wcpattern + wcsize && *wclim != L'\n'; ++wclim) 748146205Stjr *wclim = towlower (*wclim); 749146205Stjr *wclim = L'\0'; 750146205Stjr wclen = wclim - wcbeg; 751146205Stjr if (wclen > Fimb.maxlen) 752146205Stjr Fimb.maxlen = wclen; 753146205Stjr if (wclen > 3) 754146205Stjr wclen = 3; 755146205Stjr if (wclen == 0) 756146205Stjr { 757146205Stjr if ((err = kwsincr (fimb_kwset, "", 0)) != 0) 758146205Stjr error (2, 0, err); 759146205Stjr } 760146205Stjr else 761146205Stjr for (i = 0; i < (1 << wclen); i++) 762146205Stjr { 763146205Stjr char *p = starts; 764146205Stjr int j, k; 765146205Stjr 766146205Stjr for (j = 0; j < wclen; ++j) 767146205Stjr { 768146205Stjr wchar_t wc = wcbeg[j]; 769146205Stjr if (i & (1 << j)) 770146205Stjr { 771146205Stjr wc = towupper (wc); 772146205Stjr if (wc == wcbeg[j]) 773146205Stjr continue; 774146205Stjr } 775146205Stjr k = wctomb (p, wc); 776146205Stjr if (k <= 0) 777146205Stjr goto fimb_fail; 778146205Stjr p += k; 779146205Stjr } 780146205Stjr if ((err = kwsincr (fimb_kwset, starts, p - starts)) != 0) 781146205Stjr error (2, 0, err); 782146205Stjr } 783146205Stjr if (wclim < wcpattern + wcsize) 784146205Stjr ++wclim; 785146205Stjr wcbeg = wclim; 786146205Stjr } 787146205Stjr while (wcbeg < wcpattern + wcsize); 788146205Stjr f_i_multibyte = 1; 789146205Stjr kwset = fimb_kwset; 790146205Stjr free (starts); 791146205Stjr Fimb.match = xmalloc (Fimb.count); 792146205Stjr if ((err = kwsprep (kwset)) != 0) 793146205Stjr error (2, 0, err); 794146205Stjr return; 795146205Stjr } 796146205Stjr } 797146205Stjr#endif /* MBS_SUPPORT */ 798146205Stjr 799146205Stjr 800131557Stjr kwsinit (); 80153451Speter beg = pattern; 80253451Speter do 80353451Speter { 80453451Speter for (lim = beg; lim < pattern + size && *lim != '\n'; ++lim) 80553451Speter ; 806131557Stjr if ((err = kwsincr (kwset, beg, lim - beg)) != 0) 807131557Stjr error (2, 0, err); 80853451Speter if (lim < pattern + size) 80953451Speter ++lim; 81053451Speter beg = lim; 81153451Speter } 81253451Speter while (beg < pattern + size); 81353451Speter 814131557Stjr if ((err = kwsprep (kwset)) != 0) 815131557Stjr error (2, 0, err); 81653451Speter} 81753451Speter 818146205Stjr#ifdef MBS_SUPPORT 819146205Stjrstatic int 820146205StjrFimbexec (const char *buf, size_t size, size_t *plen, int exact) 821146205Stjr{ 822146205Stjr size_t len, letter, i; 823146205Stjr int ret = -1; 824146205Stjr mbstate_t mbs; 825146205Stjr wchar_t wc; 826146205Stjr int patterns_left; 827146205Stjr 828146205Stjr assert (match_icase && f_i_multibyte == 1); 829146205Stjr assert (MB_CUR_MAX > 1); 830146205Stjr 831146205Stjr memset (&mbs, '\0', sizeof (mbs)); 832146205Stjr memset (Fimb.match, '\1', Fimb.count); 833146205Stjr letter = len = 0; 834146205Stjr patterns_left = 1; 835146205Stjr while (patterns_left && len <= size) 836146205Stjr { 837146205Stjr size_t c; 838146205Stjr 839146205Stjr patterns_left = 0; 840146205Stjr if (len < size) 841146205Stjr { 842146205Stjr c = mbrtowc (&wc, buf + len, size - len, &mbs); 843146205Stjr if (c + 2 <= 2) 844146205Stjr return ret; 845146205Stjr 846146205Stjr wc = towlower (wc); 847146205Stjr } 848146205Stjr else 849146205Stjr { 850146205Stjr c = 1; 851146205Stjr wc = L'\0'; 852146205Stjr } 853146205Stjr 854146205Stjr for (i = 0; i < Fimb.count; i++) 855146205Stjr { 856146205Stjr if (Fimb.match[i]) 857146205Stjr { 858146205Stjr if (Fimb.patterns[i][letter] == L'\0') 859146205Stjr { 860146205Stjr /* Found a match. */ 861146205Stjr *plen = len; 862146205Stjr if (!exact && !match_words) 863146205Stjr return 0; 864146205Stjr else 865146205Stjr { 866146205Stjr /* For -w or exact look for longest match. */ 867146205Stjr ret = 0; 868146205Stjr Fimb.match[i] = '\0'; 869146205Stjr continue; 870146205Stjr } 871146205Stjr } 872146205Stjr 873146205Stjr if (Fimb.patterns[i][letter] == wc) 874146205Stjr patterns_left = 1; 875146205Stjr else 876146205Stjr Fimb.match[i] = '\0'; 877146205Stjr } 878146205Stjr } 879146205Stjr 880146205Stjr len += c; 881146205Stjr letter++; 882146205Stjr } 883146205Stjr 884146205Stjr return ret; 885146205Stjr} 886146205Stjr#endif /* MBS_SUPPORT */ 887146205Stjr 888131557Stjrstatic size_t 889146199StjrFexecute (char const *buf, size_t size, size_t *match_size, int exact) 89053451Speter{ 891131557Stjr register char const *beg, *try, *end; 89253451Speter register size_t len; 89355379Sobrien char eol = eolbyte; 89453451Speter struct kwsmatch kwsmatch; 895131564Stjr size_t ret_val; 896131557Stjr#ifdef MBS_SUPPORT 897146205Stjr int mb_cur_max = MB_CUR_MAX; 898146205Stjr mbstate_t mbs; 899146205Stjr memset (&mbs, '\0', sizeof (mbstate_t)); 900146205Stjr const char *last_char = NULL; 901131557Stjr#endif /* MBS_SUPPORT */ 90253451Speter 90353451Speter for (beg = buf; beg <= buf + size; ++beg) 90453451Speter { 905146205Stjr size_t offset; 906146205Stjr offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch); 907146205Stjr 908131557Stjr if (offset == (size_t) -1) 909131563Stjr goto failure; 910131557Stjr#ifdef MBS_SUPPORT 911146205Stjr if (mb_cur_max > 1 && !using_utf8) 912146205Stjr { 913146205Stjr size_t bytes_left = offset; 914146205Stjr while (bytes_left) 915146205Stjr { 916146205Stjr size_t mlen = mbrlen (beg, bytes_left, &mbs); 917146205Stjr 918146205Stjr last_char = beg; 919146205Stjr if (mlen == (size_t) -1 || mlen == 0) 920146205Stjr { 921146205Stjr /* Incomplete character: treat as single-byte. */ 922146205Stjr memset (&mbs, '\0', sizeof (mbstate_t)); 923146205Stjr beg++; 924146205Stjr bytes_left--; 925146205Stjr continue; 926146205Stjr } 927146205Stjr 928146205Stjr if (mlen == (size_t) -2) 929146205Stjr /* Offset points inside multibyte character: no good. */ 930146205Stjr break; 931146205Stjr 932146205Stjr beg += mlen; 933146205Stjr bytes_left -= mlen; 934146205Stjr } 935146205Stjr 936146205Stjr if (bytes_left) 937146205Stjr continue; 938146205Stjr } 939146205Stjr else 940131557Stjr#endif /* MBS_SUPPORT */ 941131557Stjr beg += offset; 942146205Stjr#ifdef MBS_SUPPORT 943146205Stjr /* For f_i_multibyte, the string at beg now matches first 3 chars of 944146205Stjr one of the search strings (less if there are shorter search strings). 945146205Stjr See if this is a real match. */ 946146205Stjr if (f_i_multibyte 947146205Stjr && Fimbexec (beg, buf + size - beg, &kwsmatch.size[0], exact)) 948146205Stjr goto next_char; 949146205Stjr#endif /* MBS_SUPPORT */ 95053451Speter len = kwsmatch.size[0]; 951131563Stjr if (exact && !match_words) 952131563Stjr goto success_in_beg_and_len; 95353451Speter if (match_lines) 95453451Speter { 95555379Sobrien if (beg > buf && beg[-1] != eol) 956146205Stjr goto next_char; 95755379Sobrien if (beg + len < buf + size && beg[len] != eol) 958146205Stjr goto next_char; 95953451Speter goto success; 96053451Speter } 96153451Speter else if (match_words) 962146205Stjr { 963146205Stjr while (1) 964146205Stjr { 965146205Stjr int word_match = 0; 966146205Stjr if (beg > buf) 967146205Stjr { 968146201Stjr#ifdef MBS_SUPPORT 969146205Stjr if (mb_cur_max > 1) 970146205Stjr { 971146205Stjr const char *s; 972146205Stjr int mr; 973146205Stjr wchar_t pwc; 974146205Stjr 975146205Stjr if (using_utf8) 976146205Stjr { 977146205Stjr s = beg - 1; 978146205Stjr while (s > buf 979146205Stjr && (unsigned char) *s >= 0x80 980146205Stjr && (unsigned char) *s <= 0xbf) 981146205Stjr --s; 982146205Stjr } 983146205Stjr else 984146205Stjr s = last_char; 985146205Stjr mr = mbtowc (&pwc, s, beg - s); 986146205Stjr if (mr <= 0) 987146205Stjr memset (&mbs, '\0', sizeof (mbstate_t)); 988146205Stjr else if ((iswalnum (pwc) || pwc == L'_') 989146205Stjr && mr == (int) (beg - s)) 990146205Stjr goto next_char; 991146205Stjr } 992146205Stjr else 993146201Stjr#endif /* MBS_SUPPORT */ 994151647Stjr if (WCHAR ((unsigned char) beg[-1])) 995146205Stjr goto next_char; 996146205Stjr } 997146205Stjr#ifdef MBS_SUPPORT 998146205Stjr if (mb_cur_max > 1) 999146205Stjr { 1000146205Stjr wchar_t nwc; 1001146205Stjr int mr; 1002146205Stjr 1003146205Stjr mr = mbtowc (&nwc, beg + len, buf + size - beg - len); 1004146205Stjr if (mr <= 0) 1005146205Stjr { 1006146205Stjr memset (&mbs, '\0', sizeof (mbstate_t)); 1007146205Stjr word_match = 1; 1008146205Stjr } 1009146205Stjr else if (!iswalnum (nwc) && nwc != L'_') 1010146205Stjr word_match = 1; 1011146205Stjr } 1012146205Stjr else 1013146205Stjr#endif /* MBS_SUPPORT */ 1014151647Stjr if (beg + len >= buf + size || !WCHAR ((unsigned char) beg[len])) 1015146205Stjr word_match = 1; 1016146205Stjr if (word_match) 1017146205Stjr { 1018146205Stjr if (!exact) 1019146205Stjr /* Returns the whole line now we know there's a word match. */ 1020146205Stjr goto success; 1021146205Stjr else 1022146205Stjr /* Returns just this word match. */ 1023146205Stjr goto success_in_beg_and_len; 1024146205Stjr } 1025146205Stjr if (len > 0) 1026146205Stjr { 1027146205Stjr /* Try a shorter length anchored at the same place. */ 1028146205Stjr --len; 1029146205Stjr offset = kwsexec (kwset, beg, len, &kwsmatch); 1030146205Stjr 1031146205Stjr if (offset == -1) 1032146205Stjr goto next_char; /* Try a different anchor. */ 1033146205Stjr#ifdef MBS_SUPPORT 1034146205Stjr if (mb_cur_max > 1 && !using_utf8) 1035146205Stjr { 1036146205Stjr size_t bytes_left = offset; 1037146205Stjr while (bytes_left) 1038146205Stjr { 1039146205Stjr size_t mlen = mbrlen (beg, bytes_left, &mbs); 1040146205Stjr 1041146205Stjr last_char = beg; 1042146205Stjr if (mlen == (size_t) -1 || mlen == 0) 1043146205Stjr { 1044146205Stjr /* Incomplete character: treat as single-byte. */ 1045146205Stjr memset (&mbs, '\0', sizeof (mbstate_t)); 1046146205Stjr beg++; 1047146205Stjr bytes_left--; 1048146205Stjr continue; 1049146205Stjr } 1050146205Stjr 1051146205Stjr if (mlen == (size_t) -2) 1052146205Stjr { 1053146205Stjr /* Offset points inside multibyte character: 1054146205Stjr * no good. */ 1055146205Stjr break; 1056146205Stjr } 1057146205Stjr 1058146205Stjr beg += mlen; 1059146205Stjr bytes_left -= mlen; 1060146205Stjr } 1061146205Stjr 1062146205Stjr if (bytes_left) 1063146205Stjr { 1064146205Stjr memset (&mbs, '\0', sizeof (mbstate_t)); 1065146205Stjr goto next_char; /* Try a different anchor. */ 1066146205Stjr } 1067146205Stjr } 1068146205Stjr else 1069146205Stjr#endif /* MBS_SUPPORT */ 1070146205Stjr beg += offset; 1071146205Stjr#ifdef MBS_SUPPORT 1072146205Stjr /* The string at beg now matches first 3 chars of one of 1073146205Stjr the search strings (less if there are shorter search 1074146205Stjr strings). See if this is a real match. */ 1075146205Stjr if (f_i_multibyte 1076146205Stjr && Fimbexec (beg, len - offset, &kwsmatch.size[0], 1077146205Stjr exact)) 1078146205Stjr goto next_char; 1079146205Stjr#endif /* MBS_SUPPORT */ 1080146205Stjr len = kwsmatch.size[0]; 1081146205Stjr } 1082146205Stjr } 1083146205Stjr } 108453451Speter else 108553451Speter goto success; 1086146205Stjrnext_char:; 1087146205Stjr#ifdef MBS_SUPPORT 1088146205Stjr /* Advance to next character. For MB_CUR_MAX == 1 case this is handled 1089146205Stjr by ++beg above. */ 1090146205Stjr if (mb_cur_max > 1) 1091146205Stjr { 1092146205Stjr if (using_utf8) 1093146205Stjr { 1094146205Stjr unsigned char c = *beg; 1095146205Stjr if (c >= 0xc2) 1096146205Stjr { 1097146205Stjr if (c < 0xe0) 1098146205Stjr ++beg; 1099146205Stjr else if (c < 0xf0) 1100146205Stjr beg += 2; 1101146205Stjr else if (c < 0xf8) 1102146205Stjr beg += 3; 1103146205Stjr else if (c < 0xfc) 1104146205Stjr beg += 4; 1105146205Stjr else if (c < 0xfe) 1106146205Stjr beg += 5; 1107146205Stjr } 1108146205Stjr } 1109146205Stjr else 1110146205Stjr { 1111146205Stjr size_t l = mbrlen (beg, buf + size - beg, &mbs); 1112146205Stjr 1113146205Stjr last_char = beg; 1114146205Stjr if (l + 2 >= 2) 1115146205Stjr beg += l - 1; 1116146205Stjr else 1117146205Stjr memset (&mbs, '\0', sizeof (mbstate_t)); 1118146205Stjr } 1119146205Stjr } 1120146205Stjr#endif /* MBS_SUPPORT */ 112153451Speter } 112253451Speter 1123131563Stjr failure: 1124146205Stjr return -1; 1125146205Stjr 1126146205Stjr success: 1127131557Stjr#ifdef MBS_SUPPORT 1128146205Stjr if (mb_cur_max > 1 && !using_utf8) 1129131564Stjr { 1130146205Stjr end = beg + len; 1131146205Stjr while (end < buf + size) 1132146205Stjr { 1133146205Stjr size_t mlen = mbrlen (end, buf + size - end, &mbs); 1134146205Stjr if (mlen == (size_t) -1 || mlen == (size_t) -2 || mlen == 0) 1135146205Stjr { 1136146205Stjr memset (&mbs, '\0', sizeof (mbstate_t)); 1137146205Stjr mlen = 1; 1138146205Stjr } 1139146205Stjr if (mlen == 1 && *end == eol) 1140146205Stjr break; 1141146205Stjr 1142146205Stjr end += mlen; 1143146205Stjr } 1144131564Stjr } 1145146205Stjr else 1146131557Stjr#endif /* MBS_SUPPORT */ 1147146205Stjr end = memchr (beg + len, eol, (buf + size) - (beg + len)); 114853451Speter 1149131557Stjr end++; 1150131557Stjr while (buf < beg && beg[-1] != eol) 115153451Speter --beg; 1152131563Stjr len = end - beg; 1153131563Stjr /* FALLTHROUGH */ 1154131563Stjr 1155131563Stjr success_in_beg_and_len: 1156131563Stjr *match_size = len; 1157131557Stjr return beg - buf; 115853451Speter} 1159131557Stjr 1160131557Stjr#if HAVE_LIBPCRE 1161131557Stjr/* Compiled internal form of a Perl regular expression. */ 1162131557Stjrstatic pcre *cre; 1163131557Stjr 1164131557Stjr/* Additional information about the pattern. */ 1165131557Stjrstatic pcre_extra *extra; 1166131557Stjr#endif 1167131557Stjr 1168131557Stjrstatic void 1169131557StjrPcompile (char const *pattern, size_t size) 1170131557Stjr{ 1171131557Stjr#if !HAVE_LIBPCRE 1172131557Stjr error (2, 0, _("The -P option is not supported")); 1173131557Stjr#else 1174131557Stjr int e; 1175131557Stjr char const *ep; 1176131557Stjr char *re = xmalloc (4 * size + 7); 1177131557Stjr int flags = PCRE_MULTILINE | (match_icase ? PCRE_CASELESS : 0); 1178131557Stjr char const *patlim = pattern + size; 1179131557Stjr char *n = re; 1180131557Stjr char const *p; 1181131557Stjr char const *pnul; 1182131557Stjr 1183131557Stjr /* FIXME: Remove this restriction. */ 1184131557Stjr if (eolbyte != '\n') 1185131557Stjr error (2, 0, _("The -P and -z options cannot be combined")); 1186131557Stjr 1187131557Stjr *n = '\0'; 1188131557Stjr if (match_lines) 1189131557Stjr strcpy (n, "^("); 1190131557Stjr if (match_words) 1191131557Stjr strcpy (n, "\\b("); 1192131557Stjr n += strlen (n); 1193131557Stjr 1194131557Stjr /* The PCRE interface doesn't allow NUL bytes in the pattern, so 1195131557Stjr replace each NUL byte in the pattern with the four characters 1196131557Stjr "\000", removing a preceding backslash if there are an odd 1197131557Stjr number of backslashes before the NUL. 1198131557Stjr 1199131557Stjr FIXME: This method does not work with some multibyte character 1200131557Stjr encodings, notably Shift-JIS, where a multibyte character can end 1201131557Stjr in a backslash byte. */ 1202131557Stjr for (p = pattern; (pnul = memchr (p, '\0', patlim - p)); p = pnul + 1) 1203131557Stjr { 1204131557Stjr memcpy (n, p, pnul - p); 1205131557Stjr n += pnul - p; 1206131557Stjr for (p = pnul; pattern < p && p[-1] == '\\'; p--) 1207131557Stjr continue; 1208131557Stjr n -= (pnul - p) & 1; 1209131557Stjr strcpy (n, "\\000"); 1210131557Stjr n += 4; 1211131557Stjr } 1212131557Stjr 1213131557Stjr memcpy (n, p, patlim - p); 1214131557Stjr n += patlim - p; 1215131557Stjr *n = '\0'; 1216131557Stjr if (match_words) 1217131557Stjr strcpy (n, ")\\b"); 1218131557Stjr if (match_lines) 1219131557Stjr strcpy (n, ")$"); 1220131557Stjr 1221131557Stjr cre = pcre_compile (re, flags, &ep, &e, pcre_maketables ()); 1222131557Stjr if (!cre) 1223131557Stjr error (2, 0, ep); 1224131557Stjr 1225131557Stjr extra = pcre_study (cre, 0, &ep); 1226131557Stjr if (ep) 1227131557Stjr error (2, 0, ep); 1228131557Stjr 1229131557Stjr free (re); 1230131557Stjr#endif 1231131557Stjr} 1232131557Stjr 1233131557Stjrstatic size_t 1234146199StjrPexecute (char const *buf, size_t size, size_t *match_size, int exact) 1235131557Stjr{ 1236131557Stjr#if !HAVE_LIBPCRE 1237131557Stjr abort (); 1238131557Stjr return -1; 1239131557Stjr#else 1240131557Stjr /* This array must have at least two elements; everything after that 1241131557Stjr is just for performance improvement in pcre_exec. */ 1242131557Stjr int sub[300]; 1243131557Stjr 1244131557Stjr int e = pcre_exec (cre, extra, buf, size, 0, 0, 1245131557Stjr sub, sizeof sub / sizeof *sub); 1246131557Stjr 1247131557Stjr if (e <= 0) 1248131557Stjr { 1249131557Stjr switch (e) 1250131557Stjr { 1251131557Stjr case PCRE_ERROR_NOMATCH: 1252131557Stjr return -1; 1253131557Stjr 1254131557Stjr case PCRE_ERROR_NOMEMORY: 1255131557Stjr error (2, 0, _("Memory exhausted")); 1256131557Stjr 1257131557Stjr default: 1258131557Stjr abort (); 1259131557Stjr } 1260131557Stjr } 1261131557Stjr else 1262131557Stjr { 1263131557Stjr /* Narrow down to the line we've found. */ 1264131557Stjr char const *beg = buf + sub[0]; 1265131557Stjr char const *end = buf + sub[1]; 1266131557Stjr char const *buflim = buf + size; 1267131557Stjr char eol = eolbyte; 1268131557Stjr if (!exact) 1269131557Stjr { 1270131557Stjr end = memchr (end, eol, buflim - end); 1271131557Stjr end++; 1272131557Stjr while (buf < beg && beg[-1] != eol) 1273131557Stjr --beg; 1274131557Stjr } 1275131557Stjr 1276131557Stjr *match_size = end - beg; 1277131557Stjr return beg - buf; 1278131557Stjr } 1279131557Stjr#endif 1280131557Stjr} 1281131557Stjr 1282131557Stjrstruct matcher const matchers[] = { 1283131557Stjr { "default", Gcompile, EGexecute }, 1284131557Stjr { "grep", Gcompile, EGexecute }, 1285131557Stjr { "egrep", Ecompile, EGexecute }, 1286131557Stjr { "awk", Ecompile, EGexecute }, 1287131557Stjr { "fgrep", Fcompile, Fexecute }, 1288131557Stjr { "perl", Pcompile, Pexecute }, 1289131557Stjr { "", 0, 0 }, 1290131557Stjr}; 1291