1/* search.c - searching subroutines using dfa, kwset and regex for grep. 2 Copyright 1992, 1998, 2000 Free Software Foundation, Inc. 3 4 This program is free software; you can redistribute it and/or modify 5 it under the terms of the GNU General Public License as published by 6 the Free Software Foundation; either version 2, or (at your option) 7 any later version. 8 9 This program is distributed in the hope that it will be useful, 10 but WITHOUT ANY WARRANTY; without even the implied warranty of 11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 GNU General Public License for more details. 13 14 You should have received a copy of the GNU General Public License 15 along with this program; if not, write to the Free Software 16 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 17 02111-1307, USA. */ 18 19/* Written August 1992 by Mike Haertel. */ 20 21/* $FreeBSD$ */ 22 23#ifndef _GNU_SOURCE 24# define _GNU_SOURCE 1 25#endif 26#ifdef HAVE_CONFIG_H 27# include <config.h> 28#endif 29#include <assert.h> 30#include <sys/types.h> 31#if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC 32/* We can handle multibyte string. */ 33# define MBS_SUPPORT 34# include <wchar.h> 35# include <wctype.h> 36#endif 37 38#include "system.h" 39#include "grep.h" 40#include "regex.h" 41#include "dfa.h" 42#include "kwset.h" 43#include "error.h" 44#include "xalloc.h" 45#ifdef HAVE_LIBPCRE 46# include <pcre.h> 47#endif 48#ifdef HAVE_LANGINFO_CODESET 49# include <langinfo.h> 50#endif 51 52#define NCHAR (UCHAR_MAX + 1) 53 54/* For -w, we also consider _ to be word constituent. */ 55#define WCHAR(C) (ISALNUM(C) || (C) == '_') 56 57/* DFA compiled regexp. */ 58static struct dfa dfa; 59 60/* The Regex compiled patterns. */ 61static struct patterns 62{ 63 /* Regex compiled regexp. */ 64 struct re_pattern_buffer regexbuf; 65 struct re_registers regs; /* This is here on account of a BRAIN-DEAD 66 Q@#%!# library interface in regex.c. */ 67} patterns0; 68 69struct patterns *patterns; 70size_t pcount; 71 72/* KWset compiled pattern. For Ecompile and Gcompile, we compile 73 a list of strings, at least one of which is known to occur in 74 any string matching the regexp. */ 75static kwset_t kwset; 76 77/* Number of compiled fixed strings known to exactly match the regexp. 78 If kwsexec returns < kwset_exact_matches, then we don't need to 79 call the regexp matcher at all. */ 80static int kwset_exact_matches; 81 82/* UTF-8 encoding allows some optimizations that we can't otherwise 83 assume in a multibyte encoding. */ 84static int using_utf8; 85 86static void kwsinit PARAMS ((void)); 87static void kwsmusts PARAMS ((void)); 88static void Gcompile PARAMS ((char const *, size_t)); 89static void Ecompile PARAMS ((char const *, size_t)); 90static size_t EGexecute PARAMS ((char const *, size_t, size_t *, int )); 91static void Fcompile PARAMS ((char const *, size_t)); 92static size_t Fexecute PARAMS ((char const *, size_t, size_t *, int)); 93static void Pcompile PARAMS ((char const *, size_t )); 94static size_t Pexecute PARAMS ((char const *, size_t, size_t *, int)); 95 96void 97check_utf8 (void) 98{ 99#ifdef HAVE_LANGINFO_CODESET 100 if (strcmp (nl_langinfo (CODESET), "UTF-8") == 0) 101 using_utf8 = 1; 102#endif 103} 104 105void 106dfaerror (char const *mesg) 107{ 108 error (2, 0, mesg); 109} 110 111static void 112kwsinit (void) 113{ 114 static char trans[NCHAR]; 115 size_t i; 116 117 if (match_icase) 118 for (i = 0; i < NCHAR; ++i) 119 trans[i] = TOLOWER (i); 120 121 if (!(kwset = kwsalloc (match_icase ? trans : (char *) 0))) 122 error (2, 0, _("memory exhausted")); 123} 124 125/* If the DFA turns out to have some set of fixed strings one of 126 which must occur in the match, then we build a kwset matcher 127 to find those strings, and thus quickly filter out impossible 128 matches. */ 129static void 130kwsmusts (void) 131{ 132 struct dfamust const *dm; 133 char const *err; 134 135 if (dfa.musts) 136 { 137 kwsinit (); 138 /* First, we compile in the substrings known to be exact 139 matches. The kwset matcher will return the index 140 of the matching string that it chooses. */ 141 for (dm = dfa.musts; dm; dm = dm->next) 142 { 143 if (!dm->exact) 144 continue; 145 ++kwset_exact_matches; 146 if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != 0) 147 error (2, 0, err); 148 } 149 /* Now, we compile the substrings that will require 150 the use of the regexp matcher. */ 151 for (dm = dfa.musts; dm; dm = dm->next) 152 { 153 if (dm->exact) 154 continue; 155 if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != 0) 156 error (2, 0, err); 157 } 158 if ((err = kwsprep (kwset)) != 0) 159 error (2, 0, err); 160 } 161} 162 163static void 164Gcompile (char const *pattern, size_t size) 165{ 166 const char *err; 167 char const *sep; 168 size_t total = size; 169 char const *motif = pattern; 170 171 check_utf8 (); 172 re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE | (match_icase ? RE_ICASE : 0)); 173 dfasyntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte); 174 175 /* For GNU regex compiler we have to pass the patterns separately to detect 176 errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]" 177 GNU regex should have raise a syntax error. The same for backref, where 178 the backref should have been local to each pattern. */ 179 do 180 { 181 size_t len; 182 sep = memchr (motif, '\n', total); 183 if (sep) 184 { 185 len = sep - motif; 186 sep++; 187 total -= (len + 1); 188 } 189 else 190 { 191 len = total; 192 total = 0; 193 } 194 195 patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns)); 196 if (patterns == NULL) 197 error (2, errno, _("memory exhausted")); 198 199 patterns[pcount] = patterns0; 200 201 if ((err = re_compile_pattern (motif, len, 202 &(patterns[pcount].regexbuf))) != 0) 203 error (2, 0, err); 204 pcount++; 205 206 motif = sep; 207 } while (sep && total != 0); 208 209 /* In the match_words and match_lines cases, we use a different pattern 210 for the DFA matcher that will quickly throw out cases that won't work. 211 Then if DFA succeeds we do some hairy stuff using the regex matcher 212 to decide whether the match should really count. */ 213 if (match_words || match_lines) 214 { 215 /* In the whole-word case, we use the pattern: 216 \(^\|[^[:alnum:]_]\)\(userpattern\)\([^[:alnum:]_]|$\). 217 In the whole-line case, we use the pattern: 218 ^\(userpattern\)$. */ 219 220 static char const line_beg[] = "^\\("; 221 static char const line_end[] = "\\)$"; 222 static char const word_beg[] = "\\(^\\|[^[:alnum:]_]\\)\\("; 223 static char const word_end[] = "\\)\\([^[:alnum:]_]\\|$\\)"; 224 char *n = xmalloc (sizeof word_beg - 1 + size + sizeof word_end); 225 size_t i; 226 strcpy (n, match_lines ? line_beg : word_beg); 227 i = strlen (n); 228 memcpy (n + i, pattern, size); 229 i += size; 230 strcpy (n + i, match_lines ? line_end : word_end); 231 i += strlen (n + i); 232 pattern = n; 233 size = i; 234 } 235 236 dfacomp (pattern, size, &dfa, 1); 237 kwsmusts (); 238} 239 240static void 241Ecompile (char const *pattern, size_t size) 242{ 243 const char *err; 244 const char *sep; 245 size_t total = size; 246 char const *motif = pattern; 247 248 check_utf8 (); 249 if (strcmp (matcher, "awk") == 0) 250 { 251 re_set_syntax (RE_SYNTAX_AWK | (match_icase ? RE_ICASE : 0)); 252 dfasyntax (RE_SYNTAX_AWK, match_icase, eolbyte); 253 } 254 else 255 { 256 re_set_syntax (RE_SYNTAX_POSIX_EGREP | (match_icase ? RE_ICASE : 0)); 257 dfasyntax (RE_SYNTAX_POSIX_EGREP, match_icase, eolbyte); 258 } 259 260 /* For GNU regex compiler we have to pass the patterns separately to detect 261 errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]" 262 GNU regex should have raise a syntax error. The same for backref, where 263 the backref should have been local to each pattern. */ 264 do 265 { 266 size_t len; 267 sep = memchr (motif, '\n', total); 268 if (sep) 269 { 270 len = sep - motif; 271 sep++; 272 total -= (len + 1); 273 } 274 else 275 { 276 len = total; 277 total = 0; 278 } 279 280 patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns)); 281 if (patterns == NULL) 282 error (2, errno, _("memory exhausted")); 283 patterns[pcount] = patterns0; 284 285 if ((err = re_compile_pattern (motif, len, 286 &(patterns[pcount].regexbuf))) != 0) 287 error (2, 0, err); 288 pcount++; 289 290 motif = sep; 291 } while (sep && total != 0); 292 293 /* In the match_words and match_lines cases, we use a different pattern 294 for the DFA matcher that will quickly throw out cases that won't work. 295 Then if DFA succeeds we do some hairy stuff using the regex matcher 296 to decide whether the match should really count. */ 297 if (match_words || match_lines) 298 { 299 /* In the whole-word case, we use the pattern: 300 (^|[^[:alnum:]_])(userpattern)([^[:alnum:]_]|$). 301 In the whole-line case, we use the pattern: 302 ^(userpattern)$. */ 303 304 static char const line_beg[] = "^("; 305 static char const line_end[] = ")$"; 306 static char const word_beg[] = "(^|[^[:alnum:]_])("; 307 static char const word_end[] = ")([^[:alnum:]_]|$)"; 308 char *n = xmalloc (sizeof word_beg - 1 + size + sizeof word_end); 309 size_t i; 310 strcpy (n, match_lines ? line_beg : word_beg); 311 i = strlen(n); 312 memcpy (n + i, pattern, size); 313 i += size; 314 strcpy (n + i, match_lines ? line_end : word_end); 315 i += strlen (n + i); 316 pattern = n; 317 size = i; 318 } 319 320 dfacomp (pattern, size, &dfa, 1); 321 kwsmusts (); 322} 323 324static size_t 325EGexecute (char const *buf, size_t size, size_t *match_size, int exact) 326{ 327 register char const *buflim, *beg, *end; 328 char eol = eolbyte; 329 int backref; 330 ptrdiff_t start, len; 331 struct kwsmatch kwsm; 332 size_t i, ret_val; 333 static int use_dfa; 334 static int use_dfa_checked = 0; 335#ifdef MBS_SUPPORT 336 const char *last_char = NULL; 337 int mb_cur_max = MB_CUR_MAX; 338 mbstate_t mbs; 339 memset (&mbs, '\0', sizeof (mbstate_t)); 340#endif /* MBS_SUPPORT */ 341 342 if (!use_dfa_checked) 343 { 344 char *grep_use_dfa = getenv ("GREP_USE_DFA"); 345 if (!grep_use_dfa) 346 { 347#ifdef MBS_SUPPORT 348 /* Turn off DFA when processing multibyte input. */ 349 use_dfa = (MB_CUR_MAX == 1); 350#else 351 use_dfa = 1; 352#endif /* MBS_SUPPORT */ 353 } 354 else 355 { 356 use_dfa = atoi (grep_use_dfa); 357 } 358 359 use_dfa_checked = 1; 360 } 361 362 buflim = buf + size; 363 364 for (beg = end = buf; end < buflim; beg = end) 365 { 366 if (!exact) 367 { 368 if (kwset) 369 { 370 /* Find a possible match using the KWset matcher. */ 371#ifdef MBS_SUPPORT 372 size_t bytes_left = 0; 373#endif /* MBS_SUPPORT */ 374 size_t offset; 375#ifdef MBS_SUPPORT 376 /* kwsexec doesn't work with match_icase and multibyte input. */ 377 if (match_icase && mb_cur_max > 1) 378 /* Avoid kwset */ 379 offset = 0; 380 else 381#endif /* MBS_SUPPORT */ 382 offset = kwsexec (kwset, beg, buflim - beg, &kwsm); 383 if (offset == (size_t) -1) 384 goto failure; 385#ifdef MBS_SUPPORT 386 if (mb_cur_max > 1 && !using_utf8) 387 { 388 bytes_left = offset; 389 while (bytes_left) 390 { 391 size_t mlen = mbrlen (beg, bytes_left, &mbs); 392 393 last_char = beg; 394 if (mlen == (size_t) -1 || mlen == 0) 395 { 396 /* Incomplete character: treat as single-byte. */ 397 memset (&mbs, '\0', sizeof (mbstate_t)); 398 beg++; 399 bytes_left--; 400 continue; 401 } 402 403 if (mlen == (size_t) -2) 404 /* Offset points inside multibyte character: 405 * no good. */ 406 break; 407 408 beg += mlen; 409 bytes_left -= mlen; 410 } 411 } 412 else 413#endif /* MBS_SUPPORT */ 414 beg += offset; 415 /* Narrow down to the line containing the candidate, and 416 run it through DFA. */ 417 end = memchr(beg, eol, buflim - beg); 418 end++; 419#ifdef MBS_SUPPORT 420 if (mb_cur_max > 1 && bytes_left) 421 continue; 422#endif /* MBS_SUPPORT */ 423 while (beg > buf && beg[-1] != eol) 424 --beg; 425 if ( 426#ifdef MBS_SUPPORT 427 !(match_icase && mb_cur_max > 1) && 428#endif /* MBS_SUPPORT */ 429 (kwsm.index < kwset_exact_matches)) 430 goto success_in_beg_and_end; 431 if (use_dfa && 432 dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1) 433 continue; 434 } 435 else 436 { 437 /* No good fixed strings; start with DFA. */ 438#ifdef MBS_SUPPORT 439 size_t bytes_left = 0; 440#endif /* MBS_SUPPORT */ 441 size_t offset = 0; 442 if (use_dfa) 443 offset = dfaexec (&dfa, beg, buflim - beg, &backref); 444 if (offset == (size_t) -1) 445 break; 446 /* Narrow down to the line we've found. */ 447#ifdef MBS_SUPPORT 448 if (mb_cur_max > 1 && !using_utf8) 449 { 450 bytes_left = offset; 451 while (bytes_left) 452 { 453 size_t mlen = mbrlen (beg, bytes_left, &mbs); 454 455 last_char = beg; 456 if (mlen == (size_t) -1 || mlen == 0) 457 { 458 /* Incomplete character: treat as single-byte. */ 459 memset (&mbs, '\0', sizeof (mbstate_t)); 460 beg++; 461 bytes_left--; 462 continue; 463 } 464 465 if (mlen == (size_t) -2) 466 /* Offset points inside multibyte character: 467 * no good. */ 468 break; 469 470 beg += mlen; 471 bytes_left -= mlen; 472 } 473 } 474 else 475#endif /* MBS_SUPPORT */ 476 beg += offset; 477 end = memchr (beg, eol, buflim - beg); 478 end++; 479#ifdef MBS_SUPPORT 480 if (mb_cur_max > 1 && bytes_left) 481 continue; 482#endif /* MBS_SUPPORT */ 483 while (beg > buf && beg[-1] != eol) 484 --beg; 485 } 486 /* Successful, no backreferences encountered! */ 487 if (use_dfa && !backref) 488 goto success_in_beg_and_end; 489 } 490 else 491 end = beg + size; 492 493 /* If we've made it to this point, this means DFA has seen 494 a probable match, and we need to run it through Regex. */ 495 for (i = 0; i < pcount; i++) 496 { 497 patterns[i].regexbuf.not_eol = 0; 498 if (0 <= (start = re_search (&(patterns[i].regexbuf), beg, 499 end - beg - 1, 0, 500 end - beg - 1, &(patterns[i].regs)))) 501 { 502 len = patterns[i].regs.end[0] - start; 503 if (exact && !match_words) 504 goto success_in_start_and_len; 505 if ((!match_lines && !match_words) 506 || (match_lines && len == end - beg - 1)) 507 goto success_in_beg_and_end; 508 /* If -w, check if the match aligns with word boundaries. 509 We do this iteratively because: 510 (a) the line may contain more than one occurence of the 511 pattern, and 512 (b) Several alternatives in the pattern might be valid at a 513 given point, and we may need to consider a shorter one to 514 find a word boundary. */ 515 if (match_words) 516 while (start >= 0) 517 { 518 int lword_match = 0; 519 if (start == 0) 520 lword_match = 1; 521 else 522 { 523 assert (start > 0); 524#ifdef MBS_SUPPORT 525 if (mb_cur_max > 1) 526 { 527 const char *s; 528 size_t mr; 529 wchar_t pwc; 530 531 /* Locate the start of the multibyte character 532 before the match position (== beg + start). */ 533 if (using_utf8) 534 { 535 /* UTF-8 is a special case: scan backwards 536 until we find a 7-bit character or a 537 lead byte. */ 538 s = beg + start - 1; 539 while (s > buf 540 && (unsigned char) *s >= 0x80 541 && (unsigned char) *s <= 0xbf) 542 --s; 543 } 544 else 545 { 546 /* Scan forwards to find the start of the 547 last complete character before the 548 match position. */ 549 size_t bytes_left = start - 1; 550 s = beg; 551 while (bytes_left > 0) 552 { 553 mr = mbrlen (s, bytes_left, &mbs); 554 if (mr == (size_t) -1 || mr == 0) 555 { 556 memset (&mbs, '\0', sizeof (mbs)); 557 s++; 558 bytes_left--; 559 continue; 560 } 561 if (mr == (size_t) -2) 562 { 563 memset (&mbs, '\0', sizeof (mbs)); 564 break; 565 } 566 s += mr; 567 bytes_left -= mr; 568 } 569 } 570 mr = mbrtowc (&pwc, s, beg + start - s, &mbs); 571 if (mr == (size_t) -2 || mr == (size_t) -1 || 572 mr == 0) 573 { 574 memset (&mbs, '\0', sizeof (mbstate_t)); 575 lword_match = 1; 576 } 577 else if (!(iswalnum (pwc) || pwc == L'_') 578 && mr == beg + start - s) 579 lword_match = 1; 580 } 581 else 582#endif /* MBS_SUPPORT */ 583 if (!WCHAR ((unsigned char) beg[start - 1])) 584 lword_match = 1; 585 } 586 587 if (lword_match) 588 { 589 int rword_match = 0; 590 if (start + len == end - beg - 1) 591 rword_match = 1; 592 else 593 { 594#ifdef MBS_SUPPORT 595 if (mb_cur_max > 1) 596 { 597 wchar_t nwc; 598 int mr; 599 600 mr = mbtowc (&nwc, beg + start + len, 601 end - beg - start - len - 1); 602 if (mr <= 0) 603 { 604 memset (&mbs, '\0', sizeof (mbstate_t)); 605 rword_match = 1; 606 } 607 else if (!iswalnum (nwc) && nwc != L'_') 608 rword_match = 1; 609 } 610 else 611#endif /* MBS_SUPPORT */ 612 if (!WCHAR ((unsigned char) beg[start + len])) 613 rword_match = 1; 614 } 615 616 if (rword_match) 617 { 618 if (!exact) 619 /* Returns the whole line. */ 620 goto success_in_beg_and_end; 621 else 622 /* Returns just this word match. */ 623 goto success_in_start_and_len; 624 } 625 } 626 if (len > 0) 627 { 628 /* Try a shorter length anchored at the same place. */ 629 --len; 630 patterns[i].regexbuf.not_eol = 1; 631 len = re_match (&(patterns[i].regexbuf), beg, 632 start + len, start, 633 &(patterns[i].regs)); 634 } 635 if (len <= 0) 636 { 637 /* Try looking further on. */ 638 if (start == end - beg - 1) 639 break; 640 ++start; 641 patterns[i].regexbuf.not_eol = 0; 642 start = re_search (&(patterns[i].regexbuf), beg, 643 end - beg - 1, 644 start, end - beg - 1 - start, 645 &(patterns[i].regs)); 646 len = patterns[i].regs.end[0] - start; 647 } 648 } 649 } 650 } /* for Regex patterns. */ 651 } /* for (beg = end ..) */ 652 653 failure: 654 return (size_t) -1; 655 656 success_in_beg_and_end: 657 len = end - beg; 658 start = beg - buf; 659 /* FALLTHROUGH */ 660 661 success_in_start_and_len: 662 *match_size = len; 663 return start; 664} 665 666#ifdef MBS_SUPPORT 667static int f_i_multibyte; /* whether we're using the new -Fi MB method */ 668static struct 669{ 670 wchar_t **patterns; 671 size_t count, maxlen; 672 unsigned char *match; 673} Fimb; 674#endif 675 676static void 677Fcompile (char const *pattern, size_t size) 678{ 679 int mb_cur_max = MB_CUR_MAX; 680 char const *beg, *lim, *err; 681 682 check_utf8 (); 683#ifdef MBS_SUPPORT 684 /* Support -F -i for UTF-8 input. */ 685 if (match_icase && mb_cur_max > 1) 686 { 687 mbstate_t mbs; 688 wchar_t *wcpattern = xmalloc ((size + 1) * sizeof (wchar_t)); 689 const char *patternend = pattern; 690 size_t wcsize; 691 kwset_t fimb_kwset = NULL; 692 char *starts = NULL; 693 wchar_t *wcbeg, *wclim; 694 size_t allocated = 0; 695 696 memset (&mbs, '\0', sizeof (mbs)); 697# ifdef __GNU_LIBRARY__ 698 wcsize = mbsnrtowcs (wcpattern, &patternend, size, size, &mbs); 699 if (patternend != pattern + size) 700 wcsize = (size_t) -1; 701# else 702 { 703 char *patterncopy = xmalloc (size + 1); 704 705 memcpy (patterncopy, pattern, size); 706 patterncopy[size] = '\0'; 707 patternend = patterncopy; 708 wcsize = mbsrtowcs (wcpattern, &patternend, size, &mbs); 709 if (patternend != patterncopy + size) 710 wcsize = (size_t) -1; 711 free (patterncopy); 712 } 713# endif 714 if (wcsize + 2 <= 2) 715 { 716fimb_fail: 717 free (wcpattern); 718 free (starts); 719 if (fimb_kwset) 720 kwsfree (fimb_kwset); 721 free (Fimb.patterns); 722 Fimb.patterns = NULL; 723 } 724 else 725 { 726 if (!(fimb_kwset = kwsalloc (NULL))) 727 error (2, 0, _("memory exhausted")); 728 729 starts = xmalloc (mb_cur_max * 3); 730 wcbeg = wcpattern; 731 do 732 { 733 int i; 734 size_t wclen; 735 736 if (Fimb.count >= allocated) 737 { 738 if (allocated == 0) 739 allocated = 128; 740 else 741 allocated *= 2; 742 Fimb.patterns = xrealloc (Fimb.patterns, 743 sizeof (wchar_t *) * allocated); 744 } 745 Fimb.patterns[Fimb.count++] = wcbeg; 746 for (wclim = wcbeg; 747 wclim < wcpattern + wcsize && *wclim != L'\n'; ++wclim) 748 *wclim = towlower (*wclim); 749 *wclim = L'\0'; 750 wclen = wclim - wcbeg; 751 if (wclen > Fimb.maxlen) 752 Fimb.maxlen = wclen; 753 if (wclen > 3) 754 wclen = 3; 755 if (wclen == 0) 756 { 757 if ((err = kwsincr (fimb_kwset, "", 0)) != 0) 758 error (2, 0, err); 759 } 760 else 761 for (i = 0; i < (1 << wclen); i++) 762 { 763 char *p = starts; 764 int j, k; 765 766 for (j = 0; j < wclen; ++j) 767 { 768 wchar_t wc = wcbeg[j]; 769 if (i & (1 << j)) 770 { 771 wc = towupper (wc); 772 if (wc == wcbeg[j]) 773 continue; 774 } 775 k = wctomb (p, wc); 776 if (k <= 0) 777 goto fimb_fail; 778 p += k; 779 } 780 if ((err = kwsincr (fimb_kwset, starts, p - starts)) != 0) 781 error (2, 0, err); 782 } 783 if (wclim < wcpattern + wcsize) 784 ++wclim; 785 wcbeg = wclim; 786 } 787 while (wcbeg < wcpattern + wcsize); 788 f_i_multibyte = 1; 789 kwset = fimb_kwset; 790 free (starts); 791 Fimb.match = xmalloc (Fimb.count); 792 if ((err = kwsprep (kwset)) != 0) 793 error (2, 0, err); 794 return; 795 } 796 } 797#endif /* MBS_SUPPORT */ 798 799 800 kwsinit (); 801 beg = pattern; 802 do 803 { 804 for (lim = beg; lim < pattern + size && *lim != '\n'; ++lim) 805 ; 806 if ((err = kwsincr (kwset, beg, lim - beg)) != 0) 807 error (2, 0, err); 808 if (lim < pattern + size) 809 ++lim; 810 beg = lim; 811 } 812 while (beg < pattern + size); 813 814 if ((err = kwsprep (kwset)) != 0) 815 error (2, 0, err); 816} 817 818#ifdef MBS_SUPPORT 819static int 820Fimbexec (const char *buf, size_t size, size_t *plen, int exact) 821{ 822 size_t len, letter, i; 823 int ret = -1; 824 mbstate_t mbs; 825 wchar_t wc; 826 int patterns_left; 827 828 assert (match_icase && f_i_multibyte == 1); 829 assert (MB_CUR_MAX > 1); 830 831 memset (&mbs, '\0', sizeof (mbs)); 832 memset (Fimb.match, '\1', Fimb.count); 833 letter = len = 0; 834 patterns_left = 1; 835 while (patterns_left && len <= size) 836 { 837 size_t c; 838 839 patterns_left = 0; 840 if (len < size) 841 { 842 c = mbrtowc (&wc, buf + len, size - len, &mbs); 843 if (c + 2 <= 2) 844 return ret; 845 846 wc = towlower (wc); 847 } 848 else 849 { 850 c = 1; 851 wc = L'\0'; 852 } 853 854 for (i = 0; i < Fimb.count; i++) 855 { 856 if (Fimb.match[i]) 857 { 858 if (Fimb.patterns[i][letter] == L'\0') 859 { 860 /* Found a match. */ 861 *plen = len; 862 if (!exact && !match_words) 863 return 0; 864 else 865 { 866 /* For -w or exact look for longest match. */ 867 ret = 0; 868 Fimb.match[i] = '\0'; 869 continue; 870 } 871 } 872 873 if (Fimb.patterns[i][letter] == wc) 874 patterns_left = 1; 875 else 876 Fimb.match[i] = '\0'; 877 } 878 } 879 880 len += c; 881 letter++; 882 } 883 884 return ret; 885} 886#endif /* MBS_SUPPORT */ 887 888static size_t 889Fexecute (char const *buf, size_t size, size_t *match_size, int exact) 890{ 891 register char const *beg, *try, *end; 892 register size_t len; 893 char eol = eolbyte; 894 struct kwsmatch kwsmatch; 895 size_t ret_val; 896#ifdef MBS_SUPPORT 897 int mb_cur_max = MB_CUR_MAX; 898 mbstate_t mbs; 899 memset (&mbs, '\0', sizeof (mbstate_t)); 900 const char *last_char = NULL; 901#endif /* MBS_SUPPORT */ 902 903 for (beg = buf; beg <= buf + size; ++beg) 904 { 905 size_t offset; 906 offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch); 907 908 if (offset == (size_t) -1) 909 goto failure; 910#ifdef MBS_SUPPORT 911 if (mb_cur_max > 1 && !using_utf8) 912 { 913 size_t bytes_left = offset; 914 while (bytes_left) 915 { 916 size_t mlen = mbrlen (beg, bytes_left, &mbs); 917 918 last_char = beg; 919 if (mlen == (size_t) -1 || mlen == 0) 920 { 921 /* Incomplete character: treat as single-byte. */ 922 memset (&mbs, '\0', sizeof (mbstate_t)); 923 beg++; 924 bytes_left--; 925 continue; 926 } 927 928 if (mlen == (size_t) -2) 929 /* Offset points inside multibyte character: no good. */ 930 break; 931 932 beg += mlen; 933 bytes_left -= mlen; 934 } 935 936 if (bytes_left) 937 continue; 938 } 939 else 940#endif /* MBS_SUPPORT */ 941 beg += offset; 942#ifdef MBS_SUPPORT 943 /* For f_i_multibyte, the string at beg now matches first 3 chars of 944 one of the search strings (less if there are shorter search strings). 945 See if this is a real match. */ 946 if (f_i_multibyte 947 && Fimbexec (beg, buf + size - beg, &kwsmatch.size[0], exact)) 948 goto next_char; 949#endif /* MBS_SUPPORT */ 950 len = kwsmatch.size[0]; 951 if (exact && !match_words) 952 goto success_in_beg_and_len; 953 if (match_lines) 954 { 955 if (beg > buf && beg[-1] != eol) 956 goto next_char; 957 if (beg + len < buf + size && beg[len] != eol) 958 goto next_char; 959 goto success; 960 } 961 else if (match_words) 962 { 963 while (1) 964 { 965 int word_match = 0; 966 if (beg > buf) 967 { 968#ifdef MBS_SUPPORT 969 if (mb_cur_max > 1) 970 { 971 const char *s; 972 int mr; 973 wchar_t pwc; 974 975 if (using_utf8) 976 { 977 s = beg - 1; 978 while (s > buf 979 && (unsigned char) *s >= 0x80 980 && (unsigned char) *s <= 0xbf) 981 --s; 982 } 983 else 984 s = last_char; 985 mr = mbtowc (&pwc, s, beg - s); 986 if (mr <= 0) 987 memset (&mbs, '\0', sizeof (mbstate_t)); 988 else if ((iswalnum (pwc) || pwc == L'_') 989 && mr == (int) (beg - s)) 990 goto next_char; 991 } 992 else 993#endif /* MBS_SUPPORT */ 994 if (WCHAR ((unsigned char) beg[-1])) 995 goto next_char; 996 } 997#ifdef MBS_SUPPORT 998 if (mb_cur_max > 1) 999 { 1000 wchar_t nwc; 1001 int mr; 1002 1003 mr = mbtowc (&nwc, beg + len, buf + size - beg - len); 1004 if (mr <= 0) 1005 { 1006 memset (&mbs, '\0', sizeof (mbstate_t)); 1007 word_match = 1; 1008 } 1009 else if (!iswalnum (nwc) && nwc != L'_') 1010 word_match = 1; 1011 } 1012 else 1013#endif /* MBS_SUPPORT */ 1014 if (beg + len >= buf + size || !WCHAR ((unsigned char) beg[len])) 1015 word_match = 1; 1016 if (word_match) 1017 { 1018 if (!exact) 1019 /* Returns the whole line now we know there's a word match. */ 1020 goto success; 1021 else 1022 /* Returns just this word match. */ 1023 goto success_in_beg_and_len; 1024 } 1025 if (len > 0) 1026 { 1027 /* Try a shorter length anchored at the same place. */ 1028 --len; 1029 offset = kwsexec (kwset, beg, len, &kwsmatch); 1030 1031 if (offset == -1) 1032 goto next_char; /* Try a different anchor. */ 1033#ifdef MBS_SUPPORT 1034 if (mb_cur_max > 1 && !using_utf8) 1035 { 1036 size_t bytes_left = offset; 1037 while (bytes_left) 1038 { 1039 size_t mlen = mbrlen (beg, bytes_left, &mbs); 1040 1041 last_char = beg; 1042 if (mlen == (size_t) -1 || mlen == 0) 1043 { 1044 /* Incomplete character: treat as single-byte. */ 1045 memset (&mbs, '\0', sizeof (mbstate_t)); 1046 beg++; 1047 bytes_left--; 1048 continue; 1049 } 1050 1051 if (mlen == (size_t) -2) 1052 { 1053 /* Offset points inside multibyte character: 1054 * no good. */ 1055 break; 1056 } 1057 1058 beg += mlen; 1059 bytes_left -= mlen; 1060 } 1061 1062 if (bytes_left) 1063 { 1064 memset (&mbs, '\0', sizeof (mbstate_t)); 1065 goto next_char; /* Try a different anchor. */ 1066 } 1067 } 1068 else 1069#endif /* MBS_SUPPORT */ 1070 beg += offset; 1071#ifdef MBS_SUPPORT 1072 /* The string at beg now matches first 3 chars of one of 1073 the search strings (less if there are shorter search 1074 strings). See if this is a real match. */ 1075 if (f_i_multibyte 1076 && Fimbexec (beg, len - offset, &kwsmatch.size[0], 1077 exact)) 1078 goto next_char; 1079#endif /* MBS_SUPPORT */ 1080 len = kwsmatch.size[0]; 1081 } 1082 } 1083 } 1084 else 1085 goto success; 1086next_char:; 1087#ifdef MBS_SUPPORT 1088 /* Advance to next character. For MB_CUR_MAX == 1 case this is handled 1089 by ++beg above. */ 1090 if (mb_cur_max > 1) 1091 { 1092 if (using_utf8) 1093 { 1094 unsigned char c = *beg; 1095 if (c >= 0xc2) 1096 { 1097 if (c < 0xe0) 1098 ++beg; 1099 else if (c < 0xf0) 1100 beg += 2; 1101 else if (c < 0xf8) 1102 beg += 3; 1103 else if (c < 0xfc) 1104 beg += 4; 1105 else if (c < 0xfe) 1106 beg += 5; 1107 } 1108 } 1109 else 1110 { 1111 size_t l = mbrlen (beg, buf + size - beg, &mbs); 1112 1113 last_char = beg; 1114 if (l + 2 >= 2) 1115 beg += l - 1; 1116 else 1117 memset (&mbs, '\0', sizeof (mbstate_t)); 1118 } 1119 } 1120#endif /* MBS_SUPPORT */ 1121 } 1122 1123 failure: 1124 return -1; 1125 1126 success: 1127#ifdef MBS_SUPPORT 1128 if (mb_cur_max > 1 && !using_utf8) 1129 { 1130 end = beg + len; 1131 while (end < buf + size) 1132 { 1133 size_t mlen = mbrlen (end, buf + size - end, &mbs); 1134 if (mlen == (size_t) -1 || mlen == (size_t) -2 || mlen == 0) 1135 { 1136 memset (&mbs, '\0', sizeof (mbstate_t)); 1137 mlen = 1; 1138 } 1139 if (mlen == 1 && *end == eol) 1140 break; 1141 1142 end += mlen; 1143 } 1144 } 1145 else 1146#endif /* MBS_SUPPORT */ 1147 end = memchr (beg + len, eol, (buf + size) - (beg + len)); 1148 1149 end++; 1150 while (buf < beg && beg[-1] != eol) 1151 --beg; 1152 len = end - beg; 1153 /* FALLTHROUGH */ 1154 1155 success_in_beg_and_len: 1156 *match_size = len; 1157 return beg - buf; 1158} 1159 1160#if HAVE_LIBPCRE 1161/* Compiled internal form of a Perl regular expression. */ 1162static pcre *cre; 1163 1164/* Additional information about the pattern. */ 1165static pcre_extra *extra; 1166#endif 1167 1168static void 1169Pcompile (char const *pattern, size_t size) 1170{ 1171#if !HAVE_LIBPCRE 1172 error (2, 0, _("The -P option is not supported")); 1173#else 1174 int e; 1175 char const *ep; 1176 char *re = xmalloc (4 * size + 7); 1177 int flags = PCRE_MULTILINE | (match_icase ? PCRE_CASELESS : 0); 1178 char const *patlim = pattern + size; 1179 char *n = re; 1180 char const *p; 1181 char const *pnul; 1182 1183 /* FIXME: Remove this restriction. */ 1184 if (eolbyte != '\n') 1185 error (2, 0, _("The -P and -z options cannot be combined")); 1186 1187 *n = '\0'; 1188 if (match_lines) 1189 strcpy (n, "^("); 1190 if (match_words) 1191 strcpy (n, "\\b("); 1192 n += strlen (n); 1193 1194 /* The PCRE interface doesn't allow NUL bytes in the pattern, so 1195 replace each NUL byte in the pattern with the four characters 1196 "\000", removing a preceding backslash if there are an odd 1197 number of backslashes before the NUL. 1198 1199 FIXME: This method does not work with some multibyte character 1200 encodings, notably Shift-JIS, where a multibyte character can end 1201 in a backslash byte. */ 1202 for (p = pattern; (pnul = memchr (p, '\0', patlim - p)); p = pnul + 1) 1203 { 1204 memcpy (n, p, pnul - p); 1205 n += pnul - p; 1206 for (p = pnul; pattern < p && p[-1] == '\\'; p--) 1207 continue; 1208 n -= (pnul - p) & 1; 1209 strcpy (n, "\\000"); 1210 n += 4; 1211 } 1212 1213 memcpy (n, p, patlim - p); 1214 n += patlim - p; 1215 *n = '\0'; 1216 if (match_words) 1217 strcpy (n, ")\\b"); 1218 if (match_lines) 1219 strcpy (n, ")$"); 1220 1221 cre = pcre_compile (re, flags, &ep, &e, pcre_maketables ()); 1222 if (!cre) 1223 error (2, 0, ep); 1224 1225 extra = pcre_study (cre, 0, &ep); 1226 if (ep) 1227 error (2, 0, ep); 1228 1229 free (re); 1230#endif 1231} 1232 1233static size_t 1234Pexecute (char const *buf, size_t size, size_t *match_size, int exact) 1235{ 1236#if !HAVE_LIBPCRE 1237 abort (); 1238 return -1; 1239#else 1240 /* This array must have at least two elements; everything after that 1241 is just for performance improvement in pcre_exec. */ 1242 int sub[300]; 1243 1244 int e = pcre_exec (cre, extra, buf, size, 0, 0, 1245 sub, sizeof sub / sizeof *sub); 1246 1247 if (e <= 0) 1248 { 1249 switch (e) 1250 { 1251 case PCRE_ERROR_NOMATCH: 1252 return -1; 1253 1254 case PCRE_ERROR_NOMEMORY: 1255 error (2, 0, _("Memory exhausted")); 1256 1257 default: 1258 abort (); 1259 } 1260 } 1261 else 1262 { 1263 /* Narrow down to the line we've found. */ 1264 char const *beg = buf + sub[0]; 1265 char const *end = buf + sub[1]; 1266 char const *buflim = buf + size; 1267 char eol = eolbyte; 1268 if (!exact) 1269 { 1270 end = memchr (end, eol, buflim - end); 1271 end++; 1272 while (buf < beg && beg[-1] != eol) 1273 --beg; 1274 } 1275 1276 *match_size = end - beg; 1277 return beg - buf; 1278 } 1279#endif 1280} 1281 1282struct matcher const matchers[] = { 1283 { "default", Gcompile, EGexecute }, 1284 { "grep", Gcompile, EGexecute }, 1285 { "egrep", Ecompile, EGexecute }, 1286 { "awk", Ecompile, EGexecute }, 1287 { "fgrep", Fcompile, Fexecute }, 1288 { "perl", Pcompile, Pexecute }, 1289 { "", 0, 0 }, 1290}; 1291