1/*    regexec.c
2 */
3
4/*
5 *	One Ring to rule them all, One Ring to find them
6 *
7 *     [p.v of _The Lord of the Rings_, opening poem]
8 *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
10 */
11
12/* This file contains functions for executing a regular expression.  See
13 * also regcomp.c which funnily enough, contains functions for compiling
14 * a regular expression.
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
20 */
21
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below).  Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places.  In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
31/* The names of the functions have been changed from regcomp and
32 * regexec to  pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
36#ifdef PERL_EXT_RE_BUILD
37#include "re_top.h"
38#endif
39
40/*
41 * pregcomp and pregexec -- regsub and regerror are not used in perl
42 *
43 *	Copyright (c) 1986 by University of Toronto.
44 *	Written by Henry Spencer.  Not derived from licensed software.
45 *
46 *	Permission is granted to anyone to use this software for any
47 *	purpose on any computer system, and to redistribute it freely,
48 *	subject to the following restrictions:
49 *
50 *	1. The author is not responsible for the consequences of use of
51 *		this software, no matter how awful, even if they arise
52 *		from defects in it.
53 *
54 *	2. The origin of this software must not be misrepresented, either
55 *		by explicit claim or by omission.
56 *
57 *	3. Altered versions must be plainly marked as such, and must not
58 *		be misrepresented as being the original software.
59 *
60 ****    Alterations to Henry's code are...
61 ****
62 ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
63 ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 ****    by Larry Wall and others
65 ****
66 ****    You may distribute under the terms of either the GNU General Public
67 ****    License or the Artistic License, as specified in the README file.
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions.  Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
74#define PERL_IN_REGEX_ENGINE
75#define PERL_IN_REGEXEC_C
76#include "perl.h"
77
78#ifdef PERL_IN_XSUB_RE
79#  include "re_comp.h"
80#else
81#  include "regcomp.h"
82#endif
83
84#include "invlist_inline.h"
85#include "unicode_constants.h"
86
87static const char b_utf8_locale_required[] =
88 "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong."
89                                                "  Assuming a UTF-8 locale";
90
91#define CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND                       \
92    STMT_START {                                                            \
93        if (! IN_UTF8_CTYPE_LOCALE) {                                       \
94          Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),                       \
95                                                b_utf8_locale_required);    \
96        }                                                                   \
97    } STMT_END
98
99static const char sets_utf8_locale_required[] =
100      "Use of (?[ ]) for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale";
101
102#define CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(n)                     \
103    STMT_START {                                                            \
104        if (! IN_UTF8_CTYPE_LOCALE && (FLAGS(n) & ANYOFL_UTF8_LOCALE_REQD)){\
105          Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),                       \
106                                             sets_utf8_locale_required);    \
107        }                                                                   \
108    } STMT_END
109
110#ifdef DEBUGGING
111/* At least one required character in the target string is expressible only in
112 * UTF-8. */
113static const char non_utf8_target_but_utf8_required[]
114                = "Can't match, because target string needs to be in UTF-8\n";
115#endif
116
117#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START {           \
118    DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%s", non_utf8_target_but_utf8_required));\
119    goto target;                                                         \
120} STMT_END
121
122#ifndef STATIC
123#define STATIC  static
124#endif
125
126/*
127 * Forwards.
128 */
129
130#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
131
132#define HOPc(pos,off) \
133        (char *)(reginfo->is_utf8_target \
134            ? reghop3((U8*)pos, off, \
135                    (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
136            : (U8*)(pos + off))
137
138/* like HOPMAYBE3 but backwards. lim must be +ve. Returns NULL on overshoot */
139#define HOPBACK3(pos, off, lim) \
140        (reginfo->is_utf8_target                          \
141            ? reghopmaybe3((U8*)pos, (SSize_t)0-off, (U8*)(lim)) \
142            : (pos - off >= lim)	                         \
143                ? (U8*)pos - off		                 \
144                : NULL)
145
146#define HOPBACKc(pos, off) ((char*)HOPBACK3(pos, off, reginfo->strbeg))
147
148#define HOP3(pos,off,lim) (reginfo->is_utf8_target  ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
149#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
150
151/* lim must be +ve. Returns NULL on overshoot */
152#define HOPMAYBE3(pos,off,lim) \
153        (reginfo->is_utf8_target                        \
154            ? reghopmaybe3((U8*)pos, off, (U8*)(lim))   \
155            : ((U8*)pos + off <= lim)                   \
156                ? (U8*)pos + off                        \
157                : NULL)
158
159/* like HOP3, but limits the result to <= lim even for the non-utf8 case.
160 * off must be >=0; args should be vars rather than expressions */
161#define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
162    ? reghop3((U8*)(pos), off, (U8*)(lim)) \
163    : (U8*)((pos + off) > lim ? lim : (pos + off)))
164#define HOP3clim(pos,off,lim) ((char*)HOP3lim(pos,off,lim))
165
166#define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
167    ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
168    : (U8*)(pos + off))
169#define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
170
171#define PLACEHOLDER	/* Something for the preprocessor to grab onto */
172/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
173
174/* for use after a quantifier and before an EXACT-like node -- japhy */
175/* it would be nice to rework regcomp.sym to generate this stuff. sigh
176 *
177 * NOTE that *nothing* that affects backtracking should be in here, specifically
178 * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
179 * node that is in between two EXACT like nodes when ascertaining what the required
180 * "follow" character is. This should probably be moved to regex compile time
181 * although it may be done at run time because of the REF possibility - more
182 * investigation required. -- demerphq
183*/
184#define JUMPABLE(rn) (                                                             \
185    OP(rn) == OPEN ||                                                              \
186    (OP(rn) == CLOSE &&                                                            \
187     !EVAL_CLOSE_PAREN_IS(cur_eval,PARNO(rn)) ) ||                                 \
188    OP(rn) == EVAL ||                                                              \
189    OP(rn) == SUSPEND || OP(rn) == IFMATCH ||                                      \
190    OP(rn) == PLUS || OP(rn) == MINMOD ||                                          \
191    OP(rn) == KEEPS ||                                                             \
192    (REGNODE_TYPE(OP(rn)) == CURLY && ARG1i(rn) > 0)                                  \
193)
194#define IS_EXACT(rn) (REGNODE_TYPE(OP(rn)) == EXACT)
195
196#define HAS_TEXT(rn) ( IS_EXACT(rn) || REGNODE_TYPE(OP(rn)) == REF )
197
198/*
199  Search for mandatory following text node; for lookahead, the text must
200  follow but for lookbehind (FLAGS(rn) != 0) we skip to the next step.
201*/
202#define FIND_NEXT_IMPT(rn) STMT_START {                                   \
203    while (JUMPABLE(rn)) { \
204        const OPCODE type = OP(rn); \
205        if (type == SUSPEND || REGNODE_TYPE(type) == CURLY) \
206            rn = REGNODE_AFTER_opcode(rn,type); \
207        else if (type == PLUS) \
208            rn = REGNODE_AFTER_type(rn,tregnode_PLUS); \
209        else if (type == IFMATCH) \
210            rn = (FLAGS(rn) == 0) ? REGNODE_AFTER_type(rn,tregnode_IFMATCH) : rn + ARG1u(rn); \
211        else rn += NEXT_OFF(rn); \
212    } \
213} STMT_END
214
215#define SLAB_FIRST(s) (&(s)->states[0])
216#define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
217
218static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
219static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
220static regmatch_state * S_push_slab(pTHX);
221
222#define REGCP_OTHER_ELEMS 3
223#define REGCP_FRAME_ELEMS 1
224/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
225 * are needed for the regexp context stack bookkeeping. */
226
227STATIC CHECKPOINT
228S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen comma_pDEPTH)
229{
230    const int retval = PL_savestack_ix;
231    /* Number of bytes about to be stored in the stack */
232    const SSize_t paren_bytes_to_push = sizeof(*RXp_OFFSp(rex)) * (maxopenparen - parenfloor);
233    /* Number of savestack[] entries to be filled by the paren data */
234    /* Rounding is performed in case we are few elements short */
235    const int paren_elems_to_push = (paren_bytes_to_push + sizeof(*PL_savestack) - 1) / sizeof(*PL_savestack);
236    const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
237    const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
238
239    DECLARE_AND_GET_RE_DEBUG_FLAGS;
240
241    PERL_ARGS_ASSERT_REGCPPUSH;
242
243    if (paren_elems_to_push < 0)
244        Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i",
245                   (int)paren_elems_to_push, (int)maxopenparen,
246                   (int)parenfloor);
247
248    if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
249        Perl_croak(aTHX_ "panic: paren_elems_to_push offset %" UVuf
250                   " out of range (%lu-%ld)",
251                   total_elems,
252                   (unsigned long)maxopenparen,
253                   (long)parenfloor);
254
255    DEBUG_BUFFERS_r(
256        if ((int)maxopenparen > (int)parenfloor)
257            Perl_re_exec_indentf( aTHX_
258                "rex=0x%" UVxf " offs=0x%" UVxf ": saving capture indices:\n",
259                depth,
260                PTR2UV(rex),
261                PTR2UV(RXp_OFFSp(rex))
262            );
263    );
264
265    SSGROW(total_elems + REGCP_FRAME_ELEMS);
266    assert((IV)PL_savestack_max > (IV)(total_elems + REGCP_FRAME_ELEMS));
267
268    /* memcpy the offs inside the stack - it's faster than for loop */
269    memcpy(&PL_savestack[PL_savestack_ix], RXp_OFFSp(rex) + parenfloor + 1, paren_bytes_to_push);
270    PL_savestack_ix += paren_elems_to_push;
271
272    DEBUG_BUFFERS_r({
273	I32 p;
274        for (p = parenfloor + 1; p <= (I32)maxopenparen; p++) {
275            Perl_re_exec_indentf(aTHX_
276                "    \\%" UVuf " %" IVdf " (%" IVdf ") .. %" IVdf " (regcppush)\n",
277                depth,
278                (UV)p,
279                (IV)RXp_OFFSp(rex)[p].start,
280                (IV)RXp_OFFSp(rex)[p].start_tmp,
281                (IV)RXp_OFFSp(rex)[p].end
282            );
283        }
284    });
285
286/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
287    SSPUSHINT(maxopenparen);
288    SSPUSHINT(RXp_LASTPAREN(rex));
289    SSPUSHINT(RXp_LASTCLOSEPAREN(rex));
290    SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
291
292
293    DEBUG_BUFFERS_r({
294        Perl_re_exec_indentf(aTHX_
295                "finished regcppush returning %" IVdf " cur: %" IVdf "\n",
296                depth, retval, PL_savestack_ix);
297    });
298
299    return retval;
300}
301
302/* These are needed since we do not localize EVAL nodes: */
303#define REGCP_SET(cp)                                           \
304    DEBUG_STATE_r(                                              \
305        Perl_re_exec_indentf( aTHX_                             \
306            "Setting an EVAL scope, savestack=%" IVdf ",\n",    \
307            depth, (IV)PL_savestack_ix                          \
308        )                                                       \
309    );                                                          \
310    cp = PL_savestack_ix
311
312#define REGCP_UNWIND(cp)                                        \
313    DEBUG_STATE_r(                                              \
314        if (cp != PL_savestack_ix)                              \
315            Perl_re_exec_indentf( aTHX_                         \
316                "Clearing an EVAL scope, savestack=%"           \
317                IVdf "..%" IVdf "\n",                           \
318                depth, (IV)(cp), (IV)PL_savestack_ix            \
319            )                                                   \
320    );                                                          \
321    regcpblow(cp)
322
323/* set the start and end positions of capture ix */
324#define CLOSE_ANY_CAPTURE(rex, ix, s, e)                                    \
325    RXp_OFFSp(rex)[(ix)].start = (s);                                       \
326    RXp_OFFSp(rex)[(ix)].end = (e)
327
328#define CLOSE_CAPTURE(rex, ix, s, e)                                        \
329    CLOSE_ANY_CAPTURE(rex, ix, s, e);                                       \
330    if (ix > RXp_LASTPAREN(rex))                                            \
331        RXp_LASTPAREN(rex) = (ix);                                          \
332    RXp_LASTCLOSEPAREN(rex) = (ix);                                         \
333    DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_                             \
334        "CLOSE: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf " .. %" IVdf " max: %" UVuf "\n", \
335        depth,                                                              \
336        PTR2UV(rex),                                                        \
337        PTR2UV(RXp_OFFSp(rex)),                                             \
338        (UV)(ix),                                                           \
339        (IV)RXp_OFFSp(rex)[ix].start,                                       \
340        (IV)RXp_OFFSp(rex)[ix].end,                                         \
341        (UV)RXp_LASTPAREN(rex)                                              \
342    ))
343
344/* the lp and lcp args match the relevant members of the
345 * regexp structure, but in practice they should all be U16
346 * instead as we have a hard limit of U16_MAX parens. See
347 * line 4003 or so of regcomp.c where we parse OPEN parens
348 * of various types. */
349PERL_STATIC_INLINE void
350S_unwind_paren(pTHX_ regexp *rex, U32 lp, U32 lcp comma_pDEPTH) {
351    PERL_ARGS_ASSERT_UNWIND_PAREN;
352    U32 n;
353    DECLARE_AND_GET_RE_DEBUG_FLAGS;
354    DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
355        "UNWIND_PAREN: rex=0x%" UVxf " offs=0x%" UVxf
356        ": invalidate (%" UVuf " .. %" UVuf ") set lcp: %" UVuf "\n",
357        depth,
358        PTR2UV(rex),
359        PTR2UV(RXp_OFFSp(rex)),
360        (UV)(lp),
361        (UV)(RXp_LASTPAREN(rex)),
362        (UV)(lcp)
363    ));
364    for (n = RXp_LASTPAREN(rex); n > lp; n--) {
365        RXp_OFFSp(rex)[n].end = -1;
366    }
367    RXp_LASTPAREN(rex) = n;
368    RXp_LASTCLOSEPAREN(rex) = lcp;
369}
370#define UNWIND_PAREN(lp,lcp) unwind_paren(rex,lp,lcp)
371
372PERL_STATIC_INLINE void
373S_capture_clear(pTHX_ regexp *rex, U16 from_ix, U16 to_ix, const char *str comma_pDEPTH) {
374    PERL_ARGS_ASSERT_CAPTURE_CLEAR;
375    PERL_UNUSED_ARG(str); /* only used for debugging */
376    U16 my_ix;
377    DECLARE_AND_GET_RE_DEBUG_FLAGS;
378    for ( my_ix = from_ix; my_ix <= to_ix; my_ix++ ) {
379        DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
380                "CAPTURE_CLEAR %s \\%" IVdf ": "
381                "%" IVdf "(%" IVdf ") .. %" IVdf
382                " => "
383                "%" IVdf "(%" IVdf ") .. %" IVdf
384                "\n",
385            depth, str, (IV)my_ix,
386            (IV)RXp_OFFSp(rex)[my_ix].start,
387            (IV)RXp_OFFSp(rex)[my_ix].start_tmp,
388            (IV)RXp_OFFSp(rex)[my_ix].end,
389            (IV)-1, (IV)-1, (IV)-1));
390        RXp_OFFSp(rex)[my_ix].start = -1;
391        RXp_OFFSp(rex)[my_ix].start_tmp = -1;
392        RXp_OFFSp(rex)[my_ix].end = -1;
393    }
394}
395
396#define CAPTURE_CLEAR(from_ix, to_ix, str) \
397    if (from_ix) capture_clear(rex,from_ix, to_ix, str)
398
399STATIC void
400S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p comma_pDEPTH)
401{
402    UV i;
403    U32 paren;
404    DECLARE_AND_GET_RE_DEBUG_FLAGS;
405
406    PERL_ARGS_ASSERT_REGCPPOP;
407
408
409    DEBUG_BUFFERS_r({
410        Perl_re_exec_indentf(aTHX_
411                "starting regcppop at %" IVdf "\n",
412                depth, PL_savestack_ix);
413    });
414
415    /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
416    i = SSPOPUV;
417    assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
418    i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
419    RXp_LASTCLOSEPAREN(rex) = SSPOPINT;
420    RXp_LASTPAREN(rex) = SSPOPINT;
421    *maxopenparen_p = SSPOPINT;
422
423    i -= REGCP_OTHER_ELEMS;
424    /* Now restore the parentheses context. */
425    DEBUG_BUFFERS_r(
426        if (i || RXp_LASTPAREN(rex) + 1 <= rex->nparens)
427            Perl_re_exec_indentf( aTHX_
428                "rex=0x%" UVxf " offs=0x%" UVxf ": restoring capture indices to:\n",
429                depth,
430                PTR2UV(rex),
431                PTR2UV(RXp_OFFSp(rex))
432            );
433    );
434    /* substract remaining elements from the stack */
435    PL_savestack_ix -= i;
436
437    /* static assert that offs struc size is not less than stack elem size */
438    STATIC_ASSERT_STMT(sizeof(*RXp_OFFSp(rex)) >= sizeof(*PL_savestack));
439
440    /* calculate actual number of offs/capture groups stored */
441    /* by doing integer division (leaving potential alignment aside) */
442    i = (i * sizeof(*PL_savestack)) / sizeof(*RXp_OFFSp(rex));
443
444    /* calculate paren starting point */
445    /* i is our number of entries which we are subtracting from *maxopenparen_p */
446    /* and we are storing + 1 this to get the beginning */
447    paren = *maxopenparen_p - i + 1;
448
449    /* restore them */
450    memcpy(RXp_OFFSp(rex) + paren, &PL_savestack[PL_savestack_ix], i * sizeof(*RXp_OFFSp(rex)));
451
452    DEBUG_BUFFERS_r(
453        for (; paren <= *maxopenparen_p; ++paren) {
454            Perl_re_exec_indentf(aTHX_
455                "    \\%" UVuf " %" IVdf "(%" IVdf ") .. %" IVdf " %s (regcppop)\n",
456                depth,
457                (UV)paren,
458                (IV)RXp_OFFSp(rex)[paren].start,
459                (IV)RXp_OFFSp(rex)[paren].start_tmp,
460                (IV)RXp_OFFSp(rex)[paren].end,
461                (paren > RXp_LASTPAREN(rex) ? "(skipped)" : ""));
462        }
463    );
464#if 1
465    /* It would seem that the similar code in regtry()
466     * already takes care of this, and in fact it is in
467     * a better location to since this code can #if 0-ed out
468     * but the code in regtry() is needed or otherwise tests
469     * requiring null fields (pat.t#187 and split.t#{13,14}
470     * (as of patchlevel 7877)  will fail.  Then again,
471     * this code seems to be necessary or otherwise
472     * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
473     * --jhi updated by dapm */
474    for (i = RXp_LASTPAREN(rex) + 1; i <= rex->nparens; i++) {
475        if (i > *maxopenparen_p) {
476            RXp_OFFSp(rex)[i].start = -1;
477        }
478        RXp_OFFSp(rex)[i].end = -1;
479        DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
480            "    \\%" UVuf ": %s   ..-1 undeffing (regcppop)\n",
481            depth,
482            (UV)i,
483            (i > *maxopenparen_p) ? "-1" : "  "
484        ));
485    }
486#endif
487    DEBUG_BUFFERS_r({
488        Perl_re_exec_indentf(aTHX_
489                "finished regcppop at %" IVdf "\n",
490                depth, PL_savestack_ix);
491    });
492}
493
494/* restore the parens and associated vars at savestack position ix,
495 * but without popping the stack */
496
497STATIC void
498S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p comma_pDEPTH)
499{
500    I32 tmpix = PL_savestack_ix;
501    PERL_ARGS_ASSERT_REGCP_RESTORE;
502
503    PL_savestack_ix = ix;
504    regcppop(rex, maxopenparen_p);
505    PL_savestack_ix = tmpix;
506}
507
508#define regcpblow(cp) LEAVE_SCOPE(cp)	/* Ignores regcppush()ed data. */
509
510STATIC bool
511S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
512{
513    /* Returns a boolean as to whether or not 'character' is a member of the
514     * Posix character class given by 'classnum' that should be equivalent to a
515     * value in the typedef 'char_class_number_'.
516     *
517     * Ideally this could be replaced by a just an array of function pointers
518     * to the C library functions that implement the macros this calls.
519     * However, to compile, the precise function signatures are required, and
520     * these may vary from platform to platform.  To avoid having to figure
521     * out what those all are on each platform, I (khw) am using this method,
522     * which adds an extra layer of function call overhead (unless the C
523     * optimizer strips it away).  But we don't particularly care about
524     * performance with locales anyway. */
525
526    if (IN_UTF8_CTYPE_LOCALE) {
527        return cBOOL(generic_isCC_(character, classnum));
528    }
529
530    switch ((char_class_number_) classnum) {
531        case CC_ENUM_ALPHANUMERIC_: return isU8_ALPHANUMERIC_LC(character);
532        case CC_ENUM_ALPHA_:        return    isU8_ALPHA_LC(character);
533        case CC_ENUM_ASCII_:        return    isU8_ASCII_LC(character);
534        case CC_ENUM_BLANK_:        return    isU8_BLANK_LC(character);
535        case CC_ENUM_CASED_:        return    isU8_CASED_LC(character);
536        case CC_ENUM_CNTRL_:        return    isU8_CNTRL_LC(character);
537        case CC_ENUM_DIGIT_:        return    isU8_DIGIT_LC(character);
538        case CC_ENUM_GRAPH_:        return    isU8_GRAPH_LC(character);
539        case CC_ENUM_LOWER_:        return    isU8_LOWER_LC(character);
540        case CC_ENUM_PRINT_:        return    isU8_PRINT_LC(character);
541        case CC_ENUM_PUNCT_:        return    isU8_PUNCT_LC(character);
542        case CC_ENUM_SPACE_:        return    isU8_SPACE_LC(character);
543        case CC_ENUM_UPPER_:        return    isU8_UPPER_LC(character);
544        case CC_ENUM_WORDCHAR_:     return isU8_WORDCHAR_LC(character);
545        case CC_ENUM_XDIGIT_:       return   isU8_XDIGIT_LC(character);
546        default:    /* VERTSPACE should never occur in locales */
547            break;
548    }
549
550    Perl_croak(aTHX_
551               "panic: isFOO_lc() has an unexpected character class '%d'",
552               classnum);
553
554    NOT_REACHED; /* NOTREACHED */
555    return FALSE;
556}
557
558PERL_STATIC_INLINE I32
559S_foldEQ_latin1_s2_folded(pTHX_ const char *s1, const char *s2, I32 len)
560{
561    /* Compare non-UTF-8 using Unicode (Latin1) semantics.  s2 must already be
562     * folded.  Works on all folds representable without UTF-8, except for
563     * LATIN_SMALL_LETTER_SHARP_S, and does not check for this.  Nor does it
564     * check that the strings each have at least 'len' characters.
565     *
566     * There is almost an identical API function where s2 need not be folded:
567     * Perl_foldEQ_latin1() */
568
569    const U8 *a = (const U8 *)s1;
570    const U8 *b = (const U8 *)s2;
571
572    PERL_ARGS_ASSERT_FOLDEQ_LATIN1_S2_FOLDED;
573
574    assert(len >= 0);
575
576    while (len--) {
577        assert(! isUPPER_L1(*b));
578        if (toLOWER_L1(*a) != *b) {
579            return 0;
580        }
581        a++, b++;
582    }
583    return 1;
584}
585
586STATIC bool
587S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character, const U8* e)
588{
589    /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
590     * 'character' is a member of the Posix character class given by 'classnum'
591     * that should be equivalent to a value in the typedef
592     * 'char_class_number_'.
593     *
594     * This just calls isFOO_lc on the code point for the character if it is in
595     * the range 0-255.  Outside that range, all characters use Unicode
596     * rules, ignoring any locale.  So use the Unicode function if this class
597     * requires an inversion list, and use the Unicode macro otherwise. */
598
599
600    PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
601
602    if (UTF8_IS_INVARIANT(*character)) {
603        return isFOO_lc(classnum, *character);
604    }
605    else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
606        return isFOO_lc(classnum,
607                        EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
608    }
609
610    _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, e);
611
612    switch ((char_class_number_) classnum) {
613        case CC_ENUM_SPACE_:     return is_XPERLSPACE_high(character);
614        case CC_ENUM_BLANK_:     return is_HORIZWS_high(character);
615        case CC_ENUM_XDIGIT_:    return is_XDIGIT_high(character);
616        case CC_ENUM_VERTSPACE_: return is_VERTWS_high(character);
617        default:
618            return _invlist_contains_cp(PL_XPosix_ptrs[classnum],
619                                        utf8_to_uvchr_buf(character, e, NULL));
620    }
621    NOT_REACHED; /* NOTREACHED */
622}
623
624STATIC U8 *
625S_find_span_end(U8 * s, const U8 * send, const U8 span_byte)
626{
627    /* Returns the position of the first byte in the sequence between 's' and
628     * 'send-1' inclusive that isn't 'span_byte'; returns 'send' if none found.
629     * */
630
631    PERL_ARGS_ASSERT_FIND_SPAN_END;
632
633    assert(send >= s);
634
635    if ((STRLEN) (send - s) >= PERL_WORDSIZE
636                          + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
637                          - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
638    {
639        PERL_UINTMAX_T span_word;
640
641        /* Process per-byte until reach word boundary.  XXX This loop could be
642         * eliminated if we knew that this platform had fast unaligned reads */
643        while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
644            if (*s != span_byte) {
645                return s;
646            }
647            s++;
648        }
649
650        /* Create a word filled with the bytes we are spanning */
651        span_word = PERL_COUNT_MULTIPLIER * span_byte;
652
653        /* Process per-word as long as we have at least a full word left */
654        do {
655
656            /* Keep going if the whole word is composed of 'span_byte's */
657            if ((* (PERL_UINTMAX_T *) s) == span_word)  {
658                s += PERL_WORDSIZE;
659                continue;
660            }
661
662            /* Here, at least one byte in the word isn't 'span_byte'. */
663
664#ifdef EBCDIC
665
666            break;
667
668#else
669
670            /* This xor leaves 1 bits only in those non-matching bytes */
671            span_word ^= * (PERL_UINTMAX_T *) s;
672
673            /* Make sure the upper bit of each non-matching byte is set.  This
674             * makes each such byte look like an ASCII platform variant byte */
675            span_word |= span_word << 1;
676            span_word |= span_word << 2;
677            span_word |= span_word << 4;
678
679            /* That reduces the problem to what this function solves */
680            return s + variant_byte_number(span_word);
681
682#endif
683
684        } while (s + PERL_WORDSIZE <= send);
685    }
686
687    /* Process the straggler bytes beyond the final word boundary */
688    while (s < send) {
689        if (*s != span_byte) {
690            return s;
691        }
692        s++;
693    }
694
695    return s;
696}
697
698STATIC U8 *
699S_find_next_masked(U8 * s, const U8 * send, const U8 byte, const U8 mask)
700{
701    /* Returns the position of the first byte in the sequence between 's'
702     * and 'send-1' inclusive that when ANDed with 'mask' yields 'byte';
703     * returns 'send' if none found.  It uses word-level operations instead of
704     * byte to speed up the process */
705
706    PERL_ARGS_ASSERT_FIND_NEXT_MASKED;
707
708    assert(send >= s);
709    assert((byte & mask) == byte);
710
711#ifndef EBCDIC
712
713    if ((STRLEN) (send - s) >= PERL_WORDSIZE
714                          + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
715                          - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
716    {
717        PERL_UINTMAX_T word, mask_word;
718
719        while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
720            if (((*s) & mask) == byte) {
721                return s;
722            }
723            s++;
724        }
725
726        word      = PERL_COUNT_MULTIPLIER * byte;
727        mask_word = PERL_COUNT_MULTIPLIER * mask;
728
729        do {
730            PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
731
732            /* If 'masked' contains bytes with the bit pattern of 'byte' within
733             * it, xoring with 'word' will leave each of the 8 bits in such
734             * bytes be 0, and no byte containing any other bit pattern will be
735             * 0. */
736            masked ^= word;
737
738            /* This causes the most significant bit to be set to 1 for any
739             * bytes in the word that aren't completely 0 */
740            masked |= masked << 1;
741            masked |= masked << 2;
742            masked |= masked << 4;
743
744            /* The msbits are the same as what marks a byte as variant, so we
745             * can use this mask.  If all msbits are 1, the word doesn't
746             * contain 'byte' */
747            if ((masked & PERL_VARIANTS_WORD_MASK) == PERL_VARIANTS_WORD_MASK) {
748                s += PERL_WORDSIZE;
749                continue;
750            }
751
752            /* Here, the msbit of bytes in the word that aren't 'byte' are 1,
753             * and any that are, are 0.  Complement and re-AND to swap that */
754            masked = ~ masked;
755            masked &= PERL_VARIANTS_WORD_MASK;
756
757            /* This reduces the problem to that solved by this function */
758            s += variant_byte_number(masked);
759            return s;
760
761        } while (s + PERL_WORDSIZE <= send);
762    }
763
764#endif
765
766    while (s < send) {
767        if (((*s) & mask) == byte) {
768            return s;
769        }
770        s++;
771    }
772
773    return s;
774}
775
776STATIC U8 *
777S_find_span_end_mask(U8 * s, const U8 * send, const U8 span_byte, const U8 mask)
778{
779    /* Returns the position of the first byte in the sequence between 's' and
780     * 'send-1' inclusive that when ANDed with 'mask' isn't 'span_byte'.
781     * 'span_byte' should have been ANDed with 'mask' in the call of this
782     * function.  Returns 'send' if none found.  Works like find_span_end(),
783     * except for the AND */
784
785    PERL_ARGS_ASSERT_FIND_SPAN_END_MASK;
786
787    assert(send >= s);
788    assert((span_byte & mask) == span_byte);
789
790    if ((STRLEN) (send - s) >= PERL_WORDSIZE
791                          + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
792                          - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
793    {
794        PERL_UINTMAX_T span_word, mask_word;
795
796        while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
797            if (((*s) & mask) != span_byte) {
798                return s;
799            }
800            s++;
801        }
802
803        span_word = PERL_COUNT_MULTIPLIER * span_byte;
804        mask_word = PERL_COUNT_MULTIPLIER * mask;
805
806        do {
807            PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
808
809            if (masked == span_word) {
810                s += PERL_WORDSIZE;
811                continue;
812            }
813
814#ifdef EBCDIC
815
816            break;
817
818#else
819
820            masked ^= span_word;
821            masked |= masked << 1;
822            masked |= masked << 2;
823            masked |= masked << 4;
824            return s + variant_byte_number(masked);
825
826#endif
827
828        } while (s + PERL_WORDSIZE <= send);
829    }
830
831    while (s < send) {
832        if (((*s) & mask) != span_byte) {
833            return s;
834        }
835        s++;
836    }
837
838    return s;
839}
840
841/*
842 * pregexec and friends
843 */
844
845#ifndef PERL_IN_XSUB_RE
846/*
847 - pregexec - match a regexp against a string
848 */
849I32
850Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
851         char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
852/* stringarg: the point in the string at which to begin matching */
853/* strend:    pointer to null at end of string */
854/* strbeg:    real beginning of string */
855/* minend:    end of match must be >= minend bytes after stringarg. */
856/* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
857 *            itself is accessed via the pointers above */
858/* nosave:    For optimizations. */
859{
860    PERL_ARGS_ASSERT_PREGEXEC;
861
862    return
863        regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
864                      nosave ? 0 : REXEC_COPY_STR);
865}
866#endif
867
868
869
870/* re_intuit_start():
871 *
872 * Based on some optimiser hints, try to find the earliest position in the
873 * string where the regex could match.
874 *
875 *   rx:     the regex to match against
876 *   sv:     the SV being matched: only used for utf8 flag; the string
877 *           itself is accessed via the pointers below. Note that on
878 *           something like an overloaded SV, SvPOK(sv) may be false
879 *           and the string pointers may point to something unrelated to
880 *           the SV itself.
881 *   strbeg: real beginning of string
882 *   strpos: the point in the string at which to begin matching
883 *   strend: pointer to the byte following the last char of the string
884 *   flags   currently unused; set to 0
885 *   data:   currently unused; set to NULL
886 *
887 * The basic idea of re_intuit_start() is to use some known information
888 * about the pattern, namely:
889 *
890 *   a) the longest known anchored substring (i.e. one that's at a
891 *      constant offset from the beginning of the pattern; but not
892 *      necessarily at a fixed offset from the beginning of the
893 *      string);
894 *   b) the longest floating substring (i.e. one that's not at a constant
895 *      offset from the beginning of the pattern);
896 *   c) Whether the pattern is anchored to the string; either
897 *      an absolute anchor: /^../, or anchored to \n: /^.../m,
898 *      or anchored to pos(): /\G/;
899 *   d) A start class: a real or synthetic character class which
900 *      represents which characters are legal at the start of the pattern;
901 *
902 * to either quickly reject the match, or to find the earliest position
903 * within the string at which the pattern might match, thus avoiding
904 * running the full NFA engine at those earlier locations, only to
905 * eventually fail and retry further along.
906 *
907 * Returns NULL if the pattern can't match, or returns the address within
908 * the string which is the earliest place the match could occur.
909 *
910 * The longest of the anchored and floating substrings is called 'check'
911 * and is checked first. The other is called 'other' and is checked
912 * second. The 'other' substring may not be present.  For example,
913 *
914 *    /(abc|xyz)ABC\d{0,3}DEFG/
915 *
916 * will have
917 *
918 *   check substr (float)    = "DEFG", offset 6..9 chars
919 *   other substr (anchored) = "ABC",  offset 3..3 chars
920 *   stclass = [ax]
921 *
922 * Be aware that during the course of this function, sometimes 'anchored'
923 * refers to a substring being anchored relative to the start of the
924 * pattern, and sometimes to the pattern itself being anchored relative to
925 * the string. For example:
926 *
927 *   /\dabc/:   "abc" is anchored to the pattern;
928 *   /^\dabc/:  "abc" is anchored to the pattern and the string;
929 *   /\d+abc/:  "abc" is anchored to neither the pattern nor the string;
930 *   /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
931 *                    but the pattern is anchored to the string.
932 */
933
934char *
935Perl_re_intuit_start(pTHX_
936                    REGEXP * const rx,
937                    SV *sv,
938                    const char * const strbeg,
939                    char *strpos,
940                    char *strend,
941                    const U32 flags,
942                    re_scream_pos_data *data)
943{
944    struct regexp *const prog = ReANY(rx);
945    SSize_t start_shift = prog->check_offset_min;
946    /* Should be nonnegative! */
947    SSize_t end_shift   = 0;
948    /* current lowest pos in string where the regex can start matching */
949    char *rx_origin = strpos;
950    SV *check;
951    const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
952    U8   other_ix = 1 - prog->substrs->check_ix;
953    bool ml_anch = 0;
954    char *other_last = strpos;/* latest pos 'other' substr already checked to */
955    char *check_at = NULL;		/* check substr found at this pos */
956    const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
957    RXi_GET_DECL(prog,progi);
958    regmatch_info reginfo_buf;  /* create some info to pass to find_byclass */
959    regmatch_info *const reginfo = &reginfo_buf;
960    DECLARE_AND_GET_RE_DEBUG_FLAGS;
961
962    PERL_ARGS_ASSERT_RE_INTUIT_START;
963    PERL_UNUSED_ARG(flags);
964    PERL_UNUSED_ARG(data);
965
966    DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
967                "Intuit: trying to determine minimum start position...\n"));
968
969    /* for now, assume that all substr offsets are positive. If at some point
970     * in the future someone wants to do clever things with lookbehind and
971     * -ve offsets, they'll need to fix up any code in this function
972     * which uses these offsets. See the thread beginning
973     * <20140113145929.GF27210@iabyn.com>
974     */
975    assert(prog->substrs->data[0].min_offset >= 0);
976    assert(prog->substrs->data[0].max_offset >= 0);
977    assert(prog->substrs->data[1].min_offset >= 0);
978    assert(prog->substrs->data[1].max_offset >= 0);
979    assert(prog->substrs->data[2].min_offset >= 0);
980    assert(prog->substrs->data[2].max_offset >= 0);
981
982    /* for now, assume that if both present, that the floating substring
983     * doesn't start before the anchored substring.
984     * If you break this assumption (e.g. doing better optimisations
985     * with lookahead/behind), then you'll need to audit the code in this
986     * function carefully first
987     */
988    assert(
989            ! (  (prog->anchored_utf8 || prog->anchored_substr)
990              && (prog->float_utf8    || prog->float_substr))
991           || (prog->float_min_offset >= prog->anchored_offset));
992
993    /* byte rather than char calculation for efficiency. It fails
994     * to quickly reject some cases that can't match, but will reject
995     * them later after doing full char arithmetic */
996    if (prog->minlen > strend - strpos) {
997        DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
998                              "  String too short...\n"));
999        goto fail;
1000    }
1001
1002    RXp_MATCH_UTF8_set(prog, utf8_target);
1003    reginfo->is_utf8_target = cBOOL(utf8_target);
1004    reginfo->info_aux = NULL;
1005    reginfo->strbeg = strbeg;
1006    reginfo->strend = strend;
1007    reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
1008    reginfo->intuit = 1;
1009    /* not actually used within intuit, but zero for safety anyway */
1010    reginfo->poscache_maxiter = 0;
1011
1012    if (utf8_target) {
1013        if ((!prog->anchored_utf8 && prog->anchored_substr)
1014                || (!prog->float_utf8 && prog->float_substr))
1015            to_utf8_substr(prog);
1016        check = prog->check_utf8;
1017    } else {
1018        if (!prog->check_substr && prog->check_utf8) {
1019            if (! to_byte_substr(prog)) {
1020                NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
1021            }
1022        }
1023        check = prog->check_substr;
1024    }
1025
1026    /* dump the various substring data */
1027    DEBUG_OPTIMISE_MORE_r({
1028        int i;
1029        for (i=0; i<=2; i++) {
1030            SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
1031                                  : prog->substrs->data[i].substr);
1032            if (!sv)
1033                continue;
1034
1035            Perl_re_printf( aTHX_
1036                "  substrs[%d]: min=%" IVdf " max=%" IVdf " end shift=%" IVdf
1037                " useful=%" IVdf " utf8=%d [%s]\n",
1038                i,
1039                (IV)prog->substrs->data[i].min_offset,
1040                (IV)prog->substrs->data[i].max_offset,
1041                (IV)prog->substrs->data[i].end_shift,
1042                BmUSEFUL(sv),
1043                utf8_target ? 1 : 0,
1044                SvPEEK(sv));
1045        }
1046    });
1047
1048    if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
1049
1050        /* ml_anch: check after \n?
1051         *
1052         * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning
1053         * with /.*.../, these flags will have been added by the
1054         * compiler:
1055         *   /.*abc/, /.*abc/m:  PREGf_IMPLICIT | PREGf_ANCH_MBOL
1056         *   /.*abc/s:           PREGf_IMPLICIT | PREGf_ANCH_SBOL
1057         */
1058        ml_anch =      (prog->intflags & PREGf_ANCH_MBOL)
1059                   && !(prog->intflags & PREGf_IMPLICIT);
1060
1061        if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
1062            /* we are only allowed to match at BOS or \G */
1063
1064            /* trivially reject if there's a BOS anchor and we're not at BOS.
1065             *
1066             * Note that we don't try to do a similar quick reject for
1067             * \G, since generally the caller will have calculated strpos
1068             * based on pos() and gofs, so the string is already correctly
1069             * anchored by definition; and handling the exceptions would
1070             * be too fiddly (e.g. REXEC_IGNOREPOS).
1071             */
1072            if (   strpos != strbeg
1073                && (prog->intflags & PREGf_ANCH_SBOL))
1074            {
1075                DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1076                                "  Not at start...\n"));
1077                goto fail;
1078            }
1079
1080            /* in the presence of an anchor, the anchored (relative to the
1081             * start of the regex) substr must also be anchored relative
1082             * to strpos. So quickly reject if substr isn't found there.
1083             * This works for \G too, because the caller will already have
1084             * subtracted gofs from pos, and gofs is the offset from the
1085             * \G to the start of the regex. For example, in /.abc\Gdef/,
1086             * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
1087             * caller will have set strpos=pos()-4; we look for the substr
1088             * at position pos()-4+1, which lines up with the "a" */
1089
1090            if (prog->check_offset_min == prog->check_offset_max) {
1091                /* Substring at constant offset from beg-of-str... */
1092                SSize_t slen = SvCUR(check);
1093                char *s = HOP3c(strpos, prog->check_offset_min, strend);
1094
1095                DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1096                    "  Looking for check substr at fixed offset %" IVdf "...\n",
1097                    (IV)prog->check_offset_min));
1098
1099                if (SvTAIL(check)) {
1100                    /* In this case, the regex is anchored at the end too.
1101                     * Unless it's a multiline match, the lengths must match
1102                     * exactly, give or take a \n.  NB: slen >= 1 since
1103                     * the last char of check is \n */
1104                    if (!multiline
1105                        && (   strend - s > slen
1106                            || strend - s < slen - 1
1107                            || (strend - s == slen && strend[-1] != '\n')))
1108                    {
1109                        DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1110                                            "  String too long...\n"));
1111                        goto fail_finish;
1112                    }
1113                    /* Now should match s[0..slen-2] */
1114                    slen--;
1115                }
1116                if (slen && (strend - s < slen
1117                    || *SvPVX_const(check) != *s
1118                    || (slen > 1 && (memNE(SvPVX_const(check), s, slen)))))
1119                {
1120                    DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1121                                    "  String not equal...\n"));
1122                    goto fail_finish;
1123                }
1124
1125                check_at = s;
1126                goto success_at_start;
1127            }
1128        }
1129    }
1130
1131    end_shift = prog->check_end_shift;
1132
1133#ifdef DEBUGGING	/* 7/99: reports of failure (with the older version) */
1134    if (end_shift < 0)
1135        Perl_croak(aTHX_ "panic: end_shift: %" IVdf " pattern:\n%s\n ",
1136                   (IV)end_shift, RX_PRECOMP(rx));
1137#endif
1138
1139  restart:
1140
1141    /* This is the (re)entry point of the main loop in this function.
1142     * The goal of this loop is to:
1143     * 1) find the "check" substring in the region rx_origin..strend
1144     *    (adjusted by start_shift / end_shift). If not found, reject
1145     *    immediately.
1146     * 2) If it exists, look for the "other" substr too if defined; for
1147     *    example, if the check substr maps to the anchored substr, then
1148     *    check the floating substr, and vice-versa. If not found, go
1149     *    back to (1) with rx_origin suitably incremented.
1150     * 3) If we find an rx_origin position that doesn't contradict
1151     *    either of the substrings, then check the possible additional
1152     *    constraints on rx_origin of /^.../m or a known start class.
1153     *    If these fail, then depending on which constraints fail, jump
1154     *    back to here, or to various other re-entry points further along
1155     *    that skip some of the first steps.
1156     * 4) If we pass all those tests, update the BmUSEFUL() count on the
1157     *    substring. If the start position was determined to be at the
1158     *    beginning of the string  - so, not rejected, but not optimised,
1159     *    since we have to run regmatch from position 0 - decrement the
1160     *    BmUSEFUL() count. Otherwise increment it.
1161     */
1162
1163
1164    /* first, look for the 'check' substring */
1165
1166    {
1167        U8* start_point;
1168        U8* end_point;
1169
1170        DEBUG_OPTIMISE_MORE_r({
1171            Perl_re_printf( aTHX_
1172                "  At restart: rx_origin=%" IVdf " Check offset min: %" IVdf
1173                " Start shift: %" IVdf " End shift %" IVdf
1174                " Real end Shift: %" IVdf "\n",
1175                (IV)(rx_origin - strbeg),
1176                (IV)prog->check_offset_min,
1177                (IV)start_shift,
1178                (IV)end_shift,
1179                (IV)prog->check_end_shift);
1180        });
1181
1182        end_point = HOPBACK3(strend, end_shift, rx_origin);
1183        if (!end_point)
1184            goto fail_finish;
1185        start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
1186        if (!start_point)
1187            goto fail_finish;
1188
1189
1190        /* If the regex is absolutely anchored to either the start of the
1191         * string (SBOL) or to pos() (ANCH_GPOS), then
1192         * check_offset_max represents an upper bound on the string where
1193         * the substr could start. For the ANCH_GPOS case, we assume that
1194         * the caller of intuit will have already set strpos to
1195         * pos()-gofs, so in this case strpos + offset_max will still be
1196         * an upper bound on the substr.
1197         */
1198        if (!ml_anch
1199            && prog->intflags & PREGf_ANCH
1200            && prog->check_offset_max != SSize_t_MAX)
1201        {
1202            SSize_t check_len = SvCUR(check) - cBOOL(SvTAIL(check));
1203            const char * const anchor =
1204                        (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
1205            SSize_t targ_len = (char*)end_point - anchor;
1206
1207            if (check_len > targ_len) {
1208                DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1209                              "Target string too short to match required substring...\n"));
1210                goto fail_finish;
1211            }
1212
1213            /* do a bytes rather than chars comparison. It's conservative;
1214             * so it skips doing the HOP if the result can't possibly end
1215             * up earlier than the old value of end_point.
1216             */
1217            assert(anchor + check_len <= (char *)end_point);
1218            if (prog->check_offset_max + check_len < targ_len) {
1219                end_point = HOP3lim((U8*)anchor,
1220                                prog->check_offset_max,
1221                                end_point - check_len
1222                            )
1223                            + check_len;
1224                if (end_point < start_point)
1225                    goto fail_finish;
1226            }
1227        }
1228
1229        check_at = fbm_instr( start_point, end_point,
1230                      check, multiline ? FBMrf_MULTILINE : 0);
1231
1232        DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1233            "  doing 'check' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1234            (IV)((char*)start_point - strbeg),
1235            (IV)((char*)end_point   - strbeg),
1236            (IV)(check_at ? check_at - strbeg : -1)
1237        ));
1238
1239        /* Update the count-of-usability, remove useless subpatterns,
1240            unshift s.  */
1241
1242        DEBUG_EXECUTE_r({
1243            RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1244                SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
1245            Perl_re_printf( aTHX_  "  %s %s substr %s%s%s",
1246                              (check_at ? "Found" : "Did not find"),
1247                (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
1248                    ? "anchored" : "floating"),
1249                quoted,
1250                RE_SV_TAIL(check),
1251                (check_at ? " at offset " : "...\n") );
1252        });
1253
1254        if (!check_at)
1255            goto fail_finish;
1256        /* set rx_origin to the minimum position where the regex could start
1257         * matching, given the constraint of the just-matched check substring.
1258         * But don't set it lower than previously.
1259         */
1260
1261        if (check_at - rx_origin > prog->check_offset_max)
1262            rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
1263        /* Finish the diagnostic message */
1264        DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1265            "%ld (rx_origin now %" IVdf ")...\n",
1266            (long)(check_at - strbeg),
1267            (IV)(rx_origin - strbeg)
1268        ));
1269    }
1270
1271
1272    /* now look for the 'other' substring if defined */
1273
1274    if (prog->substrs->data[other_ix].utf8_substr
1275        || prog->substrs->data[other_ix].substr)
1276    {
1277        /* Take into account the "other" substring. */
1278        char *last, *last1;
1279        char *s;
1280        SV* must;
1281        struct reg_substr_datum *other;
1282
1283      do_other_substr:
1284        other = &prog->substrs->data[other_ix];
1285        if (!utf8_target && !other->substr) {
1286            if (!to_byte_substr(prog)) {
1287                NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
1288            }
1289        }
1290
1291        /* if "other" is anchored:
1292         * we've previously found a floating substr starting at check_at.
1293         * This means that the regex origin must lie somewhere
1294         * between min (rx_origin): HOP3(check_at, -check_offset_max)
1295         * and max:                 HOP3(check_at, -check_offset_min)
1296         * (except that min will be >= strpos)
1297         * So the fixed  substr must lie somewhere between
1298         *  HOP3(min, anchored_offset)
1299         *  HOP3(max, anchored_offset) + SvCUR(substr)
1300         */
1301
1302        /* if "other" is floating
1303         * Calculate last1, the absolute latest point where the
1304         * floating substr could start in the string, ignoring any
1305         * constraints from the earlier fixed match. It is calculated
1306         * as follows:
1307         *
1308         * strend - prog->minlen (in chars) is the absolute latest
1309         * position within the string where the origin of the regex
1310         * could appear. The latest start point for the floating
1311         * substr is float_min_offset(*) on from the start of the
1312         * regex.  last1 simply combines thee two offsets.
1313         *
1314         * (*) You might think the latest start point should be
1315         * float_max_offset from the regex origin, and technically
1316         * you'd be correct. However, consider
1317         *    /a\d{2,4}bcd\w/
1318         * Here, float min, max are 3,5 and minlen is 7.
1319         * This can match either
1320         *    /a\d\dbcd\w/
1321         *    /a\d\d\dbcd\w/
1322         *    /a\d\d\d\dbcd\w/
1323         * In the first case, the regex matches minlen chars; in the
1324         * second, minlen+1, in the third, minlen+2.
1325         * In the first case, the floating offset is 3 (which equals
1326         * float_min), in the second, 4, and in the third, 5 (which
1327         * equals float_max). In all cases, the floating string bcd
1328         * can never start more than 4 chars from the end of the
1329         * string, which equals minlen - float_min. As the substring
1330         * starts to match more than float_min from the start of the
1331         * regex, it makes the regex match more than minlen chars,
1332         * and the two cancel each other out. So we can always use
1333         * float_min - minlen, rather than float_max - minlen for the
1334         * latest position in the string.
1335         *
1336         * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1337         * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1338         */
1339
1340        assert(prog->minlen >= other->min_offset);
1341        last1 = HOP3c(strend,
1342                        other->min_offset - prog->minlen, strbeg);
1343
1344        if (other_ix) {/* i.e. if (other-is-float) */
1345            /* last is the latest point where the floating substr could
1346             * start, *given* any constraints from the earlier fixed
1347             * match. This constraint is that the floating string starts
1348             * <= float_max_offset chars from the regex origin (rx_origin).
1349             * If this value is less than last1, use it instead.
1350             */
1351            assert(rx_origin <= last1);
1352            last =
1353                /* this condition handles the offset==infinity case, and
1354                 * is a short-cut otherwise. Although it's comparing a
1355                 * byte offset to a char length, it does so in a safe way,
1356                 * since 1 char always occupies 1 or more bytes,
1357                 * so if a string range is  (last1 - rx_origin) bytes,
1358                 * it will be less than or equal to  (last1 - rx_origin)
1359                 * chars; meaning it errs towards doing the accurate HOP3
1360                 * rather than just using last1 as a short-cut */
1361                (last1 - rx_origin) < other->max_offset
1362                    ? last1
1363                    : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1364        }
1365        else {
1366            assert(strpos + start_shift <= check_at);
1367            last = HOP4c(check_at, other->min_offset - start_shift,
1368                        strbeg, strend);
1369        }
1370
1371        s = HOP3c(rx_origin, other->min_offset, strend);
1372        if (s < other_last)	/* These positions already checked */
1373            s = other_last;
1374
1375        must = utf8_target ? other->utf8_substr : other->substr;
1376        assert(SvPOK(must));
1377        {
1378            char *from = s;
1379            char *to   = last + SvCUR(must) - (SvTAIL(must)!=0);
1380
1381            if (to > strend)
1382                to = strend;
1383            if (from > to) {
1384                s = NULL;
1385                DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1386                    "  skipping 'other' fbm scan: %" IVdf " > %" IVdf "\n",
1387                    (IV)(from - strbeg),
1388                    (IV)(to   - strbeg)
1389                ));
1390            }
1391            else {
1392                s = fbm_instr(
1393                    (unsigned char*)from,
1394                    (unsigned char*)to,
1395                    must,
1396                    multiline ? FBMrf_MULTILINE : 0
1397                );
1398                DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1399                    "  doing 'other' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1400                    (IV)(from - strbeg),
1401                    (IV)(to   - strbeg),
1402                    (IV)(s ? s - strbeg : -1)
1403                ));
1404            }
1405        }
1406
1407        DEBUG_EXECUTE_r({
1408            RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1409                SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1410            Perl_re_printf( aTHX_  "  %s %s substr %s%s",
1411                s ? "Found" : "Contradicts",
1412                other_ix ? "floating" : "anchored",
1413                quoted, RE_SV_TAIL(must));
1414        });
1415
1416
1417        if (!s) {
1418            /* last1 is latest possible substr location. If we didn't
1419             * find it before there, we never will */
1420            if (last >= last1) {
1421                DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1422                                        "; giving up...\n"));
1423                goto fail_finish;
1424            }
1425
1426            /* try to find the check substr again at a later
1427             * position. Maybe next time we'll find the "other" substr
1428             * in range too */
1429            other_last = HOP3c(last, 1, strend) /* highest failure */;
1430            rx_origin =
1431                other_ix /* i.e. if other-is-float */
1432                    ? HOP3c(rx_origin, 1, strend)
1433                    : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1434            DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1435                "; about to retry %s at offset %ld (rx_origin now %" IVdf ")...\n",
1436                (other_ix ? "floating" : "anchored"),
1437                (long)(HOP3c(check_at, 1, strend) - strbeg),
1438                (IV)(rx_origin - strbeg)
1439            ));
1440            goto restart;
1441        }
1442        else {
1443            if (other_ix) { /* if (other-is-float) */
1444                /* other_last is set to s, not s+1, since its possible for
1445                 * a floating substr to fail first time, then succeed
1446                 * second time at the same floating position; e.g.:
1447                 *     "-AB--AABZ" =~ /\wAB\d*Z/
1448                 * The first time round, anchored and float match at
1449                 * "-(AB)--AAB(Z)" then fail on the initial \w character
1450                 * class. Second time round, they match at "-AB--A(AB)(Z)".
1451                 */
1452                other_last = s;
1453            }
1454            else {
1455                rx_origin = HOP3c(s, -other->min_offset, strbeg);
1456                other_last = HOP3c(s, 1, strend);
1457            }
1458            DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1459                " at offset %ld (rx_origin now %" IVdf ")...\n",
1460                  (long)(s - strbeg),
1461                (IV)(rx_origin - strbeg)
1462              ));
1463
1464        }
1465    }
1466    else {
1467        DEBUG_OPTIMISE_MORE_r(
1468            Perl_re_printf( aTHX_
1469                "  Check-only match: offset min:%" IVdf " max:%" IVdf
1470                " check_at:%" IVdf " rx_origin:%" IVdf " rx_origin-check_at:%" IVdf
1471                " strend:%" IVdf "\n",
1472                (IV)prog->check_offset_min,
1473                (IV)prog->check_offset_max,
1474                (IV)(check_at-strbeg),
1475                (IV)(rx_origin-strbeg),
1476                (IV)(rx_origin-check_at),
1477                (IV)(strend-strbeg)
1478            )
1479        );
1480    }
1481
1482  postprocess_substr_matches:
1483
1484    /* handle the extra constraint of /^.../m if present */
1485
1486    if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
1487        char *s;
1488
1489        DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1490                        "  looking for /^/m anchor"));
1491
1492        /* we have failed the constraint of a \n before rx_origin.
1493         * Find the next \n, if any, even if it's beyond the current
1494         * anchored and/or floating substrings. Whether we should be
1495         * scanning ahead for the next \n or the next substr is debatable.
1496         * On the one hand you'd expect rare substrings to appear less
1497         * often than \n's. On the other hand, searching for \n means
1498         * we're effectively flipping between check_substr and "\n" on each
1499         * iteration as the current "rarest" candidate string, which
1500         * means for example that we'll quickly reject the whole string if
1501         * hasn't got a \n, rather than trying every substr position
1502         * first
1503         */
1504
1505        s = HOP3c(strend, - prog->minlen, strpos);
1506        if (s <= rx_origin ||
1507            ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1508        {
1509            DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1510                            "  Did not find /%s^%s/m...\n",
1511                            PL_colors[0], PL_colors[1]));
1512            goto fail_finish;
1513        }
1514
1515        /* earliest possible origin is 1 char after the \n.
1516         * (since *rx_origin == '\n', it's safe to ++ here rather than
1517         * HOP(rx_origin, 1)) */
1518        rx_origin++;
1519
1520        if (prog->substrs->check_ix == 0  /* check is anchored */
1521            || rx_origin >= HOP3c(check_at,  - prog->check_offset_min, strpos))
1522        {
1523            /* Position contradicts check-string; either because
1524             * check was anchored (and thus has no wiggle room),
1525             * or check was float and rx_origin is above the float range */
1526            DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1527                "  Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n",
1528                PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1529            goto restart;
1530        }
1531
1532        /* if we get here, the check substr must have been float,
1533         * is in range, and we may or may not have had an anchored
1534         * "other" substr which still contradicts */
1535        assert(prog->substrs->check_ix); /* check is float */
1536
1537        if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1538            /* whoops, the anchored "other" substr exists, so we still
1539             * contradict. On the other hand, the float "check" substr
1540             * didn't contradict, so just retry the anchored "other"
1541             * substr */
1542            DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1543                "  Found /%s^%s/m, rescanning for anchored from offset %" IVdf " (rx_origin now %" IVdf ")...\n",
1544                PL_colors[0], PL_colors[1],
1545                (IV)(rx_origin - strbeg + prog->anchored_offset),
1546                (IV)(rx_origin - strbeg)
1547            ));
1548            goto do_other_substr;
1549        }
1550
1551        /* success: we don't contradict the found floating substring
1552         * (and there's no anchored substr). */
1553        DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1554            "  Found /%s^%s/m with rx_origin %ld...\n",
1555            PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1556    }
1557    else {
1558        DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1559            "  (multiline anchor test skipped)\n"));
1560    }
1561
1562  success_at_start:
1563
1564
1565    /* if we have a starting character class, then test that extra constraint.
1566     * (trie stclasses are too expensive to use here, we are better off to
1567     * leave it to regmatch itself) */
1568
1569    if (progi->regstclass && REGNODE_TYPE(OP(progi->regstclass))!=TRIE) {
1570        const U8* const str = (U8*)STRING(progi->regstclass);
1571
1572        /* XXX this value could be pre-computed */
1573        const SSize_t cl_l = (REGNODE_TYPE(OP(progi->regstclass)) == EXACT
1574                    ?  (reginfo->is_utf8_pat
1575                        ? (SSize_t)utf8_distance(str + STR_LEN(progi->regstclass), str)
1576                        : (SSize_t)STR_LEN(progi->regstclass))
1577                    : 1);
1578        char * endpos;
1579        char *s;
1580        /* latest pos that a matching float substr constrains rx start to */
1581        char *rx_max_float = NULL;
1582
1583        /* if the current rx_origin is anchored, either by satisfying an
1584         * anchored substring constraint, or a /^.../m constraint, then we
1585         * can reject the current origin if the start class isn't found
1586         * at the current position. If we have a float-only match, then
1587         * rx_origin is constrained to a range; so look for the start class
1588         * in that range. if neither, then look for the start class in the
1589         * whole rest of the string */
1590
1591        /* XXX DAPM it's not clear what the minlen test is for, and why
1592         * it's not used in the floating case. Nothing in the test suite
1593         * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1594         * Here are some old comments, which may or may not be correct:
1595         *
1596         *   minlen == 0 is possible if regstclass is \b or \B,
1597         *   and the fixed substr is ''$.
1598         *   Since minlen is already taken into account, rx_origin+1 is
1599         *   before strend; accidentally, minlen >= 1 guaranties no false
1600         *   positives at rx_origin + 1 even for \b or \B.  But (minlen? 1 :
1601         *   0) below assumes that regstclass does not come from lookahead...
1602         *   If regstclass takes bytelength more than 1: If charlength==1, OK.
1603         *   This leaves EXACTF-ish only, which are dealt with in
1604         *   find_byclass().
1605         */
1606
1607        if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1608            endpos = HOP3clim(rx_origin, (prog->minlen ? cl_l : 0), strend);
1609        else if (prog->float_substr || prog->float_utf8) {
1610            rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1611            endpos = HOP3clim(rx_max_float, cl_l, strend);
1612        }
1613        else
1614            endpos= strend;
1615
1616        DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1617            "  looking for class: start_shift: %" IVdf " check_at: %" IVdf
1618            " rx_origin: %" IVdf " endpos: %" IVdf "\n",
1619              (IV)start_shift, (IV)(check_at - strbeg),
1620              (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
1621
1622        s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
1623                            reginfo);
1624        if (!s) {
1625            if (endpos == strend) {
1626                DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1627                                "  Could not match STCLASS...\n") );
1628                goto fail;
1629            }
1630            DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1631                               "  This position contradicts STCLASS...\n") );
1632            if ((prog->intflags & PREGf_ANCH) && !ml_anch
1633                        && !(prog->intflags & PREGf_IMPLICIT))
1634                goto fail;
1635
1636            /* Contradict one of substrings */
1637            if (prog->anchored_substr || prog->anchored_utf8) {
1638                if (prog->substrs->check_ix == 1) { /* check is float */
1639                    /* Have both, check_string is floating */
1640                    assert(rx_origin + start_shift <= check_at);
1641                    if (rx_origin + start_shift != check_at) {
1642                        /* not at latest position float substr could match:
1643                         * Recheck anchored substring, but not floating.
1644                         * The condition above is in bytes rather than
1645                         * chars for efficiency. It's conservative, in
1646                         * that it errs on the side of doing 'goto
1647                         * do_other_substr'. In this case, at worst,
1648                         * an extra anchored search may get done, but in
1649                         * practice the extra fbm_instr() is likely to
1650                         * get skipped anyway. */
1651                        DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1652                            "  about to retry anchored at offset %ld (rx_origin now %" IVdf ")...\n",
1653                            (long)(other_last - strbeg),
1654                            (IV)(rx_origin - strbeg)
1655                        ));
1656                        goto do_other_substr;
1657                    }
1658                }
1659            }
1660            else {
1661                /* float-only */
1662
1663                if (ml_anch) {
1664                    /* In the presence of ml_anch, we might be able to
1665                     * find another \n without breaking the current float
1666                     * constraint. */
1667
1668                    /* strictly speaking this should be HOP3c(..., 1, ...),
1669                     * but since we goto a block of code that's going to
1670                     * search for the next \n if any, its safe here */
1671                    rx_origin++;
1672                    DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1673                              "  about to look for /%s^%s/m starting at rx_origin %ld...\n",
1674                              PL_colors[0], PL_colors[1],
1675                              (long)(rx_origin - strbeg)) );
1676                    goto postprocess_substr_matches;
1677                }
1678
1679                /* strictly speaking this can never be true; but might
1680                 * be if we ever allow intuit without substrings */
1681                if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
1682                    goto fail;
1683
1684                rx_origin = rx_max_float;
1685            }
1686
1687            /* at this point, any matching substrings have been
1688             * contradicted. Start again... */
1689
1690            rx_origin = HOP3c(rx_origin, 1, strend);
1691
1692            /* uses bytes rather than char calculations for efficiency.
1693             * It's conservative: it errs on the side of doing 'goto restart',
1694             * where there is code that does a proper char-based test */
1695            if (rx_origin + start_shift + end_shift > strend) {
1696                DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1697                                       "  Could not match STCLASS...\n") );
1698                goto fail;
1699            }
1700            DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1701                "  about to look for %s substr starting at offset %ld (rx_origin now %" IVdf ")...\n",
1702                (prog->substrs->check_ix ? "floating" : "anchored"),
1703                (long)(rx_origin + start_shift - strbeg),
1704                (IV)(rx_origin - strbeg)
1705            ));
1706            goto restart;
1707        }
1708
1709        /* Success !!! */
1710
1711        if (rx_origin != s) {
1712            DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1713                        "  By STCLASS: moving %ld --> %ld\n",
1714                                  (long)(rx_origin - strbeg), (long)(s - strbeg))
1715                   );
1716        }
1717        else {
1718            DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1719                                  "  Does not contradict STCLASS...\n");
1720                   );
1721        }
1722    }
1723
1724    /* Decide whether using the substrings helped */
1725
1726    if (rx_origin != strpos) {
1727        /* Fixed substring is found far enough so that the match
1728           cannot start at strpos. */
1729
1730        DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "  try at offset...\n"));
1731        ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);	/* hooray/5 */
1732    }
1733    else {
1734        /* The found rx_origin position does not prohibit matching at
1735         * strpos, so calling intuit didn't gain us anything. Decrement
1736         * the BmUSEFUL() count on the check substring, and if we reach
1737         * zero, free it.  */
1738        if (!(prog->intflags & PREGf_NAUGHTY)
1739            && (utf8_target ? (
1740                prog->check_utf8		/* Could be deleted already */
1741                && --BmUSEFUL(prog->check_utf8) < 0
1742                && (prog->check_utf8 == prog->float_utf8)
1743            ) : (
1744                prog->check_substr		/* Could be deleted already */
1745                && --BmUSEFUL(prog->check_substr) < 0
1746                && (prog->check_substr == prog->float_substr)
1747            )))
1748        {
1749            /* If flags & SOMETHING - do not do it many times on the same match */
1750            DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "  ... Disabling check substring...\n"));
1751            /* XXX Does the destruction order has to change with utf8_target? */
1752            SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1753            SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1754            prog->check_substr = prog->check_utf8 = NULL;	/* disable */
1755            prog->float_substr = prog->float_utf8 = NULL;	/* clear */
1756            check = NULL;			/* abort */
1757            /* XXXX This is a remnant of the old implementation.  It
1758                    looks wasteful, since now INTUIT can use many
1759                    other heuristics. */
1760            prog->extflags &= ~RXf_USE_INTUIT;
1761        }
1762    }
1763
1764    DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1765            "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
1766             PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) );
1767
1768    return rx_origin;
1769
1770  fail_finish:				/* Substring not found */
1771    if (prog->check_substr || prog->check_utf8)		/* could be removed already */
1772        BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1773  fail:
1774    DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch rejected by optimizer%s\n",
1775                          PL_colors[4], PL_colors[5]));
1776    return NULL;
1777}
1778
1779
1780#define DECL_TRIE_TYPE(scan) \
1781    const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold,       \
1782                 trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold,              \
1783                 trie_utf8l, trie_flu8, trie_flu8_latin }                           \
1784                    trie_type = ((FLAGS(scan) == EXACT)                             \
1785                                 ? (utf8_target ? trie_utf8 : trie_plain)           \
1786                                 : (FLAGS(scan) == EXACTL)                          \
1787                                    ? (utf8_target ? trie_utf8l : trie_plain)       \
1788                                    : (FLAGS(scan) == EXACTFAA)                     \
1789                                      ? (utf8_target                                \
1790                                         ? trie_utf8_exactfa_fold                   \
1791                                         : trie_latin_utf8_exactfa_fold)            \
1792                                      : (FLAGS(scan) == EXACTFLU8                   \
1793                                         ? (utf8_target                             \
1794                                           ? trie_flu8                              \
1795                                           : trie_flu8_latin)                       \
1796                                         : (utf8_target                             \
1797                                           ? trie_utf8_fold                         \
1798                                           : trie_latin_utf8_fold)))
1799
1800/* 'uscan' is set to foldbuf, and incremented, so below the end of uscan is
1801 * 'foldbuf+sizeof(foldbuf)' */
1802#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uc_end, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1803STMT_START {                                                                        \
1804    STRLEN skiplen;                                                                 \
1805    U8 flags = FOLD_FLAGS_FULL;                                                     \
1806    switch (trie_type) {                                                            \
1807    case trie_flu8:                                                                 \
1808        CHECK_AND_WARN_PROBLEMATIC_LOCALE_;                                         \
1809        if (UTF8_IS_ABOVE_LATIN1(*uc)) {                                            \
1810            _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end);                     \
1811        }                                                                           \
1812        goto do_trie_utf8_fold;                                                     \
1813    case trie_utf8_exactfa_fold:                                                    \
1814        flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1815        /* FALLTHROUGH */                                                           \
1816    case trie_utf8_fold:                                                            \
1817      do_trie_utf8_fold:                                                            \
1818        if ( foldlen>0 ) {                                                          \
1819            uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags );     \
1820            foldlen -= len;                                                         \
1821            uscan += len;                                                           \
1822            len=0;                                                                  \
1823        } else {                                                                    \
1824            uvc = _toFOLD_utf8_flags( (const U8*) uc, uc_end, foldbuf, &foldlen,    \
1825                                                                            flags); \
1826            len = UTF8_SAFE_SKIP(uc, uc_end);                                       \
1827            skiplen = UVCHR_SKIP( uvc );                                            \
1828            foldlen -= skiplen;                                                     \
1829            uscan = foldbuf + skiplen;                                              \
1830        }                                                                           \
1831        break;                                                                      \
1832    case trie_flu8_latin:                                                           \
1833        CHECK_AND_WARN_PROBLEMATIC_LOCALE_;                                         \
1834        goto do_trie_latin_utf8_fold;                                               \
1835    case trie_latin_utf8_exactfa_fold:                                              \
1836        flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1837        /* FALLTHROUGH */                                                           \
1838    case trie_latin_utf8_fold:                                                      \
1839      do_trie_latin_utf8_fold:                                                      \
1840        if ( foldlen>0 ) {                                                          \
1841            uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags );     \
1842            foldlen -= len;                                                         \
1843            uscan += len;                                                           \
1844            len=0;                                                                  \
1845        } else {                                                                    \
1846            len = 1;                                                                \
1847            uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags);             \
1848            skiplen = UVCHR_SKIP( uvc );                                            \
1849            foldlen -= skiplen;                                                     \
1850            uscan = foldbuf + skiplen;                                              \
1851        }                                                                           \
1852        break;                                                                      \
1853    case trie_utf8l:                                                                \
1854        CHECK_AND_WARN_PROBLEMATIC_LOCALE_;                                         \
1855        if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) {                             \
1856            _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end);                     \
1857        }                                                                           \
1858        /* FALLTHROUGH */                                                           \
1859    case trie_utf8:                                                                 \
1860        uvc = utf8n_to_uvchr( (const U8*) uc, uc_end - uc, &len, uniflags );        \
1861        break;                                                                      \
1862    case trie_plain:                                                                \
1863        uvc = (UV)*uc;                                                              \
1864        len = 1;                                                                    \
1865    }                                                                               \
1866    if (uvc < 256) {                                                                \
1867        charid = trie->charmap[ uvc ];                                              \
1868    }                                                                               \
1869    else {                                                                          \
1870        charid = 0;                                                                 \
1871        if (widecharmap) {                                                          \
1872            SV** const svpp = hv_fetch(widecharmap,                                 \
1873                        (char*)&uvc, sizeof(UV), 0);                                \
1874            if (svpp)                                                               \
1875                charid = (U16)SvIV(*svpp);                                          \
1876        }                                                                           \
1877    }                                                                               \
1878} STMT_END
1879
1880#define DUMP_EXEC_POS(li,s,doutf8,depth)                    \
1881    dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1882                startpos, doutf8, depth)
1883
1884#define GET_ANYOFH_INVLIST(prog, n)                                         \
1885                        GET_REGCLASS_AUX_DATA(prog, n, TRUE, 0, NULL, NULL)
1886
1887#define REXEC_FBC_UTF8_SCAN(CODE)                           \
1888    STMT_START {                                            \
1889        while (s < strend) {                                \
1890            CODE                                            \
1891            s += UTF8_SAFE_SKIP(s, reginfo->strend);        \
1892        }                                                   \
1893    } STMT_END
1894
1895#define REXEC_FBC_NON_UTF8_SCAN(CODE)                       \
1896    STMT_START {                                            \
1897        while (s < strend) {                                \
1898            CODE                                            \
1899            s++;                                            \
1900        }                                                   \
1901    } STMT_END
1902
1903#define REXEC_FBC_UTF8_CLASS_SCAN(COND)                     \
1904    STMT_START {                                            \
1905        while (s < strend) {                                \
1906            REXEC_FBC_UTF8_CLASS_SCAN_GUTS(COND)            \
1907        }                                                   \
1908    } STMT_END
1909
1910#define REXEC_FBC_NON_UTF8_CLASS_SCAN(COND)                 \
1911    STMT_START {                                            \
1912        while (s < strend) {                                \
1913            REXEC_FBC_NON_UTF8_CLASS_SCAN_GUTS(COND)        \
1914        }                                                   \
1915    } STMT_END
1916
1917#define REXEC_FBC_UTF8_CLASS_SCAN_GUTS(COND)                   \
1918    if (COND) {                                                \
1919        FBC_CHECK_AND_TRY                                      \
1920        s += UTF8_SAFE_SKIP(s, reginfo->strend);               \
1921        previous_occurrence_end = s;                           \
1922    }                                                          \
1923    else {                                                     \
1924        s += UTF8SKIP(s);                                      \
1925    }
1926
1927#define REXEC_FBC_NON_UTF8_CLASS_SCAN_GUTS(COND)               \
1928    if (COND) {                                                \
1929        FBC_CHECK_AND_TRY                                      \
1930        s++;                                                   \
1931        previous_occurrence_end = s;                           \
1932    }                                                          \
1933    else {                                                     \
1934        s++;                                                   \
1935    }
1936
1937/* We keep track of where the next character should start after an occurrence
1938 * of the one we're looking for.  Knowing that, we can see right away if the
1939 * next occurrence is adjacent to the previous.  When 'doevery' is FALSE, we
1940 * don't accept the 2nd and succeeding adjacent occurrences */
1941#define FBC_CHECK_AND_TRY                                           \
1942        if (   (   doevery                                          \
1943                || s != previous_occurrence_end)                    \
1944            && (   reginfo->intuit                                  \
1945                || (s <= reginfo->strend && regtry(reginfo, &s))))  \
1946        {                                                           \
1947            goto got_it;                                            \
1948        }
1949
1950
1951/* These differ from the above macros in that they call a function which
1952 * returns the next occurrence of the thing being looked for in 's'; and
1953 * 'strend' if there is no such occurrence.  'f' is something like fcn(a,b,c)
1954 * */
1955#define REXEC_FBC_UTF8_FIND_NEXT_SCAN(f)                    \
1956    while (s < strend) {                                    \
1957        s = (char *) (f);                                   \
1958        if (s >= strend) {                                  \
1959            break;                                          \
1960        }                                                   \
1961                                                            \
1962        FBC_CHECK_AND_TRY                                   \
1963        s += UTF8SKIP(s);                                   \
1964        previous_occurrence_end = s;                        \
1965    }
1966
1967#define REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN(f)                \
1968    while (s < strend) {                                    \
1969        s = (char *) (f);                                   \
1970        if (s >= strend) {                                  \
1971            break;                                          \
1972        }                                                   \
1973                                                            \
1974        FBC_CHECK_AND_TRY                                   \
1975        s++;                                                \
1976        previous_occurrence_end = s;                        \
1977    }
1978
1979/* This is like the above macro except the function returns NULL if there is no
1980 * occurrence, and there is a further condition that must be matched besides
1981 * the function */
1982#define REXEC_FBC_FIND_NEXT_UTF8_SCAN_COND(f, COND)         \
1983    while (s < strend) {                                    \
1984        s = (char *) (f);                                     \
1985        if (s == NULL) {                                    \
1986            s = (char *) strend;                            \
1987            break;                                          \
1988        }                                                   \
1989                                                            \
1990        if (COND) {                                         \
1991            FBC_CHECK_AND_TRY                               \
1992            s += UTF8_SAFE_SKIP(s, reginfo->strend);        \
1993            previous_occurrence_end = s;                    \
1994        }                                                   \
1995        else {                                              \
1996            s += UTF8SKIP(s);                               \
1997        }                                                   \
1998    }
1999
2000/* This differs from the above macros in that it is passed a single byte that
2001 * is known to begin the next occurrence of the thing being looked for in 's'.
2002 * It does a memchr to find the next occurrence of 'byte', before trying 'COND'
2003 * at that position. */
2004#define REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(byte, COND)                  \
2005    REXEC_FBC_FIND_NEXT_UTF8_SCAN_COND(memchr(s, byte, strend - s),     \
2006                                              COND)
2007
2008/* This is like the function above, but takes an entire string to look for
2009 * instead of a single byte */
2010#define REXEC_FBC_FIND_NEXT_UTF8_STRING_SCAN(substr, substr_end, COND)      \
2011    REXEC_FBC_FIND_NEXT_UTF8_SCAN_COND(                                     \
2012                                     ninstr(s, strend, substr, substr_end), \
2013                                     COND)
2014
2015/* The four macros below are slightly different versions of the same logic.
2016 *
2017 * The first is for /a and /aa when the target string is UTF-8.  This can only
2018 * match ascii, but it must advance based on UTF-8.   The other three handle
2019 * the non-UTF-8 and the more generic UTF-8 cases.   In all four, we are
2020 * looking for the boundary (or non-boundary) between a word and non-word
2021 * character.  The utf8 and non-utf8 cases have the same logic, but the details
2022 * must be different.  Find the "wordness" of the character just prior to this
2023 * one, and compare it with the wordness of this one.  If they differ, we have
2024 * a boundary.  At the beginning of the string, pretend that the previous
2025 * character was a new-line.
2026 *
2027 * All these macros uncleanly have side-effects with each other and outside
2028 * variables.  So far it's been too much trouble to clean-up
2029 *
2030 * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
2031 *               a word character or not.
2032 * IF_SUCCESS    is code to do if it finds that we are at a boundary between
2033 *               word/non-word
2034 * IF_FAIL       is code to do if we aren't at a boundary between word/non-word
2035 *
2036 * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
2037 * are looking for a boundary or for a non-boundary.  If we are looking for a
2038 * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
2039 * see if this tentative match actually works, and if so, to quit the loop
2040 * here.  And vice-versa if we are looking for a non-boundary.
2041 *
2042 * 'tmp' below in the next four macros in the REXEC_FBC_UTF8_SCAN and
2043 * REXEC_FBC_UTF8_SCAN loops is a loop invariant, a bool giving the return of
2044 * TEST_NON_UTF8(s-1).  To see this, note that that's what it is defined to be
2045 * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
2046 * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
2047 * complement.  But in that branch we complement tmp, meaning that at the
2048 * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
2049 * which means at the top of the loop in the next iteration, it is
2050 * TEST_NON_UTF8(s-1) */
2051#define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)                         \
2052    tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                      \
2053    tmp = TEST_NON_UTF8(tmp);                                                  \
2054    REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */                     \
2055        if (tmp == ! TEST_NON_UTF8((U8) *s)) {                                 \
2056            tmp = !tmp;                                                        \
2057            IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */     \
2058        }                                                                      \
2059        else {                                                                 \
2060            IF_FAIL;                                                           \
2061        }                                                                      \
2062    );                                                                         \
2063
2064/* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
2065 * TEST_UTF8 is a macro that for the same input code points returns identically
2066 * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead (and an
2067 * end pointer as well) */
2068#define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL)                      \
2069    if (s == reginfo->strbeg) {                                                \
2070        tmp = '\n';                                                            \
2071    }                                                                          \
2072    else { /* Back-up to the start of the previous character */                \
2073        U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg);              \
2074        tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r,                     \
2075                                                       0, UTF8_ALLOW_DEFAULT); \
2076    }                                                                          \
2077    tmp = TEST_UV(tmp);                                                        \
2078    REXEC_FBC_UTF8_SCAN(/* advances s while s < strend */                      \
2079        if (tmp == ! (TEST_UTF8((U8 *) s, (U8 *) reginfo->strend))) {          \
2080            tmp = !tmp;                                                        \
2081            IF_SUCCESS;                                                        \
2082        }                                                                      \
2083        else {                                                                 \
2084            IF_FAIL;                                                           \
2085        }                                                                      \
2086    );
2087
2088/* Like the above two macros, for a UTF-8 target string.  UTF8_CODE is the
2089 * complete code for handling UTF-8.  Common to the BOUND and NBOUND cases,
2090 * set-up by the FBC_BOUND, etc macros below */
2091#define FBC_BOUND_COMMON_UTF8(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)   \
2092    UTF8_CODE;                                                                 \
2093    /* Here, things have been set up by the previous code so that tmp is the   \
2094     * return of TEST_NON_UTF8(s-1).  We also have to check if this matches    \
2095     * against the EOS, which we treat as a \n */                              \
2096    if (tmp == ! TEST_NON_UTF8('\n')) {                                        \
2097        IF_SUCCESS;                                                            \
2098    }                                                                          \
2099    else {                                                                     \
2100        IF_FAIL;                                                               \
2101    }
2102
2103/* Same as the macro above, but the target isn't UTF-8 */
2104#define FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)       \
2105    tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                   \
2106    tmp = TEST_NON_UTF8(tmp);                                               \
2107    REXEC_FBC_NON_UTF8_SCAN(/* advances s while s < strend */               \
2108        if (tmp == ! TEST_NON_UTF8(UCHARAT(s))) {                           \
2109            IF_SUCCESS;                                                     \
2110            tmp = !tmp;                                                     \
2111        }                                                                   \
2112        else {                                                              \
2113            IF_FAIL;                                                        \
2114        }                                                                   \
2115    );                                                                      \
2116    /* Here, things have been set up by the previous code so that tmp is    \
2117     * the return of TEST_NON_UTF8(s-1).   We also have to check if this    \
2118     * matches against the EOS, which we treat as a \n */                   \
2119    if (tmp == ! TEST_NON_UTF8('\n')) {                                     \
2120        IF_SUCCESS;                                                         \
2121    }                                                                       \
2122    else {                                                                  \
2123        IF_FAIL;                                                            \
2124    }
2125
2126/* This is the macro to use when we want to see if something that looks like it
2127 * could match, actually does, and if so exits the loop.  It needs to be used
2128 * only for bounds checking macros, as it allows for matching beyond the end of
2129 * string (which should be zero length without having to look at the string
2130 * contents) */
2131#define REXEC_FBC_TRYIT                                                     \
2132    if (reginfo->intuit || (s <= reginfo->strend && regtry(reginfo, &s)))   \
2133        goto got_it
2134
2135/* The only difference between the BOUND and NBOUND cases is that
2136 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
2137 * NBOUND.  This is accomplished by passing it as either the if or else clause,
2138 * with the other one being empty (PLACEHOLDER is defined as empty).
2139 *
2140 * The TEST_FOO parameters are for operating on different forms of input, but
2141 * all should be ones that return identically for the same underlying code
2142 * points */
2143
2144#define FBC_BOUND_UTF8(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                   \
2145    FBC_BOUND_COMMON_UTF8(                                                  \
2146          FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),       \
2147          TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2148
2149#define FBC_BOUND_NON_UTF8(TEST_NON_UTF8)                                   \
2150    FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2151
2152#define FBC_BOUND_A_UTF8(TEST_NON_UTF8)                                     \
2153    FBC_BOUND_COMMON_UTF8(                                                  \
2154                    FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),\
2155                    TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2156
2157#define FBC_BOUND_A_NON_UTF8(TEST_NON_UTF8)                                 \
2158    FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2159
2160#define FBC_NBOUND_UTF8(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                  \
2161    FBC_BOUND_COMMON_UTF8(                                                  \
2162              FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),   \
2163              TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2164
2165#define FBC_NBOUND_NON_UTF8(TEST_NON_UTF8)                                  \
2166    FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2167
2168#define FBC_NBOUND_A_UTF8(TEST_NON_UTF8)                                    \
2169    FBC_BOUND_COMMON_UTF8(                                                  \
2170            FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),        \
2171            TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2172
2173#define FBC_NBOUND_A_NON_UTF8(TEST_NON_UTF8)                                \
2174    FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2175
2176#ifdef DEBUGGING
2177static IV
2178S_get_break_val_cp_checked(SV* const invlist, const UV cp_in) {
2179  IV cp_out = _invlist_search(invlist, cp_in);
2180  assert(cp_out >= 0);
2181  return cp_out;
2182}
2183#  define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2184        invmap[S_get_break_val_cp_checked(invlist, cp)]
2185#else
2186#  define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2187        invmap[_invlist_search(invlist, cp)]
2188#endif
2189
2190/* Takes a pointer to an inversion list, a pointer to its corresponding
2191 * inversion map, and a code point, and returns the code point's value
2192 * according to the two arrays.  It assumes that all code points have a value.
2193 * This is used as the base macro for macros for particular properties */
2194#define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp)              \
2195        _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp)
2196
2197/* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
2198 * of a code point, returning the value for the first code point in the string.
2199 * And it takes the particular macro name that finds the desired value given a
2200 * code point.  Merely convert the UTF-8 to code point and call the cp macro */
2201#define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend)                     \
2202             (__ASSERT_(pos < strend)                                          \
2203                 /* Note assumes is valid UTF-8 */                             \
2204             (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
2205
2206/* Returns the GCB value for the input code point */
2207#define getGCB_VAL_CP(cp)                                                      \
2208          _generic_GET_BREAK_VAL_CP(                                           \
2209                                    PL_GCB_invlist,                            \
2210                                    _Perl_GCB_invmap,                          \
2211                                    (cp))
2212
2213/* Returns the GCB value for the first code point in the UTF-8 encoded string
2214 * bounded by pos and strend */
2215#define getGCB_VAL_UTF8(pos, strend)                                           \
2216    _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
2217
2218/* Returns the LB value for the input code point */
2219#define getLB_VAL_CP(cp)                                                       \
2220          _generic_GET_BREAK_VAL_CP(                                           \
2221                                    PL_LB_invlist,                             \
2222                                    _Perl_LB_invmap,                           \
2223                                    (cp))
2224
2225/* Returns the LB value for the first code point in the UTF-8 encoded string
2226 * bounded by pos and strend */
2227#define getLB_VAL_UTF8(pos, strend)                                            \
2228    _generic_GET_BREAK_VAL_UTF8(getLB_VAL_CP, pos, strend)
2229
2230
2231/* Returns the SB value for the input code point */
2232#define getSB_VAL_CP(cp)                                                       \
2233          _generic_GET_BREAK_VAL_CP(                                           \
2234                                    PL_SB_invlist,                             \
2235                                    _Perl_SB_invmap,                     \
2236                                    (cp))
2237
2238/* Returns the SB value for the first code point in the UTF-8 encoded string
2239 * bounded by pos and strend */
2240#define getSB_VAL_UTF8(pos, strend)                                            \
2241    _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend)
2242
2243/* Returns the WB value for the input code point */
2244#define getWB_VAL_CP(cp)                                                       \
2245          _generic_GET_BREAK_VAL_CP(                                           \
2246                                    PL_WB_invlist,                             \
2247                                    _Perl_WB_invmap,                         \
2248                                    (cp))
2249
2250/* Returns the WB value for the first code point in the UTF-8 encoded string
2251 * bounded by pos and strend */
2252#define getWB_VAL_UTF8(pos, strend)                                            \
2253    _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend)
2254
2255/* We know what class REx starts with.  Try to find this position... */
2256/* if reginfo->intuit, its a dryrun */
2257/* annoyingly all the vars in this routine have different names from their counterparts
2258   in regmatch. /grrr */
2259STATIC char *
2260S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
2261    const char *strend, regmatch_info *reginfo)
2262{
2263
2264    /* TRUE if x+ need not match at just the 1st pos of run of x's */
2265    const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
2266
2267    char *pat_string;   /* The pattern's exactish string */
2268    char *pat_end;	    /* ptr to end char of pat_string */
2269    re_fold_t folder;	/* Function for computing non-utf8 folds */
2270    const U8 *fold_array;   /* array for folding ords < 256 */
2271    STRLEN ln;
2272    STRLEN lnc;
2273    U8 c1;
2274    U8 c2;
2275    char *e = NULL;
2276
2277    /* In some cases we accept only the first occurrence of 'x' in a sequence of
2278     * them.  This variable points to just beyond the end of the previous
2279     * occurrence of 'x', hence we can tell if we are in a sequence.  (Having
2280     * it point to beyond the 'x' allows us to work for UTF-8 without having to
2281     * hop back.) */
2282    char * previous_occurrence_end = 0;
2283
2284    I32 tmp;            /* Scratch variable */
2285    const bool utf8_target = reginfo->is_utf8_target;
2286    UV utf8_fold_flags = 0;
2287    const bool is_utf8_pat = reginfo->is_utf8_pat;
2288    bool to_complement = FALSE; /* Invert the result?  Taking the xor of this
2289                                   with a result inverts that result, as 0^1 =
2290                                   1 and 1^1 = 0 */
2291    char_class_number_ classnum;
2292
2293    RXi_GET_DECL(prog,progi);
2294
2295    PERL_ARGS_ASSERT_FIND_BYCLASS;
2296
2297    /* We know what class it must start with. The case statements below have
2298     * encoded the OP, and the UTF8ness of the target ('t8' for is UTF-8; 'tb'
2299     * for it isn't; 'b' stands for byte), and the UTF8ness of the pattern
2300     * ('p8' and 'pb'. */
2301    switch (with_tp_UTF8ness(OP(c), utf8_target, is_utf8_pat)) {
2302        SV * anyofh_list;
2303
2304      case ANYOFPOSIXL_t8_pb:
2305      case ANYOFPOSIXL_t8_p8:
2306      case ANYOFL_t8_pb:
2307      case ANYOFL_t8_p8:
2308        CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
2309        CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(c);
2310
2311        /* FALLTHROUGH */
2312
2313      case ANYOFD_t8_pb:
2314      case ANYOFD_t8_p8:
2315      case ANYOF_t8_pb:
2316      case ANYOF_t8_p8:
2317        REXEC_FBC_UTF8_CLASS_SCAN(
2318                reginclass(prog, c, (U8*)s, (U8*) strend, 1 /* is utf8 */));
2319        break;
2320
2321      case ANYOFPOSIXL_tb_pb:
2322      case ANYOFPOSIXL_tb_p8:
2323      case ANYOFL_tb_pb:
2324      case ANYOFL_tb_p8:
2325        CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
2326        CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(c);
2327
2328        /* FALLTHROUGH */
2329
2330      case ANYOFD_tb_pb:
2331      case ANYOFD_tb_p8:
2332      case ANYOF_tb_pb:
2333      case ANYOF_tb_p8:
2334        if (! ANYOF_FLAGS(c) && ANYOF_MATCHES_NONE_OUTSIDE_BITMAP(c)) {
2335            /* We know that s is in the bitmap range since the target isn't
2336             * UTF-8, so what happens for out-of-range values is not relevant,
2337             * so exclude that from the flags */
2338            REXEC_FBC_NON_UTF8_CLASS_SCAN(ANYOF_BITMAP_TEST(c, *((U8*)s)));
2339        }
2340        else {
2341            REXEC_FBC_NON_UTF8_CLASS_SCAN(reginclass(prog,c, (U8*)s, (U8*)s+1,
2342                                                     0));
2343        }
2344        break;
2345
2346      case ANYOFM_tb_pb: /* ARG1u() is the base byte; FLAGS() the mask byte */
2347      case ANYOFM_tb_p8:
2348        REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN(
2349             find_next_masked((U8 *) s, (U8 *) strend, (U8) ARG1u(c), FLAGS(c)));
2350        break;
2351
2352      case ANYOFM_t8_pb:
2353      case ANYOFM_t8_p8:
2354        /* UTF-8ness doesn't matter because only matches UTF-8 invariants.  But
2355         * we do anyway for performance reasons, as otherwise we would have to
2356         * examine all the continuation characters */
2357        REXEC_FBC_UTF8_FIND_NEXT_SCAN(
2358             find_next_masked((U8 *) s, (U8 *) strend, (U8) ARG1u(c), FLAGS(c)));
2359        break;
2360
2361      case NANYOFM_tb_pb:
2362      case NANYOFM_tb_p8:
2363        REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN(
2364           find_span_end_mask((U8 *) s, (U8 *) strend, (U8) ARG1u(c), FLAGS(c)));
2365        break;
2366
2367      case NANYOFM_t8_pb:
2368      case NANYOFM_t8_p8: /* UTF-8ness does matter because can match UTF-8
2369                                  variants. */
2370        REXEC_FBC_UTF8_FIND_NEXT_SCAN(
2371                        (char *) find_span_end_mask((U8 *) s, (U8 *) strend,
2372                                                    (U8) ARG1u(c), FLAGS(c)));
2373        break;
2374
2375      /* These nodes all require at least one code point to be in UTF-8 to
2376       * match */
2377      case ANYOFH_tb_pb:
2378      case ANYOFH_tb_p8:
2379      case ANYOFHb_tb_pb:
2380      case ANYOFHb_tb_p8:
2381      case ANYOFHbbm_tb_pb:
2382      case ANYOFHbbm_tb_p8:
2383      case ANYOFHr_tb_pb:
2384      case ANYOFHr_tb_p8:
2385      case ANYOFHs_tb_pb:
2386      case ANYOFHs_tb_p8:
2387      case EXACTFLU8_tb_pb:
2388      case EXACTFLU8_tb_p8:
2389      case EXACTFU_REQ8_tb_pb:
2390      case EXACTFU_REQ8_tb_p8:
2391        break;
2392
2393      case ANYOFH_t8_pb:
2394      case ANYOFH_t8_p8:
2395        anyofh_list = GET_ANYOFH_INVLIST(prog, c);
2396        REXEC_FBC_UTF8_CLASS_SCAN(
2397              (   (U8) NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c)
2398               && _invlist_contains_cp(anyofh_list,
2399                                       utf8_to_uvchr_buf((U8 *) s,
2400                                                         (U8 *) strend,
2401                                                         NULL))));
2402        break;
2403
2404      case ANYOFHb_t8_pb:
2405      case ANYOFHb_t8_p8:
2406        {
2407            /* We know what the first byte of any matched string should be. */
2408            U8 first_byte = FLAGS(c);
2409
2410            anyofh_list = GET_ANYOFH_INVLIST(prog, c);
2411            REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
2412                   _invlist_contains_cp(anyofh_list,
2413                                           utf8_to_uvchr_buf((U8 *) s,
2414                                                              (U8 *) strend,
2415                                                              NULL)));
2416        }
2417        break;
2418
2419      case ANYOFHbbm_t8_pb:
2420      case ANYOFHbbm_t8_p8:
2421        {
2422            /* We know what the first byte of any matched string should be. */
2423            U8 first_byte = FLAGS(c);
2424
2425            /* And a bitmap defines all the legal 2nd byte matches */
2426            REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
2427                               (    s < strend
2428                                && BITMAP_TEST(((struct regnode_bbm *) c)->bitmap,
2429                                            (U8) s[1] & UTF_CONTINUATION_MASK)));
2430        }
2431        break;
2432
2433      case ANYOFHr_t8_pb:
2434      case ANYOFHr_t8_p8:
2435        anyofh_list = GET_ANYOFH_INVLIST(prog, c);
2436        REXEC_FBC_UTF8_CLASS_SCAN(
2437                    (   inRANGE(NATIVE_UTF8_TO_I8(*s),
2438                                LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)),
2439                                HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)))
2440                   && _invlist_contains_cp(anyofh_list,
2441                                           utf8_to_uvchr_buf((U8 *) s,
2442                                                              (U8 *) strend,
2443                                                              NULL))));
2444        break;
2445
2446      case ANYOFHs_t8_pb:
2447      case ANYOFHs_t8_p8:
2448        anyofh_list = GET_ANYOFH_INVLIST(prog, c);
2449        REXEC_FBC_FIND_NEXT_UTF8_STRING_SCAN(
2450                        ((struct regnode_anyofhs *) c)->string,
2451                        /* Note FLAGS is the string length in this regnode */
2452                        ((struct regnode_anyofhs *) c)->string + FLAGS(c),
2453                        _invlist_contains_cp(anyofh_list,
2454                                             utf8_to_uvchr_buf((U8 *) s,
2455                                                               (U8 *) strend,
2456                                                               NULL)));
2457        break;
2458
2459      case ANYOFR_tb_pb:
2460      case ANYOFR_tb_p8:
2461        REXEC_FBC_NON_UTF8_CLASS_SCAN(withinCOUNT((U8) *s,
2462                                            ANYOFRbase(c), ANYOFRdelta(c)));
2463        break;
2464
2465      case ANYOFR_t8_pb:
2466      case ANYOFR_t8_p8:
2467        REXEC_FBC_UTF8_CLASS_SCAN(
2468                            (   NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c)
2469                             && withinCOUNT(utf8_to_uvchr_buf((U8 *) s,
2470                                                              (U8 *) strend,
2471                                                              NULL),
2472                                            ANYOFRbase(c), ANYOFRdelta(c))));
2473        break;
2474
2475      case ANYOFRb_tb_pb:
2476      case ANYOFRb_tb_p8:
2477        REXEC_FBC_NON_UTF8_CLASS_SCAN(withinCOUNT((U8) *s,
2478                                            ANYOFRbase(c), ANYOFRdelta(c)));
2479        break;
2480
2481      case ANYOFRb_t8_pb:
2482      case ANYOFRb_t8_p8:
2483        {   /* We know what the first byte of any matched string should be */
2484            U8 first_byte = FLAGS(c);
2485
2486            REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
2487                                withinCOUNT(utf8_to_uvchr_buf((U8 *) s,
2488                                                              (U8 *) strend,
2489                                                              NULL),
2490                                            ANYOFRbase(c), ANYOFRdelta(c)));
2491        }
2492        break;
2493
2494      case EXACTFAA_tb_pb:
2495
2496        /* Latin1 folds are not affected by /a, except it excludes the sharp s,
2497         * which these functions don't handle anyway */
2498        fold_array = PL_fold_latin1;
2499        folder = S_foldEQ_latin1_s2_folded;
2500        goto do_exactf_non_utf8;
2501
2502      case EXACTF_tb_pb:
2503        fold_array = PL_fold;
2504        folder = Perl_foldEQ;
2505        goto do_exactf_non_utf8;
2506
2507      case EXACTFL_tb_pb:
2508        CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
2509
2510        if (IN_UTF8_CTYPE_LOCALE) {
2511            utf8_fold_flags = FOLDEQ_LOCALE;
2512            goto do_exactf_utf8;
2513        }
2514
2515        fold_array = PL_fold_locale;
2516        folder = Perl_foldEQ_locale;
2517        goto do_exactf_non_utf8;
2518
2519      case EXACTFU_tb_pb:
2520        /* Any 'ss' in the pattern should have been replaced by regcomp, so we
2521         * don't have to worry here about this single special case in the
2522         * Latin1 range */
2523        fold_array = PL_fold_latin1;
2524        folder = S_foldEQ_latin1_s2_folded;
2525
2526        /* FALLTHROUGH */
2527
2528       do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
2529                              are no glitches with fold-length differences
2530                              between the target string and pattern */
2531
2532        /* The idea in the non-utf8 EXACTF* cases is to first find the first
2533         * character of the EXACTF* node and then, if necessary,
2534         * case-insensitively compare the full text of the node.  c1 is the
2535         * first character.  c2 is its fold.  This logic will not work for
2536         * Unicode semantics and the german sharp ss, which hence should not be
2537         * compiled into a node that gets here. */
2538        pat_string = STRINGs(c);
2539        ln  = STR_LENs(c);	/* length to match in octets/bytes */
2540
2541        /* We know that we have to match at least 'ln' bytes (which is the same
2542         * as characters, since not utf8).  If we have to match 3 characters,
2543         * and there are only 2 available, we know without trying that it will
2544         * fail; so don't start a match past the required minimum number from
2545         * the far end */
2546        e = HOP3c(strend, -((SSize_t)ln), s);
2547        if (e < s)
2548            break;
2549
2550        c1 = *pat_string;
2551        c2 = fold_array[c1];
2552        if (c1 == c2) { /* If char and fold are the same */
2553            while (s <= e) {
2554                s = (char *) memchr(s, c1, e + 1 - s);
2555                if (s == NULL) {
2556                    break;
2557                }
2558
2559                /* Check that the rest of the node matches */
2560                if (   (ln == 1 || folder(aTHX_ s + 1, pat_string + 1, ln - 1))
2561                    && (reginfo->intuit || regtry(reginfo, &s)) )
2562                {
2563                    goto got_it;
2564                }
2565                s++;
2566            }
2567        }
2568        else {
2569            U8 bits_differing = c1 ^ c2;
2570
2571            /* If the folds differ in one bit position only, we can mask to
2572             * match either of them, and can use this faster find method.  Both
2573             * ASCII and EBCDIC tend to have their case folds differ in only
2574             * one position, so this is very likely */
2575            if (LIKELY(PL_bitcount[bits_differing] == 1)) {
2576                bits_differing = ~ bits_differing;
2577                while (s <= e) {
2578                    s = (char *) find_next_masked((U8 *) s, (U8 *) e + 1,
2579                                        (c1 & bits_differing), bits_differing);
2580                    if (s > e) {
2581                        break;
2582                    }
2583
2584                    if (   (ln == 1 || folder(aTHX_ s + 1, pat_string + 1, ln - 1))
2585                        && (reginfo->intuit || regtry(reginfo, &s)) )
2586                    {
2587                        goto got_it;
2588                    }
2589                    s++;
2590                }
2591            }
2592            else {  /* Otherwise, stuck with looking byte-at-a-time.  This
2593                       should actually happen only in EXACTFL nodes */
2594                while (s <= e) {
2595                    if (    (*(U8*)s == c1 || *(U8*)s == c2)
2596                        && (ln == 1 || folder(aTHX_ s + 1, pat_string + 1, ln - 1))
2597                        && (reginfo->intuit || regtry(reginfo, &s)) )
2598                    {
2599                        goto got_it;
2600                    }
2601                    s++;
2602                }
2603            }
2604        }
2605        break;
2606
2607      case EXACTFAA_tb_p8:
2608      case EXACTFAA_t8_p8:
2609        utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII
2610                         |FOLDEQ_S2_ALREADY_FOLDED
2611                         |FOLDEQ_S2_FOLDS_SANE;
2612        goto do_exactf_utf8;
2613
2614      case EXACTFAA_NO_TRIE_tb_pb:
2615      case EXACTFAA_NO_TRIE_t8_pb:
2616      case EXACTFAA_t8_pb:
2617
2618        /* Here, and elsewhere in this file, the reason we can't consider a
2619         * non-UTF-8 pattern already folded in the presence of a UTF-8 target
2620         * is because any MICRO SIGN in the pattern won't be folded.  Since the
2621         * fold of the MICRO SIGN requires UTF-8 to represent, we can consider
2622         * a non-UTF-8 pattern folded when matching a non-UTF-8 target */
2623        utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2624        goto do_exactf_utf8;
2625
2626      case EXACTFL_tb_p8:
2627      case EXACTFL_t8_pb:
2628      case EXACTFL_t8_p8:
2629        CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
2630        utf8_fold_flags = FOLDEQ_LOCALE;
2631        goto do_exactf_utf8;
2632
2633      case EXACTFLU8_t8_pb:
2634      case EXACTFLU8_t8_p8:
2635        utf8_fold_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
2636                                         | FOLDEQ_S2_FOLDS_SANE;
2637        goto do_exactf_utf8;
2638
2639      case EXACTFU_REQ8_t8_p8:
2640        utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2641        goto do_exactf_utf8;
2642
2643      case EXACTFU_tb_p8:
2644      case EXACTFU_t8_pb:
2645      case EXACTFU_t8_p8:
2646        utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2647        goto do_exactf_utf8;
2648
2649      /* The following are problematic even though pattern isn't UTF-8.  Use
2650       * full functionality normally not done except for UTF-8. */
2651      case EXACTF_t8_pb:
2652      case EXACTFUP_tb_pb:
2653      case EXACTFUP_t8_pb:
2654
2655       do_exactf_utf8:
2656        {
2657            unsigned expansion;
2658
2659            /* If one of the operands is in utf8, we can't use the simpler
2660             * folding above, due to the fact that many different characters
2661             * can have the same fold, or portion of a fold, or different-
2662             * length fold */
2663            pat_string = STRINGs(c);
2664            ln  = STR_LENs(c);	/* length to match in octets/bytes */
2665            pat_end = pat_string + ln;
2666            lnc = is_utf8_pat       /* length to match in characters */
2667                  ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
2668                  : ln;
2669
2670            /* We have 'lnc' characters to match in the pattern, but because of
2671             * multi-character folding, each character in the target can match
2672             * up to 3 characters (Unicode guarantees it will never exceed
2673             * this) if it is utf8-encoded; and up to 2 if not (based on the
2674             * fact that the Latin 1 folds are already determined, and the only
2675             * multi-char fold in that range is the sharp-s folding to 'ss'.
2676             * Thus, a pattern character can match as little as 1/3 of a string
2677             * character.  Adjust lnc accordingly, rounding up, so that if we
2678             * need to match at least 4+1/3 chars, that really is 5. */
2679            expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
2680            lnc = (lnc + expansion - 1) / expansion;
2681
2682            /* As in the non-UTF8 case, if we have to match 3 characters, and
2683             * only 2 are left, it's guaranteed to fail, so don't start a match
2684             * that would require us to go beyond the end of the string */
2685            e = HOP3c(strend, -((SSize_t)lnc), s);
2686
2687            /* XXX Note that we could recalculate e to stop the loop earlier,
2688             * as the worst case expansion above will rarely be met, and as we
2689             * go along we would usually find that e moves further to the left.
2690             * This would happen only after we reached the point in the loop
2691             * where if there were no expansion we should fail.  Unclear if
2692             * worth the expense */
2693
2694            while (s <= e) {
2695                char *my_strend= (char *)strend;
2696                if (   foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
2697                                         pat_string, NULL, ln, is_utf8_pat,
2698                                         utf8_fold_flags)
2699                    && (reginfo->intuit || regtry(reginfo, &s)) )
2700                {
2701                    goto got_it;
2702                }
2703                s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
2704            }
2705        }
2706        break;
2707
2708      case BOUNDA_tb_pb:
2709      case BOUNDA_tb_p8:
2710      case BOUND_tb_pb:  /* /d without utf8 target is /a */
2711      case BOUND_tb_p8:
2712        /* regcomp.c makes sure that these only have the traditional \b
2713         * meaning. */
2714        assert(FLAGS(c) == TRADITIONAL_BOUND);
2715
2716        FBC_BOUND_A_NON_UTF8(isWORDCHAR_A);
2717        break;
2718
2719      case BOUNDA_t8_pb: /* What /a matches is same under UTF-8 */
2720      case BOUNDA_t8_p8:
2721        /* regcomp.c makes sure that these only have the traditional \b
2722         * meaning. */
2723        assert(FLAGS(c) == TRADITIONAL_BOUND);
2724
2725        FBC_BOUND_A_UTF8(isWORDCHAR_A);
2726        break;
2727
2728      case NBOUNDA_tb_pb:
2729      case NBOUNDA_tb_p8:
2730      case NBOUND_tb_pb: /* /d without utf8 target is /a */
2731      case NBOUND_tb_p8:
2732        /* regcomp.c makes sure that these only have the traditional \b
2733         * meaning. */
2734        assert(FLAGS(c) == TRADITIONAL_BOUND);
2735
2736        FBC_NBOUND_A_NON_UTF8(isWORDCHAR_A);
2737        break;
2738
2739      case NBOUNDA_t8_pb: /* What /a matches is same under UTF-8 */
2740      case NBOUNDA_t8_p8:
2741        /* regcomp.c makes sure that these only have the traditional \b
2742         * meaning. */
2743        assert(FLAGS(c) == TRADITIONAL_BOUND);
2744
2745        FBC_NBOUND_A_UTF8(isWORDCHAR_A);
2746        break;
2747
2748      case NBOUNDU_tb_pb:
2749      case NBOUNDU_tb_p8:
2750        if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2751            FBC_NBOUND_NON_UTF8(isWORDCHAR_L1);
2752            break;
2753        }
2754
2755        to_complement = 1;
2756        goto do_boundu_non_utf8;
2757
2758      case NBOUNDL_tb_pb:
2759      case NBOUNDL_tb_p8:
2760        CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
2761        if (FLAGS(c) == TRADITIONAL_BOUND) {
2762            FBC_NBOUND_NON_UTF8(isWORDCHAR_LC);
2763            break;
2764        }
2765
2766        CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
2767
2768        to_complement = 1;
2769        goto do_boundu_non_utf8;
2770
2771      case BOUNDL_tb_pb:
2772      case BOUNDL_tb_p8:
2773        CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
2774        if (FLAGS(c) == TRADITIONAL_BOUND) {
2775            FBC_BOUND_NON_UTF8(isWORDCHAR_LC);
2776            break;
2777        }
2778
2779        CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
2780
2781        goto do_boundu_non_utf8;
2782
2783      case BOUNDU_tb_pb:
2784      case BOUNDU_tb_p8:
2785        if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2786            FBC_BOUND_NON_UTF8(isWORDCHAR_L1);
2787            break;
2788        }
2789
2790      do_boundu_non_utf8:
2791        if (s == reginfo->strbeg) {
2792            if (reginfo->intuit || regtry(reginfo, &s))
2793            {
2794                goto got_it;
2795            }
2796
2797            /* Didn't match.  Try at the next position (if there is one) */
2798            s++;
2799            if (UNLIKELY(s >= reginfo->strend)) {
2800                break;
2801            }
2802        }
2803
2804        switch((bound_type) FLAGS(c)) {
2805          case TRADITIONAL_BOUND: /* Should have already been handled */
2806            assert(0);
2807            break;
2808
2809          case GCB_BOUND:
2810            /* Not utf8.  Everything is a GCB except between CR and LF */
2811            while (s < strend) {
2812                if ((to_complement ^ (   UCHARAT(s - 1) != '\r'
2813                                      || UCHARAT(s) != '\n'))
2814                    && (reginfo->intuit || regtry(reginfo, &s)))
2815                {
2816                    goto got_it;
2817                }
2818                s++;
2819            }
2820
2821            break;
2822
2823          case LB_BOUND:
2824            {
2825                LB_enum before = getLB_VAL_CP((U8) *(s -1));
2826                while (s < strend) {
2827                    LB_enum after = getLB_VAL_CP((U8) *s);
2828                    if (to_complement ^ isLB(before,
2829                                             after,
2830                                             (U8*) reginfo->strbeg,
2831                                             (U8*) s,
2832                                             (U8*) reginfo->strend,
2833                                             0 /* target not utf8 */ )
2834                        && (reginfo->intuit || regtry(reginfo, &s)))
2835                    {
2836                        goto got_it;
2837                    }
2838                    before = after;
2839                    s++;
2840                }
2841            }
2842
2843            break;
2844
2845          case SB_BOUND:
2846            {
2847                SB_enum before = getSB_VAL_CP((U8) *(s -1));
2848                while (s < strend) {
2849                    SB_enum after = getSB_VAL_CP((U8) *s);
2850                    if ((to_complement ^ isSB(before,
2851                                              after,
2852                                              (U8*) reginfo->strbeg,
2853                                              (U8*) s,
2854                                              (U8*) reginfo->strend,
2855                                             0 /* target not utf8 */ ))
2856                        && (reginfo->intuit || regtry(reginfo, &s)))
2857                    {
2858                        goto got_it;
2859                    }
2860                    before = after;
2861                    s++;
2862                }
2863            }
2864
2865            break;
2866
2867          case WB_BOUND:
2868            {
2869                WB_enum previous = WB_UNKNOWN;
2870                WB_enum before = getWB_VAL_CP((U8) *(s -1));
2871                while (s < strend) {
2872                    WB_enum after = getWB_VAL_CP((U8) *s);
2873                    if ((to_complement ^ isWB(previous,
2874                                              before,
2875                                              after,
2876                                              (U8*) reginfo->strbeg,
2877                                              (U8*) s,
2878                                              (U8*) reginfo->strend,
2879                                               0 /* target not utf8 */ ))
2880                        && (reginfo->intuit || regtry(reginfo, &s)))
2881                    {
2882                        goto got_it;
2883                    }
2884                    previous = before;
2885                    before = after;
2886                    s++;
2887                }
2888            }
2889        }
2890
2891        /* Here are at the final position in the target string, which is a
2892         * boundary by definition, so matches, depending on other constraints.
2893         * */
2894        if (   reginfo->intuit
2895            || (s <= reginfo->strend && regtry(reginfo, &s)))
2896        {
2897            goto got_it;
2898        }
2899
2900        break;
2901
2902      case BOUNDL_t8_pb:
2903      case BOUNDL_t8_p8:
2904        CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
2905        if (FLAGS(c) == TRADITIONAL_BOUND) {
2906            FBC_BOUND_UTF8(isWORDCHAR_LC, isWORDCHAR_LC_uvchr,
2907                           isWORDCHAR_LC_utf8_safe);
2908            break;
2909        }
2910
2911        CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
2912
2913        to_complement = 1;
2914        goto do_boundu_utf8;
2915
2916      case NBOUNDL_t8_pb:
2917      case NBOUNDL_t8_p8:
2918        CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
2919        if (FLAGS(c) == TRADITIONAL_BOUND) {
2920            FBC_NBOUND_UTF8(isWORDCHAR_LC, isWORDCHAR_LC_uvchr,
2921                            isWORDCHAR_LC_utf8_safe);
2922            break;
2923        }
2924
2925        CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
2926
2927        to_complement = 1;
2928        goto do_boundu_utf8;
2929
2930      case NBOUND_t8_pb:
2931      case NBOUND_t8_p8:
2932        /* regcomp.c makes sure that these only have the traditional \b
2933         * meaning. */
2934        assert(FLAGS(c) == TRADITIONAL_BOUND);
2935
2936        /* FALLTHROUGH */
2937
2938      case NBOUNDU_t8_pb:
2939      case NBOUNDU_t8_p8:
2940        if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2941            FBC_NBOUND_UTF8(isWORDCHAR_L1, isWORDCHAR_uni,
2942                            isWORDCHAR_utf8_safe);
2943            break;
2944        }
2945
2946        to_complement = 1;
2947        goto do_boundu_utf8;
2948
2949      case BOUND_t8_pb:
2950      case BOUND_t8_p8:
2951        /* regcomp.c makes sure that these only have the traditional \b
2952         * meaning. */
2953        assert(FLAGS(c) == TRADITIONAL_BOUND);
2954
2955        /* FALLTHROUGH */
2956
2957      case BOUNDU_t8_pb:
2958      case BOUNDU_t8_p8:
2959        if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2960            FBC_BOUND_UTF8(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2961            break;
2962        }
2963
2964      do_boundu_utf8:
2965        if (s == reginfo->strbeg) {
2966            if (reginfo->intuit || regtry(reginfo, &s))
2967            {
2968                goto got_it;
2969            }
2970
2971            /* Didn't match.  Try at the next position (if there is one) */
2972            s += UTF8_SAFE_SKIP(s, reginfo->strend);
2973            if (UNLIKELY(s >= reginfo->strend)) {
2974                break;
2975            }
2976        }
2977
2978        switch((bound_type) FLAGS(c)) {
2979          case TRADITIONAL_BOUND: /* Should have already been handled */
2980            assert(0);
2981            break;
2982
2983          case GCB_BOUND:
2984            {
2985                GCB_enum before = getGCB_VAL_UTF8(
2986                                           reghop3((U8*)s, -1,
2987                                                   (U8*)(reginfo->strbeg)),
2988                                           (U8*) reginfo->strend);
2989                while (s < strend) {
2990                    GCB_enum after = getGCB_VAL_UTF8((U8*) s,
2991                                                    (U8*) reginfo->strend);
2992                    if (   (to_complement ^ isGCB(before,
2993                                                  after,
2994                                                  (U8*) reginfo->strbeg,
2995                                                  (U8*) s,
2996                                                  1 /* target is utf8 */ ))
2997                        && (reginfo->intuit || regtry(reginfo, &s)))
2998                    {
2999                        goto got_it;
3000                    }
3001                    before = after;
3002                    s += UTF8_SAFE_SKIP(s, reginfo->strend);
3003                }
3004            }
3005            break;
3006
3007          case LB_BOUND:
3008            {
3009                LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s,
3010                                                        -1,
3011                                                        (U8*)(reginfo->strbeg)),
3012                                                   (U8*) reginfo->strend);
3013                while (s < strend) {
3014                    LB_enum after = getLB_VAL_UTF8((U8*) s,
3015                                                   (U8*) reginfo->strend);
3016                    if (to_complement ^ isLB(before,
3017                                             after,
3018                                             (U8*) reginfo->strbeg,
3019                                             (U8*) s,
3020                                             (U8*) reginfo->strend,
3021                                             1 /* target is utf8 */ )
3022                        && (reginfo->intuit || regtry(reginfo, &s)))
3023                    {
3024                        goto got_it;
3025                    }
3026                    before = after;
3027                    s += UTF8_SAFE_SKIP(s, reginfo->strend);
3028                }
3029            }
3030
3031            break;
3032
3033          case SB_BOUND:
3034            {
3035                SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
3036                                                    -1,
3037                                                    (U8*)(reginfo->strbeg)),
3038                                                  (U8*) reginfo->strend);
3039                while (s < strend) {
3040                    SB_enum after = getSB_VAL_UTF8((U8*) s,
3041                                                     (U8*) reginfo->strend);
3042                    if ((to_complement ^ isSB(before,
3043                                              after,
3044                                              (U8*) reginfo->strbeg,
3045                                              (U8*) s,
3046                                              (U8*) reginfo->strend,
3047                                              1 /* target is utf8 */ ))
3048                        && (reginfo->intuit || regtry(reginfo, &s)))
3049                    {
3050                        goto got_it;
3051                    }
3052                    before = after;
3053                    s += UTF8_SAFE_SKIP(s, reginfo->strend);
3054                }
3055            }
3056
3057            break;
3058
3059          case WB_BOUND:
3060            {
3061                /* We are at a boundary between char_sub_0 and char_sub_1.
3062                 * We also keep track of the value for char_sub_-1 as we
3063                 * loop through the line.   Context may be needed to make a
3064                 * determination, and if so, this can save having to
3065                 * recalculate it */
3066                WB_enum previous = WB_UNKNOWN;
3067                WB_enum before = getWB_VAL_UTF8(
3068                                          reghop3((U8*)s,
3069                                                  -1,
3070                                                  (U8*)(reginfo->strbeg)),
3071                                          (U8*) reginfo->strend);
3072                while (s < strend) {
3073                    WB_enum after = getWB_VAL_UTF8((U8*) s,
3074                                                    (U8*) reginfo->strend);
3075                    if ((to_complement ^ isWB(previous,
3076                                              before,
3077                                              after,
3078                                              (U8*) reginfo->strbeg,
3079                                              (U8*) s,
3080                                              (U8*) reginfo->strend,
3081                                              1 /* target is utf8 */ ))
3082                        && (reginfo->intuit || regtry(reginfo, &s)))
3083                    {
3084                        goto got_it;
3085                    }
3086                    previous = before;
3087                    before = after;
3088                    s += UTF8_SAFE_SKIP(s, reginfo->strend);
3089                }
3090            }
3091        }
3092
3093        /* Here are at the final position in the target string, which is a
3094         * boundary by definition, so matches, depending on other constraints.
3095         * */
3096
3097        if (   reginfo->intuit
3098            || (s <= reginfo->strend && regtry(reginfo, &s)))
3099        {
3100            goto got_it;
3101        }
3102        break;
3103
3104      case LNBREAK_t8_pb:
3105      case LNBREAK_t8_p8:
3106        REXEC_FBC_UTF8_CLASS_SCAN(is_LNBREAK_utf8_safe(s, strend));
3107        break;
3108
3109      case LNBREAK_tb_pb:
3110      case LNBREAK_tb_p8:
3111        REXEC_FBC_NON_UTF8_CLASS_SCAN(is_LNBREAK_latin1_safe(s, strend));
3112        break;
3113
3114      /* The argument to all the POSIX node types is the class number to pass
3115       * to generic_isCC_() to build a mask for searching in PL_charclass[] */
3116
3117      case NPOSIXL_t8_pb:
3118      case NPOSIXL_t8_p8:
3119        to_complement = 1;
3120        /* FALLTHROUGH */
3121
3122      case POSIXL_t8_pb:
3123      case POSIXL_t8_p8:
3124        CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
3125        REXEC_FBC_UTF8_CLASS_SCAN(
3126            to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s,
3127                                                          (U8 *) strend)));
3128        break;
3129
3130      case NPOSIXL_tb_pb:
3131      case NPOSIXL_tb_p8:
3132        to_complement = 1;
3133        /* FALLTHROUGH */
3134
3135      case POSIXL_tb_pb:
3136      case POSIXL_tb_p8:
3137        CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
3138        REXEC_FBC_NON_UTF8_CLASS_SCAN(
3139                                to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
3140        break;
3141
3142      case NPOSIXA_t8_pb:
3143      case NPOSIXA_t8_p8:
3144        /* The complement of something that matches only ASCII matches all
3145         * non-ASCII, plus everything in ASCII that isn't in the class. */
3146        REXEC_FBC_UTF8_CLASS_SCAN(   ! isASCII_utf8_safe(s, strend)
3147                                  || ! generic_isCC_A_(*s, FLAGS(c)));
3148        break;
3149
3150      case POSIXA_t8_pb:
3151      case POSIXA_t8_p8:
3152        /* Don't need to worry about utf8, as it can match only a single
3153         * byte invariant character.  But we do anyway for performance reasons,
3154         * as otherwise we would have to examine all the continuation
3155         * characters */
3156        REXEC_FBC_UTF8_CLASS_SCAN(generic_isCC_A_(*s, FLAGS(c)));
3157        break;
3158
3159      case NPOSIXD_tb_pb:
3160      case NPOSIXD_tb_p8:
3161      case NPOSIXA_tb_pb:
3162      case NPOSIXA_tb_p8:
3163        to_complement = 1;
3164        /* FALLTHROUGH */
3165
3166      case POSIXD_tb_pb:
3167      case POSIXD_tb_p8:
3168      case POSIXA_tb_pb:
3169      case POSIXA_tb_p8:
3170        REXEC_FBC_NON_UTF8_CLASS_SCAN(
3171                        to_complement ^ cBOOL(generic_isCC_A_(*s, FLAGS(c))));
3172        break;
3173
3174      case NPOSIXU_tb_pb:
3175      case NPOSIXU_tb_p8:
3176        to_complement = 1;
3177        /* FALLTHROUGH */
3178
3179      case POSIXU_tb_pb:
3180      case POSIXU_tb_p8:
3181            REXEC_FBC_NON_UTF8_CLASS_SCAN(
3182                                 to_complement ^ cBOOL(generic_isCC_(*s,
3183                                                                    FLAGS(c))));
3184        break;
3185
3186      case NPOSIXD_t8_pb:
3187      case NPOSIXD_t8_p8:
3188      case NPOSIXU_t8_pb:
3189      case NPOSIXU_t8_p8:
3190        to_complement = 1;
3191        /* FALLTHROUGH */
3192
3193      case POSIXD_t8_pb:
3194      case POSIXD_t8_p8:
3195      case POSIXU_t8_pb:
3196      case POSIXU_t8_p8:
3197        classnum = (char_class_number_) FLAGS(c);
3198        switch (classnum) {
3199          default:
3200            REXEC_FBC_UTF8_CLASS_SCAN(
3201                        to_complement ^ cBOOL(_invlist_contains_cp(
3202                                                PL_XPosix_ptrs[classnum],
3203                                                utf8_to_uvchr_buf((U8 *) s,
3204                                                                (U8 *) strend,
3205                                                                NULL))));
3206            break;
3207
3208          case CC_ENUM_SPACE_:
3209            REXEC_FBC_UTF8_CLASS_SCAN(
3210                        to_complement ^ cBOOL(isSPACE_utf8_safe(s, strend)));
3211            break;
3212
3213          case CC_ENUM_BLANK_:
3214            REXEC_FBC_UTF8_CLASS_SCAN(
3215                        to_complement ^ cBOOL(isBLANK_utf8_safe(s, strend)));
3216            break;
3217
3218          case CC_ENUM_XDIGIT_:
3219            REXEC_FBC_UTF8_CLASS_SCAN(
3220                        to_complement ^ cBOOL(isXDIGIT_utf8_safe(s, strend)));
3221            break;
3222
3223          case CC_ENUM_VERTSPACE_:
3224            REXEC_FBC_UTF8_CLASS_SCAN(
3225                        to_complement ^ cBOOL(isVERTWS_utf8_safe(s, strend)));
3226            break;
3227
3228          case CC_ENUM_CNTRL_:
3229            REXEC_FBC_UTF8_CLASS_SCAN(
3230                        to_complement ^ cBOOL(isCNTRL_utf8_safe(s, strend)));
3231            break;
3232        }
3233        break;
3234
3235      case AHOCORASICKC_tb_pb:
3236      case AHOCORASICKC_tb_p8:
3237      case AHOCORASICKC_t8_pb:
3238      case AHOCORASICKC_t8_p8:
3239      case AHOCORASICK_tb_pb:
3240      case AHOCORASICK_tb_p8:
3241      case AHOCORASICK_t8_pb:
3242      case AHOCORASICK_t8_p8:
3243        {
3244            DECL_TRIE_TYPE(c);
3245            /* what trie are we using right now */
3246            reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG1u( c ) ];
3247            reg_trie_data *trie = (reg_trie_data*)progi->data->data[aho->trie];
3248            HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
3249
3250            const char *last_start = strend - trie->minlen;
3251#ifdef DEBUGGING
3252            const char *real_start = s;
3253#endif
3254            STRLEN maxlen = trie->maxlen;
3255            SV *sv_points;
3256            U8 **points; /* map of where we were in the input string
3257                            when reading a given char. For ASCII this
3258                            is unnecessary overhead as the relationship
3259                            is always 1:1, but for Unicode, especially
3260                            case folded Unicode this is not true. */
3261            U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3262            U8 *bitmap=NULL;
3263
3264
3265            DECLARE_AND_GET_RE_DEBUG_FLAGS;
3266
3267            /* We can't just allocate points here. We need to wrap it in
3268             * an SV so it gets freed properly if there is a croak while
3269             * running the match */
3270            ENTER;
3271            SAVETMPS;
3272            sv_points=newSV(maxlen * sizeof(U8 *));
3273            SvCUR_set(sv_points,
3274                maxlen * sizeof(U8 *));
3275            SvPOK_on(sv_points);
3276            sv_2mortal(sv_points);
3277            points=(U8**)SvPV_nolen(sv_points );
3278            if ( trie_type != trie_utf8_fold
3279                 && (trie->bitmap || OP(c)==AHOCORASICKC) )
3280            {
3281                if (trie->bitmap)
3282                    bitmap=(U8*)trie->bitmap;
3283                else
3284                    bitmap=(U8*)ANYOF_BITMAP(c);
3285            }
3286            /* this is the Aho-Corasick algorithm modified a touch
3287               to include special handling for long "unknown char" sequences.
3288               The basic idea being that we use AC as long as we are dealing
3289               with a possible matching char, when we encounter an unknown char
3290               (and we have not encountered an accepting state) we scan forward
3291               until we find a legal starting char.
3292               AC matching is basically that of trie matching, except that when
3293               we encounter a failing transition, we fall back to the current
3294               states "fail state", and try the current char again, a process
3295               we repeat until we reach the root state, state 1, or a legal
3296               transition. If we fail on the root state then we can either
3297               terminate if we have reached an accepting state previously, or
3298               restart the entire process from the beginning if we have not.
3299
3300             */
3301            while (s <= last_start) {
3302                const U32 uniflags = UTF8_ALLOW_DEFAULT;
3303                U8 *uc = (U8*)s;
3304                U16 charid = 0;
3305                U32 base = 1;
3306                U32 state = 1;
3307                UV uvc = 0;
3308                STRLEN len = 0;
3309                STRLEN foldlen = 0;
3310                U8 *uscan = (U8*)NULL;
3311                U8 *leftmost = NULL;
3312#ifdef DEBUGGING
3313                U32 accepted_word= 0;
3314#endif
3315                U32 pointpos = 0;
3316
3317                while ( state && uc <= (U8*)strend ) {
3318                    int failed=0;
3319                    U32 word = aho->states[ state ].wordnum;
3320
3321                    if( state==1 ) {
3322                        if ( bitmap ) {
3323                            DEBUG_TRIE_EXECUTE_r(
3324                                if (  uc <= (U8*)last_start
3325                                    && !BITMAP_TEST(bitmap,*uc) )
3326                                {
3327                                    dump_exec_pos( (char *)uc, c, strend,
3328                                        real_start,
3329                                        (char *)uc, utf8_target, 0 );
3330                                    Perl_re_printf( aTHX_
3331                                        " Scanning for legal start char...\n");
3332                                }
3333                            );
3334                            if (utf8_target) {
3335                                while (  uc <= (U8*)last_start
3336                                       && !BITMAP_TEST(bitmap,*uc) )
3337                                {
3338                                    uc += UTF8SKIP(uc);
3339                                }
3340                            } else {
3341                                while (  uc <= (U8*)last_start
3342                                       && ! BITMAP_TEST(bitmap,*uc) )
3343                                {
3344                                    uc++;
3345                                }
3346                            }
3347                            s= (char *)uc;
3348                        }
3349                        if (uc >(U8*)last_start) break;
3350                    }
3351
3352                    if ( word ) {
3353                        U8 *lpos= points[ (pointpos - trie->wordinfo[word].len)
3354                                                                    % maxlen ];
3355                        if (!leftmost || lpos < leftmost) {
3356                            DEBUG_r(accepted_word=word);
3357                            leftmost= lpos;
3358                        }
3359                        if (base==0) break;
3360
3361                    }
3362                    points[pointpos++ % maxlen]= uc;
3363                    if (foldlen || uc < (U8*)strend) {
3364                        REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3365                                             (U8 *) strend, uscan, len, uvc,
3366                                             charid, foldlen, foldbuf,
3367                                             uniflags);
3368                        DEBUG_TRIE_EXECUTE_r({
3369                            dump_exec_pos( (char *)uc, c, strend,
3370                                        real_start, s, utf8_target, 0);
3371                            Perl_re_printf( aTHX_
3372                                " Charid:%3u CP:%4" UVxf " ",
3373                                 charid, uvc);
3374                        });
3375                    }
3376                    else {
3377                        len = 0;
3378                        charid = 0;
3379                    }
3380
3381
3382                    do {
3383#ifdef DEBUGGING
3384                        word = aho->states[ state ].wordnum;
3385#endif
3386                        base = aho->states[ state ].trans.base;
3387
3388                        DEBUG_TRIE_EXECUTE_r({
3389                            if (failed)
3390                                dump_exec_pos((char *)uc, c, strend, real_start,
3391                                    s,   utf8_target, 0 );
3392                            Perl_re_printf( aTHX_
3393                                "%sState: %4" UVxf ", word=%" UVxf,
3394                                failed ? " Fail transition to " : "",
3395                                (UV)state, (UV)word);
3396                        });
3397                        if ( base ) {
3398                            U32 tmp;
3399                            I32 offset;
3400                            if (charid &&
3401                                 ( ((offset = base + charid
3402                                    - 1 - trie->uniquecharcount)) >= 0)
3403                                 && ((U32)offset < trie->lasttrans)
3404                                 && trie->trans[offset].check == state
3405                                 && (tmp=trie->trans[offset].next))
3406                            {
3407                                DEBUG_TRIE_EXECUTE_r(
3408                                    Perl_re_printf( aTHX_ " - legal\n"));
3409                                state = tmp;
3410                                break;
3411                            }
3412                            else {
3413                                DEBUG_TRIE_EXECUTE_r(
3414                                    Perl_re_printf( aTHX_ " - fail\n"));
3415                                failed = 1;
3416                                state = aho->fail[state];
3417                            }
3418                        }
3419                        else {
3420                            /* we must be accepting here */
3421                            DEBUG_TRIE_EXECUTE_r(
3422                                    Perl_re_printf( aTHX_ " - accepting\n"));
3423                            failed = 1;
3424                            break;
3425                        }
3426                    } while(state);
3427                    uc += len;
3428                    if (failed) {
3429                        if (leftmost)
3430                            break;
3431                        if (!state) state = 1;
3432                    }
3433                }
3434                if ( aho->states[ state ].wordnum ) {
3435                    U8 *lpos = points[ (pointpos
3436                                      - trie->wordinfo[aho->states[ state ]
3437                                                    .wordnum].len) % maxlen ];
3438                    if (!leftmost || lpos < leftmost) {
3439                        DEBUG_r(accepted_word=aho->states[ state ].wordnum);
3440                        leftmost = lpos;
3441                    }
3442                }
3443                if (leftmost) {
3444                    s = (char*)leftmost;
3445                    DEBUG_TRIE_EXECUTE_r({
3446                        Perl_re_printf( aTHX_  "Matches word #%" UVxf
3447                                        " at position %" IVdf ". Trying full"
3448                                        " pattern...\n",
3449                            (UV)accepted_word, (IV)(s - real_start)
3450                        );
3451                    });
3452                    if (reginfo->intuit || regtry(reginfo, &s)) {
3453                        FREETMPS;
3454                        LEAVE;
3455                        goto got_it;
3456                    }
3457                    if (s < reginfo->strend) {
3458                        s = HOPc(s,1);
3459                    }
3460                    DEBUG_TRIE_EXECUTE_r({
3461                        Perl_re_printf( aTHX_
3462                                       "Pattern failed. Looking for new start"
3463                                       " point...\n");
3464                    });
3465                } else {
3466                    DEBUG_TRIE_EXECUTE_r(
3467                        Perl_re_printf( aTHX_ "No match.\n"));
3468                    break;
3469                }
3470            }
3471            FREETMPS;
3472            LEAVE;
3473        }
3474        break;
3475
3476      case EXACTFU_REQ8_t8_pb:
3477      case EXACTFUP_tb_p8:
3478      case EXACTFUP_t8_p8:
3479      case EXACTF_tb_p8:
3480      case EXACTF_t8_p8:   /* This node only generated for non-utf8 patterns */
3481      case EXACTFAA_NO_TRIE_tb_p8:
3482      case EXACTFAA_NO_TRIE_t8_p8: /* This node only generated for non-utf8
3483                                      patterns */
3484        assert(0);
3485
3486      default:
3487        Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
3488    } /* End of switch on node type */
3489
3490    return 0;
3491
3492  got_it:
3493    return s;
3494}
3495
3496/* set RX_SAVED_COPY, RX_SUBBEG etc.
3497 * flags have same meanings as with regexec_flags() */
3498
3499static void
3500S_reg_set_capture_string(pTHX_ REGEXP * const rx,
3501                            char *strbeg,
3502                            char *strend,
3503                            SV *sv,
3504                            U32 flags,
3505                            bool utf8_target)
3506{
3507    struct regexp *const prog = ReANY(rx);
3508
3509    if (flags & REXEC_COPY_STR) {
3510#ifdef PERL_ANY_COW
3511        if (SvCANCOW(sv)) {
3512            DEBUG_C(Perl_re_printf( aTHX_
3513                              "Copy on write: regexp capture, type %d\n",
3514                                    (int) SvTYPE(sv)));
3515            /* Create a new COW SV to share the match string and store
3516             * in saved_copy, unless the current COW SV in saved_copy
3517             * is valid and suitable for our purpose */
3518            if ((   RXp_SAVED_COPY(prog)
3519                 && SvIsCOW(RXp_SAVED_COPY(prog))
3520                 && SvPOKp(RXp_SAVED_COPY(prog))
3521                 && SvIsCOW(sv)
3522                 && SvPOKp(sv)
3523                 && SvPVX(sv) == SvPVX(RXp_SAVED_COPY(prog))))
3524            {
3525                /* just reuse saved_copy SV */
3526                if (RXp_MATCH_COPIED(prog)) {
3527                    Safefree(RXp_SUBBEG(prog));
3528                    RXp_MATCH_COPIED_off(prog);
3529                }
3530            }
3531            else {
3532                /* create new COW SV to share string */
3533                RXp_MATCH_COPY_FREE(prog);
3534                RXp_SAVED_COPY(prog) = sv_setsv_cow(RXp_SAVED_COPY(prog), sv);
3535            }
3536            RXp_SUBBEG(prog) = (char *)SvPVX_const(RXp_SAVED_COPY(prog));
3537            assert (SvPOKp(RXp_SAVED_COPY(prog)));
3538            RXp_SUBLEN(prog)  = strend - strbeg;
3539            RXp_SUBOFFSET(prog) = 0;
3540            RXp_SUBCOFFSET(prog) = 0;
3541        } else
3542#endif
3543        {
3544            SSize_t min = 0;
3545            SSize_t max = strend - strbeg;
3546            SSize_t sublen;
3547
3548            if (    (flags & REXEC_COPY_SKIP_POST)
3549                && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
3550                && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
3551            ) { /* don't copy $' part of string */
3552                SSize_t offs_end;
3553                U32 n = 0;
3554                max = -1;
3555                /* calculate the right-most part of the string covered
3556                 * by a capture. Due to lookahead, this may be to
3557                 * the right of $&, so we have to scan all captures */
3558                while (n <= RXp_LASTPAREN(prog)) {
3559                    if ((offs_end = RXp_OFFS_END(prog,n)) > max)
3560                        max = offs_end;
3561                    n++;
3562                }
3563                if (max == -1)
3564                    max = (PL_sawampersand & SAWAMPERSAND_LEFT)
3565                            ? RXp_OFFS_START(prog,0)
3566                            : 0;
3567                assert(max >= 0 && max <= strend - strbeg);
3568            }
3569
3570            if (    (flags & REXEC_COPY_SKIP_PRE)
3571                && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
3572                && !(PL_sawampersand & SAWAMPERSAND_LEFT)
3573            ) { /* don't copy $` part of string */
3574                U32 n = 0;
3575                min = max;
3576                /* calculate the left-most part of the string covered
3577                 * by a capture. Due to lookbehind, this may be to
3578                 * the left of $&, so we have to scan all captures */
3579                while (min && n <= RXp_LASTPAREN(prog)) {
3580                    I32 start = RXp_OFFS_START(prog,n);
3581                    if (   start != -1
3582                        && start < min)
3583                    {
3584                        min = start;
3585                    }
3586                    n++;
3587                }
3588                if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
3589                    && min >  RXp_OFFS_END(prog,0)
3590                )
3591                    min = RXp_OFFS_END(prog,0);
3592
3593            }
3594
3595            assert(min >= 0 && min <= max && min <= strend - strbeg);
3596            sublen = max - min;
3597
3598            if (RXp_MATCH_COPIED(prog)) {
3599                if (sublen > RXp_SUBLEN(prog))
3600                    RXp_SUBBEG(prog) =
3601                            (char*)saferealloc(RXp_SUBBEG(prog), sublen+1);
3602            }
3603            else
3604                RXp_SUBBEG(prog) = (char*)safemalloc(sublen+1);
3605            Copy(strbeg + min, RXp_SUBBEG(prog), sublen, char);
3606            RXp_SUBBEG(prog)[sublen] = '\0';
3607            RXp_SUBOFFSET(prog) = min;
3608            RXp_SUBLEN(prog) = sublen;
3609            RXp_MATCH_COPIED_on(prog);
3610        }
3611        RXp_SUBCOFFSET(prog) = RXp_SUBOFFSET(prog);
3612        if (RXp_SUBOFFSET(prog) && utf8_target) {
3613            /* Convert byte offset to chars.
3614             * XXX ideally should only compute this if @-/@+
3615             * has been seen, a la PL_sawampersand ??? */
3616
3617            /* If there's a direct correspondence between the
3618             * string which we're matching and the original SV,
3619             * then we can use the utf8 len cache associated with
3620             * the SV. In particular, it means that under //g,
3621             * sv_pos_b2u() will use the previously cached
3622             * position to speed up working out the new length of
3623             * subcoffset, rather than counting from the start of
3624             * the string each time. This stops
3625             *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
3626             * from going quadratic */
3627            if (SvPOKp(sv) && SvPVX(sv) == strbeg)
3628                RXp_SUBCOFFSET(prog) = sv_pos_b2u_flags(sv, RXp_SUBCOFFSET(prog),
3629                                                SV_GMAGIC|SV_CONST_RETURN);
3630            else
3631                RXp_SUBCOFFSET(prog) = utf8_length((U8*)strbeg,
3632                                    (U8*)(strbeg+RXp_SUBOFFSET(prog)));
3633        }
3634    }
3635    else {
3636        RXp_MATCH_COPY_FREE(prog);
3637        RXp_SUBBEG(prog) = strbeg;
3638        RXp_SUBOFFSET(prog) = 0;
3639        RXp_SUBCOFFSET(prog) = 0;
3640        RXp_SUBLEN(prog) = strend - strbeg;
3641    }
3642}
3643
3644
3645
3646
3647/*
3648 - regexec_flags - match a regexp against a string
3649 */
3650I32
3651Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
3652              char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
3653/* stringarg: the point in the string at which to begin matching */
3654/* strend:    pointer to null at end of string */
3655/* strbeg:    real beginning of string */
3656/* minend:    end of match must be >= minend bytes after stringarg. */
3657/* sv:        SV being matched: only used for utf8 flag, pos() etc; string
3658 *            itself is accessed via the pointers above */
3659/* data:      May be used for some additional optimizations.
3660              Currently unused. */
3661/* flags:     For optimizations. See REXEC_* in regexp.h */
3662
3663{
3664    struct regexp *const prog = ReANY(rx);
3665    char *s;
3666    regnode *c;
3667    char *startpos;
3668    SSize_t minlen;		/* must match at least this many chars */
3669    SSize_t dontbother = 0;	/* how many characters not to try at end */
3670    const bool utf8_target = cBOOL(DO_UTF8(sv));
3671    I32 multiline;
3672    RXi_GET_DECL(prog,progi);
3673    regmatch_info reginfo_buf;  /* create some info to pass to regtry etc */
3674    regmatch_info *const reginfo = &reginfo_buf;
3675    regexp_paren_pair *swap = NULL;
3676    I32 oldsave;
3677    DECLARE_AND_GET_RE_DEBUG_FLAGS;
3678
3679    PERL_ARGS_ASSERT_REGEXEC_FLAGS;
3680    PERL_UNUSED_ARG(data);
3681
3682    /* Be paranoid... */
3683    if (prog == NULL) {
3684        Perl_croak(aTHX_ "NULL regexp parameter");
3685    }
3686
3687    DEBUG_EXECUTE_r(
3688        debug_start_match(rx, utf8_target, stringarg, strend,
3689        "Matching");
3690    );
3691
3692    startpos = stringarg;
3693
3694    /* set these early as they may be used by the HOP macros below */
3695    reginfo->strbeg = strbeg;
3696    reginfo->strend = strend;
3697    reginfo->is_utf8_target = cBOOL(utf8_target);
3698
3699    if (prog->intflags & PREGf_GPOS_SEEN) {
3700        MAGIC *mg;
3701
3702        /* set reginfo->ganch, the position where \G can match */
3703
3704        reginfo->ganch =
3705            (flags & REXEC_IGNOREPOS)
3706            ? stringarg /* use start pos rather than pos() */
3707            : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
3708              /* Defined pos(): */
3709            ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
3710            : strbeg; /* pos() not defined; use start of string */
3711
3712        DEBUG_GPOS_r(Perl_re_printf( aTHX_
3713            "GPOS ganch set to strbeg[%" IVdf "]\n", (IV)(reginfo->ganch - strbeg)));
3714
3715        /* in the presence of \G, we may need to start looking earlier in
3716         * the string than the suggested start point of stringarg:
3717         * if prog->gofs is set, then that's a known, fixed minimum
3718         * offset, such as
3719         * /..\G/:   gofs = 2
3720         * /ab|c\G/: gofs = 1
3721         * or if the minimum offset isn't known, then we have to go back
3722         * to the start of the string, e.g. /w+\G/
3723         */
3724
3725        if (prog->intflags & PREGf_ANCH_GPOS) {
3726            if (prog->gofs) {
3727                startpos = HOPBACKc(reginfo->ganch, prog->gofs);
3728                if (!startpos ||
3729                    ((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg))
3730                {
3731                    DEBUG_GPOS_r(Perl_re_printf( aTHX_
3732                            "fail: ganch-gofs before earliest possible start\n"));
3733                    return 0;
3734                }
3735            }
3736            else
3737                startpos = reginfo->ganch;
3738        }
3739        else if (prog->gofs) {
3740            startpos = HOPBACKc(startpos, prog->gofs);
3741            if (!startpos)
3742                startpos = strbeg;
3743        }
3744        else if (prog->intflags & PREGf_GPOS_FLOAT)
3745            startpos = strbeg;
3746    }
3747
3748    minlen = prog->minlen;
3749    if ((startpos + minlen) > strend || startpos < strbeg) {
3750        DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3751                        "Regex match can't succeed, so not even tried\n"));
3752        return 0;
3753    }
3754
3755    /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
3756     * which will call destuctors to reset PL_regmatch_state, free higher
3757     * PL_regmatch_slabs, and clean up regmatch_info_aux and
3758     * regmatch_info_aux_eval */
3759
3760    oldsave = PL_savestack_ix;
3761
3762    s = startpos;
3763
3764    if ((prog->extflags & RXf_USE_INTUIT)
3765        && !(flags & REXEC_CHECKED))
3766    {
3767        s = re_intuit_start(rx, sv, strbeg, startpos, strend,
3768                                    flags, NULL);
3769        if (!s)
3770            return 0;
3771
3772        if (prog->extflags & RXf_CHECK_ALL) {
3773            /* we can match based purely on the result of INTUIT.
3774             * Set up captures etc just for $& and $-[0]
3775             * (an intuit-only match wont have $1,$2,..) */
3776            assert(!prog->nparens);
3777
3778            /* s/// doesn't like it if $& is earlier than where we asked it to
3779             * start searching (which can happen on something like /.\G/) */
3780            if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
3781                    && (s < stringarg))
3782            {
3783                /* this should only be possible under \G */
3784                assert(prog->intflags & PREGf_GPOS_SEEN);
3785                DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3786                    "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3787                goto phooey;
3788            }
3789
3790            /* match via INTUIT shouldn't have any captures.
3791             * Let @-, @+, $^N know */
3792            RXp_LASTPAREN(prog) = RXp_LASTCLOSEPAREN(prog) = 0;
3793            RXp_MATCH_UTF8_set(prog, utf8_target);
3794            SSize_t match_start = s - strbeg;
3795            SSize_t match_end = utf8_target
3796                ? (char*)utf8_hop_forward((U8*)s, prog->minlenret, (U8 *) strend) - strbeg
3797                : s - strbeg + prog->minlenret;
3798            CLOSE_ANY_CAPTURE(prog, 0, match_start, match_end);
3799            if ( !(flags & REXEC_NOT_FIRST) )
3800                S_reg_set_capture_string(aTHX_ rx,
3801                                        strbeg, strend,
3802                                        sv, flags, utf8_target);
3803
3804            return 1;
3805        }
3806    }
3807
3808    multiline = prog->extflags & RXf_PMf_MULTILINE;
3809
3810    if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
3811        DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3812                              "String too short [regexec_flags]...\n"));
3813        goto phooey;
3814    }
3815
3816    /* Check validity of program. */
3817    if (UCHARAT(progi->program) != REG_MAGIC) {
3818        Perl_croak(aTHX_ "corrupted regexp program");
3819    }
3820
3821    RXp_MATCH_TAINTED_off(prog);
3822    RXp_MATCH_UTF8_set(prog, utf8_target);
3823
3824    reginfo->prog = rx;	 /* Yes, sorry that this is confusing.  */
3825    reginfo->intuit = 0;
3826    reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
3827    reginfo->warned = FALSE;
3828    reginfo->sv = sv;
3829    reginfo->poscache_maxiter = 0; /* not yet started a countdown */
3830    /* see how far we have to get to not match where we matched before */
3831    reginfo->till = stringarg + minend;
3832
3833    if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
3834        /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
3835           S_cleanup_regmatch_info_aux has executed (registered by
3836           SAVEDESTRUCTOR_X below).  S_cleanup_regmatch_info_aux modifies
3837           magic belonging to this SV.
3838           Not newSVsv, either, as it does not COW.
3839        */
3840        reginfo->sv = newSV_type(SVt_NULL);
3841        SvSetSV_nosteal(reginfo->sv, sv);
3842        SAVEFREESV(reginfo->sv);
3843    }
3844
3845    /* reserve next 2 or 3 slots in PL_regmatch_state:
3846     * slot N+0: may currently be in use: skip it
3847     * slot N+1: use for regmatch_info_aux struct
3848     * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
3849     * slot N+3: ready for use by regmatch()
3850     */
3851
3852    {
3853        regmatch_state *old_regmatch_state;
3854        regmatch_slab  *old_regmatch_slab;
3855        int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
3856
3857        /* on first ever match, allocate first slab */
3858        if (!PL_regmatch_slab) {
3859            Newx(PL_regmatch_slab, 1, regmatch_slab);
3860            PL_regmatch_slab->prev = NULL;
3861            PL_regmatch_slab->next = NULL;
3862            PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3863        }
3864
3865        old_regmatch_state = PL_regmatch_state;
3866        old_regmatch_slab  = PL_regmatch_slab;
3867
3868        for (i=0; i <= max; i++) {
3869            if (i == 1)
3870                reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
3871            else if (i ==2)
3872                reginfo->info_aux_eval =
3873                reginfo->info_aux->info_aux_eval =
3874                            &(PL_regmatch_state->u.info_aux_eval);
3875
3876            if (++PL_regmatch_state >  SLAB_LAST(PL_regmatch_slab))
3877                PL_regmatch_state = S_push_slab(aTHX);
3878        }
3879
3880        /* note initial PL_regmatch_state position; at end of match we'll
3881         * pop back to there and free any higher slabs */
3882
3883        reginfo->info_aux->old_regmatch_state = old_regmatch_state;
3884        reginfo->info_aux->old_regmatch_slab  = old_regmatch_slab;
3885        reginfo->info_aux->poscache = NULL;
3886
3887        SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
3888
3889        if ((prog->extflags & RXf_EVAL_SEEN))
3890            S_setup_eval_state(aTHX_ reginfo);
3891        else
3892            reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
3893    }
3894
3895    if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
3896        /* We have to be careful. If the previous successful match
3897           was from this regex we don't want a subsequent partially
3898           successful match to clobber the old results.
3899           So when we detect this possibility we add a swap buffer
3900           to the re, and switch the buffer each match. If we fail,
3901           we switch it back; otherwise we leave it swapped.
3902        */
3903        swap = RXp_OFFSp(prog);
3904        /* avoid leak if we die, or clean up anyway if match completes */
3905        SAVEFREEPV(swap);
3906        Newxz(RXp_OFFSp(prog), (prog->nparens + 1), regexp_paren_pair);
3907        DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
3908            "rex=0x%" UVxf " saving  offs: orig=0x%" UVxf " new=0x%" UVxf "\n",
3909            0,
3910            PTR2UV(prog),
3911            PTR2UV(swap),
3912            PTR2UV(RXp_OFFSp(prog))
3913        ));
3914    }
3915
3916    if (prog->recurse_locinput)
3917        Zero(prog->recurse_locinput,prog->nparens + 1, char *);
3918
3919    /* Simplest case: anchored match (but not \G) need be tried only once,
3920     * or with MBOL, only at the beginning of each line.
3921     *
3922     * Note that /.*.../ sets PREGf_IMPLICIT|MBOL, while /.*.../s sets
3923     * PREGf_IMPLICIT|SBOL. The idea is that with /.*.../s, if it doesn't
3924     * match at the start of the string then it won't match anywhere else
3925     * either; while with /.*.../, if it doesn't match at the beginning,
3926     * the earliest it could match is at the start of the next line */
3927
3928    if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
3929        char *end;
3930
3931        if (regtry(reginfo, &s))
3932            goto got_it;
3933
3934        if (!(prog->intflags & PREGf_ANCH_MBOL))
3935            goto phooey;
3936
3937        /* didn't match at start, try at other newline positions */
3938
3939        if (minlen)
3940            dontbother = minlen - 1;
3941        end = HOP3c(strend, -dontbother, strbeg) - 1;
3942
3943        /* skip to next newline */
3944
3945        while (s <= end) { /* note it could be possible to match at the end of the string */
3946            /* NB: newlines are the same in unicode as they are in latin */
3947            if (*s++ != '\n')
3948                continue;
3949            if (prog->check_substr || prog->check_utf8) {
3950            /* note that with PREGf_IMPLICIT, intuit can only fail
3951             * or return the start position, so it's of limited utility.
3952             * Nevertheless, I made the decision that the potential for
3953             * quick fail was still worth it - DAPM */
3954                s = re_intuit_start(rx, sv, strbeg, s, strend, flags, NULL);
3955                if (!s)
3956                    goto phooey;
3957            }
3958            if (regtry(reginfo, &s))
3959                goto got_it;
3960        }
3961        goto phooey;
3962    } /* end anchored search */
3963
3964    /* anchored \G match */
3965    if (prog->intflags & PREGf_ANCH_GPOS)
3966    {
3967        /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
3968        assert(prog->intflags & PREGf_GPOS_SEEN);
3969        /* For anchored \G, the only position it can match from is
3970         * (ganch-gofs); we already set startpos to this above; if intuit
3971         * moved us on from there, we can't possibly succeed */
3972        assert(startpos == HOPBACKc(reginfo->ganch, prog->gofs));
3973        if (s == startpos && regtry(reginfo, &s))
3974            goto got_it;
3975        goto phooey;
3976    }
3977
3978    /* Messy cases:  unanchored match. */
3979
3980    if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
3981        /* we have /x+whatever/ */
3982        /* it must be a one character string (XXXX Except is_utf8_pat?) */
3983        char ch;
3984#ifdef DEBUGGING
3985        int did_match = 0;
3986#endif
3987        if (utf8_target) {
3988            if (! prog->anchored_utf8) {
3989                to_utf8_substr(prog);
3990            }
3991            ch = SvPVX_const(prog->anchored_utf8)[0];
3992            REXEC_FBC_UTF8_SCAN(
3993                if (*s == ch) {
3994                    DEBUG_EXECUTE_r( did_match = 1 );
3995                    if (regtry(reginfo, &s)) goto got_it;
3996                    s += UTF8_SAFE_SKIP(s, strend);
3997                    while (s < strend && *s == ch)
3998                        s += UTF8SKIP(s);
3999                }
4000            );
4001
4002        }
4003        else {
4004            if (! prog->anchored_substr) {
4005                if (! to_byte_substr(prog)) {
4006                    NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
4007                }
4008            }
4009            ch = SvPVX_const(prog->anchored_substr)[0];
4010            REXEC_FBC_NON_UTF8_SCAN(
4011                if (*s == ch) {
4012                    DEBUG_EXECUTE_r( did_match = 1 );
4013                    if (regtry(reginfo, &s)) goto got_it;
4014                    s++;
4015                    while (s < strend && *s == ch)
4016                        s++;
4017                }
4018            );
4019        }
4020        DEBUG_EXECUTE_r(if (!did_match)
4021                Perl_re_printf( aTHX_
4022                                  "Did not find anchored character...\n")
4023               );
4024    }
4025    else if (prog->anchored_substr != NULL
4026              || prog->anchored_utf8 != NULL
4027              || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
4028                  && prog->float_max_offset < strend - s)) {
4029        SV *must;
4030        SSize_t back_max;
4031        SSize_t back_min;
4032        char *last;
4033        char *last1;		/* Last position checked before */
4034#ifdef DEBUGGING
4035        int did_match = 0;
4036#endif
4037        if (prog->anchored_substr || prog->anchored_utf8) {
4038            if (utf8_target) {
4039                if (! prog->anchored_utf8) {
4040                    to_utf8_substr(prog);
4041                }
4042                must = prog->anchored_utf8;
4043            }
4044            else {
4045                if (! prog->anchored_substr) {
4046                    if (! to_byte_substr(prog)) {
4047                        NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
4048                    }
4049                }
4050                must = prog->anchored_substr;
4051            }
4052            back_max = back_min = prog->anchored_offset;
4053        } else {
4054            if (utf8_target) {
4055                if (! prog->float_utf8) {
4056                    to_utf8_substr(prog);
4057                }
4058                must = prog->float_utf8;
4059            }
4060            else {
4061                if (! prog->float_substr) {
4062                    if (! to_byte_substr(prog)) {
4063                        NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
4064                    }
4065                }
4066                must = prog->float_substr;
4067            }
4068            back_max = prog->float_max_offset;
4069            back_min = prog->float_min_offset;
4070        }
4071
4072        if (back_min<0) {
4073            last = strend;
4074        } else {
4075            last = HOP3c(strend,	/* Cannot start after this */
4076                  -(SSize_t)(CHR_SVLEN(must)
4077                         - (SvTAIL(must) != 0) + back_min), strbeg);
4078        }
4079        if (s > reginfo->strbeg)
4080            last1 = HOPc(s, -1);
4081        else
4082            last1 = s - 1;	/* bogus */
4083
4084        /* XXXX check_substr already used to find "s", can optimize if
4085           check_substr==must. */
4086        dontbother = 0;
4087        strend = HOPc(strend, -dontbother);
4088        while ( (s <= last) &&
4089                (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg,  strend),
4090                                  (unsigned char*)strend, must,
4091                                  multiline ? FBMrf_MULTILINE : 0)) ) {
4092            DEBUG_EXECUTE_r( did_match = 1 );
4093            if (HOPc(s, -back_max) > last1) {
4094                last1 = HOPc(s, -back_min);
4095                s = HOPc(s, -back_max);
4096            }
4097            else {
4098                char * const t = (last1 >= reginfo->strbeg)
4099                                    ? HOPc(last1, 1) : last1 + 1;
4100
4101                last1 = HOPc(s, -back_min);
4102                s = t;
4103            }
4104            if (utf8_target) {
4105                while (s <= last1) {
4106                    if (regtry(reginfo, &s))
4107                        goto got_it;
4108                    if (s >= last1) {
4109                        s++; /* to break out of outer loop */
4110                        break;
4111                    }
4112                    s += UTF8SKIP(s);
4113                }
4114            }
4115            else {
4116                while (s <= last1) {
4117                    if (regtry(reginfo, &s))
4118                        goto got_it;
4119                    s++;
4120                }
4121            }
4122        }
4123        DEBUG_EXECUTE_r(if (!did_match) {
4124            RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
4125                SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
4126            Perl_re_printf( aTHX_  "Did not find %s substr %s%s...\n",
4127                              ((must == prog->anchored_substr || must == prog->anchored_utf8)
4128                               ? "anchored" : "floating"),
4129                quoted, RE_SV_TAIL(must));
4130        });
4131        goto phooey;
4132    }
4133    else if ( (c = progi->regstclass) ) {
4134        if (minlen) {
4135            const OPCODE op = OP(progi->regstclass);
4136            /* don't bother with what can't match */
4137            if (REGNODE_TYPE(op) != EXACT && REGNODE_TYPE(op) != TRIE)
4138                strend = HOPc(strend, -(minlen - 1));
4139        }
4140        DEBUG_EXECUTE_r({
4141            SV * const prop = sv_newmortal();
4142            regprop(prog, prop, c, reginfo, NULL);
4143            {
4144                RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
4145                    s,strend-s,PL_dump_re_max_len);
4146                Perl_re_printf( aTHX_
4147                    "Matching stclass %.*s against %s (%d bytes)\n",
4148                    (int)SvCUR(prop), SvPVX_const(prop),
4149                     quoted, (int)(strend - s));
4150            }
4151        });
4152        if (find_byclass(prog, c, s, strend, reginfo))
4153            goto got_it;
4154        DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "Contradicts stclass... [regexec_flags]\n"));
4155    }
4156    else {
4157        dontbother = 0;
4158        if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
4159            /* Trim the end. */
4160            char *last= NULL;
4161            SV* float_real;
4162            STRLEN len;
4163            const char *little;
4164
4165            if (utf8_target) {
4166                if (! prog->float_utf8) {
4167                    to_utf8_substr(prog);
4168                }
4169                float_real = prog->float_utf8;
4170            }
4171            else {
4172                if (! prog->float_substr) {
4173                    if (! to_byte_substr(prog)) {
4174                        NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
4175                    }
4176                }
4177                float_real = prog->float_substr;
4178            }
4179
4180            little = SvPV_const(float_real, len);
4181            if (SvTAIL(float_real)) {
4182                    /* This means that float_real contains an artificial \n on
4183                     * the end due to the presence of something like this:
4184                     * /foo$/ where we can match both "foo" and "foo\n" at the
4185                     * end of the string.  So we have to compare the end of the
4186                     * string first against the float_real without the \n and
4187                     * then against the full float_real with the string.  We
4188                     * have to watch out for cases where the string might be
4189                     * smaller than the float_real or the float_real without
4190                     * the \n. */
4191                    char *checkpos= strend - len;
4192                    DEBUG_OPTIMISE_r(
4193                        Perl_re_printf( aTHX_
4194                            "%sChecking for float_real.%s\n",
4195                            PL_colors[4], PL_colors[5]));
4196                    if (checkpos + 1 < strbeg) {
4197                        /* can't match, even if we remove the trailing \n
4198                         * string is too short to match */
4199                        DEBUG_EXECUTE_r(
4200                            Perl_re_printf( aTHX_
4201                                "%sString shorter than required trailing substring, cannot match.%s\n",
4202                                PL_colors[4], PL_colors[5]));
4203                        goto phooey;
4204                    } else if (memEQ(checkpos + 1, little, len - 1)) {
4205                        /* can match, the end of the string matches without the
4206                         * "\n" */
4207                        last = checkpos + 1;
4208                    } else if (checkpos < strbeg) {
4209                        /* cant match, string is too short when the "\n" is
4210                         * included */
4211                        DEBUG_EXECUTE_r(
4212                            Perl_re_printf( aTHX_
4213                                "%sString does not contain required trailing substring, cannot match.%s\n",
4214                                PL_colors[4], PL_colors[5]));
4215                        goto phooey;
4216                    } else if (!multiline) {
4217                        /* non multiline match, so compare with the "\n" at the
4218                         * end of the string */
4219                        if (memEQ(checkpos, little, len)) {
4220                            last= checkpos;
4221                        } else {
4222                            DEBUG_EXECUTE_r(
4223                                Perl_re_printf( aTHX_
4224                                    "%sString does not contain required trailing substring, cannot match.%s\n",
4225                                    PL_colors[4], PL_colors[5]));
4226                            goto phooey;
4227                        }
4228                    } else {
4229                        /* multiline match, so we have to search for a place
4230                         * where the full string is located */
4231                        goto find_last;
4232                    }
4233            } else {
4234                  find_last:
4235                    if (len)
4236                        last = rninstr(s, strend, little, little + len);
4237                    else
4238                        last = strend;	/* matching "$" */
4239            }
4240            if (!last) {
4241                /* at one point this block contained a comment which was
4242                 * probably incorrect, which said that this was a "should not
4243                 * happen" case.  Even if it was true when it was written I am
4244                 * pretty sure it is not anymore, so I have removed the comment
4245                 * and replaced it with this one. Yves */
4246                DEBUG_EXECUTE_r(
4247                    Perl_re_printf( aTHX_
4248                        "%sString does not contain required substring, cannot match.%s\n",
4249                        PL_colors[4], PL_colors[5]
4250                    ));
4251                goto phooey;
4252            }
4253            dontbother = strend - last + prog->float_min_offset;
4254        }
4255        if (minlen && (dontbother < minlen))
4256            dontbother = minlen - 1;
4257        strend -= dontbother; 		   /* this one's always in bytes! */
4258        /* We don't know much -- general case. */
4259        if (utf8_target) {
4260            for (;;) {
4261                if (regtry(reginfo, &s))
4262                    goto got_it;
4263                if (s >= strend)
4264                    break;
4265                s += UTF8SKIP(s);
4266            };
4267        }
4268        else {
4269            do {
4270                if (regtry(reginfo, &s))
4271                    goto got_it;
4272            } while (s++ < strend);
4273        }
4274    }
4275
4276    /* Failure. */
4277    goto phooey;
4278
4279  got_it:
4280    /* s/// doesn't like it if $& is earlier than where we asked it to
4281     * start searching (which can happen on something like /.\G/) */
4282    if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
4283            && (RXp_OFFS_START(prog,0) < stringarg - strbeg))
4284    {
4285        /* this should only be possible under \G */
4286        assert(prog->intflags & PREGf_GPOS_SEEN);
4287        DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
4288            "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
4289        goto phooey;
4290    }
4291
4292    /* clean up; this will trigger destructors that will free all slabs
4293     * above the current one, and cleanup the regmatch_info_aux
4294     * and regmatch_info_aux_eval sructs */
4295
4296    LEAVE_SCOPE(oldsave);
4297
4298    if (RXp_PAREN_NAMES(prog))
4299        (void)hv_iterinit(RXp_PAREN_NAMES(prog));
4300
4301    /* make sure $`, $&, $', and $digit will work later */
4302    if ( !(flags & REXEC_NOT_FIRST) )
4303        S_reg_set_capture_string(aTHX_ rx,
4304                                    strbeg, reginfo->strend,
4305                                    sv, flags, utf8_target);
4306
4307    return 1;
4308
4309  phooey:
4310    DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch failed%s\n",
4311                          PL_colors[4], PL_colors[5]));
4312
4313    if (swap) {
4314        /* we failed :-( roll it back.
4315         * Since the swap buffer will be freed on scope exit which follows
4316         * shortly, restore the old captures by copying 'swap's original
4317         * data to the new offs buffer
4318         */
4319        DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
4320            "rex=0x%" UVxf " rolling back offs: 0x%" UVxf " will be freed; restoring data to =0x%" UVxf "\n",
4321            0,
4322            PTR2UV(prog),
4323            PTR2UV(RXp_OFFSp(prog)),
4324            PTR2UV(swap)
4325        ));
4326
4327        Copy(swap, RXp_OFFSp(prog), prog->nparens + 1, regexp_paren_pair);
4328    }
4329
4330    /* clean up; this will trigger destructors that will free all slabs
4331     * above the current one, and cleanup the regmatch_info_aux
4332     * and regmatch_info_aux_eval sructs */
4333
4334    LEAVE_SCOPE(oldsave);
4335
4336    return 0;
4337}
4338
4339
4340/* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
4341 * Do inc before dec, in case old and new rex are the same */
4342#define SET_reg_curpm(Re2)                          \
4343    if (reginfo->info_aux_eval) {                   \
4344        (void)ReREFCNT_inc(Re2);		    \
4345        ReREFCNT_dec(PM_GETRE(PL_reg_curpm));	    \
4346        PM_SETRE((PL_reg_curpm), (Re2));	    \
4347    }
4348
4349
4350/*
4351 - regtry - try match at specific point
4352 */
4353STATIC bool			/* 0 failure, 1 success */
4354S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
4355{
4356    CHECKPOINT lastcp;
4357    REGEXP *const rx = reginfo->prog;
4358    regexp *const prog = ReANY(rx);
4359    SSize_t result;
4360#ifdef DEBUGGING
4361    U32 depth = 0; /* used by REGCP_SET */
4362#endif
4363    RXi_GET_DECL(prog,progi);
4364    DECLARE_AND_GET_RE_DEBUG_FLAGS;
4365
4366    PERL_ARGS_ASSERT_REGTRY;
4367
4368    reginfo->cutpoint=NULL;
4369
4370    RXp_OFFSp(prog)[0].start = *startposp - reginfo->strbeg;
4371    RXp_LASTPAREN(prog) = 0;
4372    RXp_LASTCLOSEPAREN(prog) = 0;
4373
4374    /* XXXX What this code is doing here?!!!  There should be no need
4375       to do this again and again, RXp_LASTPAREN(prog) should take care of
4376       this!  --ilya*/
4377
4378    /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
4379     * Actually, the code in regcppop() (which Ilya may be meaning by
4380     * RXp_LASTPAREN(prog)), is not needed at all by the test suite
4381     * (op/regexp, op/pat, op/split), but that code is needed otherwise
4382     * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
4383     * Meanwhile, this code *is* needed for the
4384     * above-mentioned test suite tests to succeed.  The common theme
4385     * on those tests seems to be returning null fields from matches.
4386     * --jhi updated by dapm */
4387
4388    /* After encountering a variant of the issue mentioned above I think
4389     * the point Ilya was making is that if we properly unwind whenever
4390     * we set lastparen to a smaller value then we should not need to do
4391     * this every time, only when needed. So if we have tests that fail if
4392     * we remove this, then it suggests somewhere else we are improperly
4393     * unwinding the lastparen/paren buffers. See UNWIND_PARENS() and
4394     * places it is called, and related regcp() routines. - Yves */
4395#if 1
4396    if (prog->nparens) {
4397        regexp_paren_pair *pp = RXp_OFFSp(prog);
4398        I32 i;
4399        for (i = prog->nparens; i > (I32)RXp_LASTPAREN(prog); i--) {
4400            ++pp;
4401            pp->start = -1;
4402            pp->end = -1;
4403        }
4404    }
4405#endif
4406    REGCP_SET(lastcp);
4407    result = regmatch(reginfo, *startposp, progi->program + 1);
4408    if (result != -1) {
4409        RXp_OFFSp(prog)[0].end = result;
4410        return 1;
4411    }
4412    if (reginfo->cutpoint)
4413        *startposp= reginfo->cutpoint;
4414    REGCP_UNWIND(lastcp);
4415    return 0;
4416}
4417
4418/* this is used to determine how far from the left messages like
4419   'failed...' are printed in regexec.c. It should be set such that
4420   messages are inline with the regop output that created them.
4421*/
4422#define REPORT_CODE_OFF 29
4423#define INDENT_CHARS(depth) ((int)(depth) % 20)
4424#ifdef DEBUGGING
4425int
4426Perl_re_exec_indentf(pTHX_ const char *fmt, U32 depth, ...)
4427{
4428    va_list ap;
4429    int result;
4430    PerlIO *f= Perl_debug_log;
4431    PERL_ARGS_ASSERT_RE_EXEC_INDENTF;
4432    va_start(ap, depth);
4433    PerlIO_printf(f, "%*s|%4" UVuf "| %*s", REPORT_CODE_OFF, "", (UV)depth, INDENT_CHARS(depth), "" );
4434    result = PerlIO_vprintf(f, fmt, ap);
4435    va_end(ap);
4436    return result;
4437}
4438#endif /* DEBUGGING */
4439
4440/* grab a new slab and return the first slot in it */
4441
4442STATIC regmatch_state *
4443S_push_slab(pTHX)
4444{
4445    regmatch_slab *s = PL_regmatch_slab->next;
4446    if (!s) {
4447        Newx(s, 1, regmatch_slab);
4448        s->prev = PL_regmatch_slab;
4449        s->next = NULL;
4450        PL_regmatch_slab->next = s;
4451    }
4452    PL_regmatch_slab = s;
4453    return SLAB_FIRST(s);
4454}
4455
4456#ifdef DEBUGGING
4457
4458STATIC void
4459S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
4460    const char *start, const char *end, const char *blurb)
4461{
4462    const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
4463
4464    PERL_ARGS_ASSERT_DEBUG_START_MATCH;
4465
4466    if (!PL_colorset)
4467            reginitcolors();
4468    {
4469        RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
4470            RX_PRECOMP_const(prog), RX_PRELEN(prog), PL_dump_re_max_len);
4471
4472        RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
4473            start, end - start, PL_dump_re_max_len);
4474
4475        Perl_re_printf( aTHX_
4476            "%s%s REx%s %s against %s\n",
4477                       PL_colors[4], blurb, PL_colors[5], s0, s1);
4478
4479        if (utf8_target||utf8_pat)
4480            Perl_re_printf( aTHX_  "UTF-8 %s%s%s...\n",
4481                utf8_pat ? "pattern" : "",
4482                utf8_pat && utf8_target ? " and " : "",
4483                utf8_target ? "string" : ""
4484            );
4485    }
4486}
4487
4488STATIC void
4489S_dump_exec_pos(pTHX_ const char *locinput,
4490                      const regnode *scan,
4491                      const char *loc_regeol,
4492                      const char *loc_bostr,
4493                      const char *loc_reg_starttry,
4494                      const bool utf8_target,
4495                      const U32 depth
4496                )
4497{
4498    const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
4499    const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
4500    int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
4501    /* The part of the string before starttry has one color
4502       (pref0_len chars), between starttry and current
4503       position another one (pref_len - pref0_len chars),
4504       after the current position the third one.
4505       We assume that pref0_len <= pref_len, otherwise we
4506       decrease pref0_len.  */
4507    int pref_len = (locinput - loc_bostr) > (5 + taill) - l
4508        ? (5 + taill) - l : locinput - loc_bostr;
4509    int pref0_len;
4510
4511    PERL_ARGS_ASSERT_DUMP_EXEC_POS;
4512
4513    if (utf8_target) {
4514        while (UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len))) {
4515            pref_len++;
4516        }
4517    }
4518    pref0_len = pref_len  - (locinput - loc_reg_starttry);
4519    if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
4520        l = ( loc_regeol - locinput > (5 + taill) - pref_len
4521              ? (5 + taill) - pref_len : loc_regeol - locinput);
4522    if (utf8_target) {
4523        while (UTF8_IS_CONTINUATION(*(U8*)(locinput + l))) {
4524            l--;
4525        }
4526    }
4527    if (pref0_len < 0)
4528        pref0_len = 0;
4529    if (pref0_len > pref_len)
4530        pref0_len = pref_len;
4531    {
4532        const int is_uni = utf8_target ? 1 : 0;
4533
4534        RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
4535            (locinput - pref_len),pref0_len, PL_dump_re_max_len, 4, 5);
4536
4537        RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
4538                    (locinput - pref_len + pref0_len),
4539                    pref_len - pref0_len, PL_dump_re_max_len, 2, 3);
4540
4541        RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
4542                    locinput, loc_regeol - locinput, 10, 0, 1);
4543
4544        const STRLEN tlen=len0+len1+len2;
4545        Perl_re_printf( aTHX_
4546                    "%4" IVdf " <%.*s%.*s%s%.*s>%*s|%4" UVuf "| ",
4547                    (IV)(locinput - loc_bostr),
4548                    len0, s0,
4549                    len1, s1,
4550                    (docolor ? "" : "> <"),
4551                    len2, s2,
4552                    (int)(tlen > 19 ? 0 :  19 - tlen),
4553                    "",
4554                    (UV)depth);
4555    }
4556}
4557
4558#endif
4559
4560/* reg_check_named_buff_matched()
4561 * Checks to see if a named buffer has matched. The data array of
4562 * buffer numbers corresponding to the buffer is expected to reside
4563 * in the regexp->data->data array in the slot stored in the ARG1u() of
4564 * node involved. Note that this routine doesn't actually care about the
4565 * name, that information is not preserved from compilation to execution.
4566 * Returns the index of the leftmost defined buffer with the given name
4567 * or 0 if non of the buffers matched.
4568 */
4569STATIC I32
4570S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan)
4571{
4572    I32 n;
4573    RXi_GET_DECL(rex,rexi);
4574    SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG1u( scan ) ]);
4575    I32 *nums=(I32*)SvPVX(sv_dat);
4576
4577    PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
4578
4579    for ( n=0; n<SvIVX(sv_dat); n++ ) {
4580        if ((I32)RXp_LASTPAREN(rex) >= nums[n] &&
4581            RXp_OFFS_END(rex,nums[n]) != -1)
4582        {
4583            return nums[n];
4584        }
4585    }
4586    return 0;
4587}
4588
4589static bool
4590S_setup_EXACTISH_ST(pTHX_ const regnode * const text_node,
4591                          struct next_matchable_info * m,
4592                          regmatch_info *reginfo)
4593{
4594    /* This function determines various characteristics about every possible
4595     * initial match of the passed-in EXACTish <text_node>, and stores them in
4596     * <*m>.
4597     *
4598     * That includes a match string and a parallel mask, such that if you AND
4599     * the target string with the mask and compare with the match string,
4600     * you'll have a pretty good idea, perhaps even perfect, if that portion of
4601     * the target matches or not.
4602     *
4603     * The motivation behind this function is to allow the caller to set up
4604     * tight loops for matching.  Consider patterns like '.*B' or '.*?B' where
4605     * B is an arbitrary EXACTish node.  To find the end of .*, we look for the
4606     * beginning oF B, which is the passed in <text_node>  That's where this
4607     * function comes in.  The values it returns can quickly be used to rule
4608     * out many, or all, cases of possible matches not actually being the
4609     * beginning of B, <text_node>.  It is also used in regrepeat() where we
4610     * have 'A*', for arbitrary 'A'.  This sets up criteria to more efficiently
4611     * determine where the span of 'A's stop.
4612     *
4613     * If <text_node> is of type EXACT, there is only one possible character
4614     * that can match its first character, and so the situation is quite
4615     * simple.  But things can get much more complicated if folding is
4616     * involved.  It may be that the first character of an EXACTFish node
4617     * doesn't participate in any possible fold, e.g., punctuation, so it can
4618     * be matched only by itself.  The vast majority of characters that are in
4619     * folds match just two things, their lower and upper-case equivalents.
4620     * But not all are like that; some have multiple possible matches, or match
4621     * sequences of more than one character.  This function sorts all that out.
4622     *
4623     * It returns information about all possibilities of what the first
4624     * character(s) of <text_node> could look like.  Again, if <text_node> is a
4625     * plain EXACT node, that's just the actual first bytes of the first
4626     * character; but otherwise it is the bytes, that when masked, match all
4627     * possible combinations of all the initial bytes of all the characters
4628     * that could match, folded.  (Actually, this is a slight over promise.  It
4629     * handles only up to the initial 5 bytes, which is enough for all Unicode
4630     * characters, but not for all non-Unicode ones.)
4631     *
4632     * Here's an example to clarify.  Suppose the first character of
4633     * <text_node> is the letter 'C', and we are under /i matching.  That means
4634     * 'c' also matches.  The representations of these two characters differ in
4635     * just one bit, so the mask would be a zero in that position and ones in
4636     * the other 7.  And the returned string would be the AND of these two
4637     * characters, and would be one byte long, since these characters are each
4638     * a single byte.  ANDing the target <text_node> with this mask will yield
4639     * the returned string if and only if <text_node> begins with one of these
4640     * two characters.  So, the function would also return that the definitive
4641     * length matched is 1 byte.
4642     *
4643     * Now, suppose instead of the letter 'C',  <text_node> begins with the
4644     * letter 'F'.  The situation is much more complicated because there are
4645     * various ligatures such as LATIN SMALL LIGATURE FF, whose fold also
4646     * begins with 'f', and hence could match.  We add these into the returned
4647     * string and mask, but the result isn't definitive; the caller has to
4648     * check further if its AND and compare pass.  But the failure of that
4649     * compare will quickly rule out most possible inputs.
4650     *
4651     * Much of this could be done in regcomp.c at compile time, except for
4652     * locale-dependent, and UTF-8 target dependent data.  Extra data fields
4653     * could be used for one or the other eventualities.
4654     *
4655     * If this function determines that no possible character in the target
4656     * string can match, it returns FALSE; otherwise TRUE.  (The FALSE
4657     * situation occurs if the first character in <text_node> requires UTF-8 to
4658     * represent, and the target string isn't in UTF-8.)
4659     *
4660     * Some analysis is in GH #18414, located at the time of this writing at:
4661     * https://github.com/Perl/perl5/issues/18414
4662     */
4663
4664    const bool utf8_target = reginfo->is_utf8_target;
4665    bool utf8_pat = reginfo->is_utf8_pat;
4666
4667    PERL_UINT_FAST8_T i;
4668
4669    /* Here and below, '15' is the value of UTF8_MAXBYTES_CASE, which requires at least :e
4670     */
4671    U8 matches[MAX_MATCHES][UTF8_MAXBYTES_CASE + 1] = { { 0 } };
4672    U8 lengths[MAX_MATCHES] = { 0 };
4673
4674    U8 index_of_longest = 0;
4675
4676    U8 *pat = (U8*)STRING(text_node);
4677    Size_t pat_len = STR_LEN(text_node);
4678    U8 op = OP(text_node);
4679
4680    U8 byte_mask[5]  = {0};
4681    U8 byte_anded[5] = {0};
4682
4683    /* There are some folds in Unicode to multiple characters.  This will hold
4684     * such characters that could fold to the beginning of 'text_node' */
4685    UV multi_fold_from = 0;
4686
4687    /* We may have to create a modified copy of the pattern */
4688    U8 mod_pat[UTF8_MAXBYTES_CASE + 1] = { '\0' };
4689
4690    m->max_length = 0;
4691    m->min_length = 255;
4692    m->count = 0;
4693
4694    /* Even if the first character in the node can match something in Latin1,
4695     * if there is anything in the node that can't, the match must fail */
4696    if (! utf8_target && isEXACT_REQ8(op)) {
4697        return FALSE;
4698    }
4699
4700/* Define a temporary op for use in this function, using an existing one that
4701 * should never be a real op during execution */
4702#define TURKISH  PSEUDO
4703
4704    /* What to do about these two nodes had to be deferred to runtime (which is
4705     * now).  If the extra information we now have so indicates, turn them into
4706     * EXACTFU nodes */
4707    if (   (op == EXACTF && utf8_target)
4708        || (op == EXACTFL && IN_UTF8_CTYPE_LOCALE))
4709    {
4710        if (op == EXACTFL && IN_UTF8_TURKIC_LOCALE) {
4711            op = TURKISH;
4712        }
4713        else {
4714            op = EXACTFU;
4715        }
4716
4717        /* And certain situations are better handled if we create a modified
4718         * version of the pattern */
4719        if (utf8_pat) { /* Here, must have been EXACTFL, so look at the
4720                           specific problematic characters */
4721            if (is_PROBLEMATIC_LOCALE_FOLD_utf8(pat)) {
4722
4723                /* The node could start with characters that are the first ones
4724                 * of a multi-character fold. */
4725                multi_fold_from
4726                          = what_MULTI_CHAR_FOLD_utf8_safe(pat, pat + pat_len);
4727                if (multi_fold_from) {
4728
4729                    /* Here, they do form a sequence that matches the fold of a
4730                     * single character.  That single character then is a
4731                     * possible match.  Below we will look again at this, but
4732                     * the code below is expecting every character in the
4733                     * pattern to be folded, which the input isn't required to
4734                     * be in this case.  So, just fold the single character,
4735                     * and the result will be in the expected form. */
4736                    _to_uni_fold_flags(multi_fold_from, mod_pat, &pat_len,
4737                                       FOLD_FLAGS_FULL);
4738                    pat = mod_pat;
4739                }
4740                         /* Turkish has a couple extra possibilities. */
4741                else if (   UNLIKELY(op == TURKISH)
4742                         &&  pat_len >= 3
4743                         &&  isALPHA_FOLD_EQ(pat[0], 'f')
4744                         && (   memBEGINs(pat + 1, pat_len - 1,
4745                                    LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8)
4746                             || (   pat_len >= 4
4747                                 && isALPHA_FOLD_EQ(pat[1], 'f')
4748                                 && memBEGINs(pat + 2, pat_len - 2,
4749                                    LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8)
4750                ))) {
4751                    /* The macros for finding a multi-char fold don't include
4752                     * the Turkish possibilities, in which U+130 folds to 'i'.
4753                     * Hard-code these.  It's very unlikely that Unicode will
4754                     * ever add any others.  */
4755                    if (pat[1] == 'f') {
4756                        pat_len = 3;
4757                        Copy("ffi", mod_pat, pat_len, U8);
4758                    }
4759                    else {
4760                        pat_len = 2;
4761                        Copy("fi", mod_pat, pat_len, U8);
4762                    }
4763                    pat = mod_pat;
4764                }
4765                else if (    UTF8_IS_DOWNGRADEABLE_START(*pat)
4766                         &&  LIKELY(memNEs(pat, pat_len, MICRO_SIGN_UTF8))
4767                         &&  LIKELY(memNEs(pat, pat_len,
4768                                           LATIN_SMALL_LETTER_SHARP_S_UTF8))
4769                         && (LIKELY(op != TURKISH || *pat != 'I')))
4770                {
4771                    /* For all cases of things between 0-255, except the ones
4772                     * in the conditional above, the fold is just the lower
4773                     * case, which is faster than the more general case. */
4774                    mod_pat[0] = toLOWER_L1(EIGHT_BIT_UTF8_TO_NATIVE(pat[0],
4775                                                                     pat[1]));
4776                    pat_len = 1;
4777                    pat = mod_pat;
4778                    utf8_pat = FALSE;
4779                }
4780                else {  /* Code point above 255, or needs special handling */
4781                    _to_utf8_fold_flags(pat, pat + pat_len,
4782                                        mod_pat, &pat_len,
4783                                        FOLD_FLAGS_FULL|FOLD_FLAGS_LOCALE);
4784                    pat = mod_pat;
4785                }
4786            }
4787        }
4788        else if /* Below is not a UTF-8 pattern; there's a somewhat different
4789                   set of problematic characters */
4790                ((multi_fold_from
4791                          = what_MULTI_CHAR_FOLD_latin1_safe(pat, pat + pat_len)))
4792        {
4793            /* We may have to canonicalize a multi-char fold, as in the UTF-8
4794             * case */
4795            _to_uni_fold_flags(multi_fold_from, mod_pat, &pat_len,
4796                               FOLD_FLAGS_FULL);
4797            pat = mod_pat;
4798        }
4799        else if (UNLIKELY(*pat == LATIN_SMALL_LETTER_SHARP_S)) {
4800            mod_pat[0] = mod_pat[1] = 's';
4801            pat_len = 2;
4802            utf8_pat = utf8_target; /* UTF-8ness immaterial for invariant
4803                                       chars, and speeds copying */
4804            pat = mod_pat;
4805        }
4806        else if (LIKELY(op != TURKISH || *pat != 'I')) {
4807            mod_pat[0] = toLOWER_L1(*pat);
4808            pat_len = 1;
4809            pat = mod_pat;
4810        }
4811    }
4812    else if /* Below isn't a node that we convert to UTF-8 */
4813            (     utf8_target
4814             && ! utf8_pat
4815             &&   op == EXACTFAA_NO_TRIE
4816             &&  *pat == LATIN_SMALL_LETTER_SHARP_S)
4817    {
4818        /* A very special case.  Folding U+DF goes to U+17F under /iaa.  We
4819         * did this at compile time when the pattern was UTF-8 , but otherwise
4820         * we couldn't do it earlier, because it requires a UTF-8 target for
4821         * this match to be legal. */
4822        pat_len = 2 * (sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 1);
4823        Copy(LATIN_SMALL_LETTER_LONG_S_UTF8
4824             LATIN_SMALL_LETTER_LONG_S_UTF8, mod_pat, pat_len, U8);
4825        pat = mod_pat;
4826        utf8_pat = TRUE;
4827    }
4828
4829    /* Here, we have taken care of the initial work for a few very problematic
4830     * situations, possibly creating a modified pattern.
4831     *
4832     * Now ready for the general case.  We build up all the possible things
4833     * that could match the first character of the pattern into the elements of
4834     * 'matches[]'
4835     *
4836     * Everything generally matches at least itself.  But if there is a
4837     * UTF8ness mismatch, we have to convert to that of the target string. */
4838    if (UTF8_IS_INVARIANT(*pat)) {  /* Immaterial if either is in UTF-8 */
4839        matches[0][0] = pat[0];
4840        lengths[0] = 1;
4841        m->count++;
4842    }
4843    else if (utf8_target) {
4844        if (utf8_pat) {
4845            lengths[0] = UTF8SKIP(pat);
4846            Copy(pat, matches[0], lengths[0], U8);
4847            m->count++;
4848        }
4849        else {  /* target is UTF-8, pattern isn't */
4850            matches[0][0] = UTF8_EIGHT_BIT_HI(pat[0]);
4851            matches[0][1] = UTF8_EIGHT_BIT_LO(pat[0]);
4852            lengths[0] = 2;
4853            m->count++;
4854        }
4855    }
4856    else if (! utf8_pat) {  /* Neither is UTF-8 */
4857        matches[0][0] = pat[0];
4858        lengths[0] = 1;
4859        m->count++;
4860    }
4861    else     /* target isn't UTF-8; pattern is.  No match possible unless the
4862                pattern's first character can fit in a byte */
4863         if (UTF8_IS_DOWNGRADEABLE_START(*pat))
4864    {
4865        matches[0][0] = EIGHT_BIT_UTF8_TO_NATIVE(pat[0], pat[1]);
4866        lengths[0] = 1;
4867        m->count++;
4868    }
4869
4870    /* Here we have taken care of any necessary node-type changes */
4871
4872    if (m->count) {
4873        m->max_length = lengths[0];
4874        m->min_length = lengths[0];
4875    }
4876
4877    /* For non-folding nodes, there are no other possible candidate matches,
4878     * but for foldable ones, we have to look further. */
4879    if (UNLIKELY(op == TURKISH) || isEXACTFish(op)) { /* A folding node */
4880        UV folded;  /* The first character in the pattern, folded */
4881        U32 first_fold_from;    /* A character that folds to it */
4882        const U32 * remaining_fold_froms;   /* The remaining characters that
4883                                               fold to it, if any */
4884        Size_t folds_to_count;  /* The total number of characters that fold to
4885                                   'folded' */
4886
4887        /* If the node begins with a sequence of more than one character that
4888         * together form the fold of a single character, it is called a
4889         * 'multi-character fold', and the normal functions don't handle this
4890         * case.  We set 'multi_fold_from' to the single folded-from character,
4891         * which is handled in an extra iteration below */
4892        if (utf8_pat) {
4893            folded = valid_utf8_to_uvchr(pat, NULL);
4894            multi_fold_from
4895                          = what_MULTI_CHAR_FOLD_utf8_safe(pat, pat + pat_len);
4896        }
4897        else {
4898            folded = *pat;
4899
4900            /* This may generate illegal combinations for things like EXACTF,
4901             * but rather than repeat the logic and exclude them here, all such
4902             * illegalities are checked for and skipped below in the loop */
4903            multi_fold_from
4904                        = what_MULTI_CHAR_FOLD_latin1_safe(pat, pat + pat_len);
4905        }
4906
4907        /* Everything matches at least itself; initialize to that because the
4908         * only the branches below that set it are the ones where the number
4909         * isn't 1. */
4910        folds_to_count = 1;
4911
4912        /* There are a few special cases for locale-dependent nodes, where the
4913         * run-time context was needed before we could know what matched */
4914        if (UNLIKELY(op == EXACTFL) && folded < 256)  {
4915            first_fold_from = PL_fold_locale[folded];
4916        }
4917        else if (   op == EXACTFL && utf8_target && utf8_pat
4918                 && memBEGINs(pat, pat_len, LATIN_SMALL_LETTER_LONG_S_UTF8
4919                                            LATIN_SMALL_LETTER_LONG_S_UTF8))
4920        {
4921            first_fold_from = LATIN_CAPITAL_LETTER_SHARP_S;
4922        }
4923        else if (UNLIKELY(    op == TURKISH
4924                          && (   isALPHA_FOLD_EQ(folded, 'i')
4925                              || inRANGE(folded,
4926                                         LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE,
4927                                         LATIN_SMALL_LETTER_DOTLESS_I))))
4928        {   /* Turkish folding requires special handling */
4929            if (folded == 'i')
4930                first_fold_from = LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE;
4931            else if (folded == 'I')
4932                first_fold_from = LATIN_SMALL_LETTER_DOTLESS_I;
4933            else if (folded == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)
4934                first_fold_from = 'i';
4935            else first_fold_from = 'I';
4936        }
4937        else {
4938            /* Here, isn't a special case: use the generic function to
4939             * calculate what folds to this */
4940          redo_multi:
4941            /* Look up what code points (besides itself) fold to 'folded';
4942             * e.g., [ 'K', KELVIN_SIGN ] both fold to 'k'. */
4943            folds_to_count = _inverse_folds(folded, &first_fold_from,
4944                                                       &remaining_fold_froms);
4945        }
4946
4947        /* Add each character that folds to 'folded' to the list of them,
4948         * subject to limitations based on the node type and target UTF8ness.
4949         * If there was a character that folded to multiple characters, do an
4950         * extra iteration for it.  (Note the extra iteration if there is a
4951         * multi-character fold) */
4952        for (i = 0; i < folds_to_count
4953                      + UNLIKELY(multi_fold_from != 0); i++)
4954        {
4955            UV fold_from = 0;
4956
4957            if (i >= folds_to_count) {  /* Final iteration: handle the
4958                                           multi-char */
4959                fold_from = multi_fold_from;
4960            }
4961            else if (i == 0) {
4962                fold_from = first_fold_from;
4963            }
4964            else if (i < folds_to_count) {
4965                fold_from = remaining_fold_froms[i-1];
4966            }
4967
4968            if (folded == fold_from) {  /* We already added the character
4969                                           itself */
4970                continue;
4971            }
4972
4973            /* EXACTF doesn't have any non-ascii folds */
4974            if (op == EXACTF && (! isASCII(folded) || ! isASCII(fold_from))) {
4975                continue;
4976            }
4977
4978            /* In /iaa nodes, neither or both must be ASCII to be a legal fold
4979             * */
4980            if (    isASCII(folded) != isASCII(fold_from)
4981                &&  inRANGE(op, EXACTFAA, EXACTFAA_NO_TRIE))
4982
4983            {
4984                continue;
4985            }
4986
4987            /* In /il nodes, can't cross 255/256 boundary (unless in a UTF-8
4988             * locale, but those have been converted to EXACTFU above) */
4989            if (   op == EXACTFL
4990                && (folded < 256) != (fold_from < 256))
4991            {
4992                continue;
4993            }
4994
4995            /* If this triggers, it likely is because of the unlikely case
4996             * where a new Unicode standard has changed what MAX_MATCHES should
4997             * be set to */
4998            assert(m->count < MAX_MATCHES);
4999
5000            /* Add this character to the list of possible matches */
5001            if (utf8_target) {
5002                uvchr_to_utf8(matches[(U8) m->count], fold_from);
5003                lengths[m->count] = UVCHR_SKIP(fold_from);
5004                m->count++;
5005            }
5006            else { /* Non-UTF8 target: no code point above 255 can appear in it
5007                    */
5008                if (fold_from > 255) {
5009                    continue;
5010                }
5011
5012                matches[m->count][0] = fold_from;
5013                lengths[m->count] = 1;
5014                m->count++;
5015            }
5016
5017            /* Update min and mlengths */
5018            if (m->min_length > lengths[m->count-1]) {
5019                m->min_length = lengths[m->count-1];
5020            }
5021
5022            if (m->max_length < lengths[m->count-1]) {
5023                index_of_longest = m->count - 1;
5024                m->max_length = lengths[index_of_longest];
5025            }
5026        } /* looped through each potential fold */
5027
5028        /* If there is something that folded to an initial multi-character
5029         * fold, repeat, using it.  This catches some edge cases.  An example
5030         * of one is /ss/i when UTF-8 encoded.  The function
5031         * what_MULTI_CHAR_FOLD_utf8_safe('ss') gets called and returns U+DF
5032         * (LATIN SMALL SHARP S).  If it returned a list of characters, this
5033         * code wouldn't be needed.  But since it doesn't, we have to look what
5034         * folds to the U+DF.  In this case, U+1E9E does, and has to be added.
5035         * */
5036        if (multi_fold_from) {
5037            folded = multi_fold_from;
5038            multi_fold_from = 0;
5039            goto redo_multi;
5040        }
5041    } /* End of finding things that participate in this fold */
5042
5043    if (m->count == 0) {    /* If nothing found, can't match */
5044        m->min_length = 0;
5045        return FALSE;
5046    }
5047
5048    /* Have calculated all possible matches.  Now calculate the mask and AND
5049     * values */
5050    m->initial_exact = 0;
5051    m->initial_definitive = 0;
5052
5053    {
5054        unsigned int mask_ones = 0;
5055        unsigned int possible_ones = 0;
5056        U8 j;
5057
5058        /* For each byte that is in all possible matches ... */
5059        for (j = 0; j < MIN(m->min_length, 5); j++) {
5060
5061            /* Initialize the accumulator for this byte */
5062            byte_mask[j] = 0xFF;
5063            byte_anded[j] = matches[0][j];
5064
5065            /* Then the rest of the rows (folds).  The mask is based on, like,
5066             * ~('A' ^ 'a') is a 1 in all bits where these are the same, and 0
5067             * where they differ. */
5068            for (i = 1; i < (PERL_UINT_FAST8_T) m->count; i++) {
5069                byte_mask[j]  &= ~ (byte_anded[j] ^ matches[i][j]);
5070                byte_anded[j] &= matches[i][j];
5071            }
5072
5073            /* Keep track of the number of initial mask bytes that are all one
5074             * bits.  The code calling this can use this number to know that
5075             * a string that matches this number of bytes in the pattern is an
5076             * exact match of that pattern for this number of bytes.  But also
5077             * counted are the number of initial bytes that in total have a
5078             * single zero bit.  If a string matches those, masked, it must be
5079             * one of two possibilites, both of which this function has
5080             * determined are legal.  (But if that single 0 is one of the
5081             * initial bits for masking a UTF-8 start byte, that could
5082             * incorrectly lead to different length strings appearing to be
5083             * equivalent, so only do this optimization when the matchables are
5084             * all the same length.  This was uncovered by testing
5085             * /\x{029E}/i.) */
5086            if (m->min_length == m->max_length) {
5087                mask_ones += PL_bitcount[byte_mask[j]];
5088                possible_ones += 8;
5089                if (mask_ones + 1 >= possible_ones) {
5090                    m->initial_definitive++;
5091                    if (mask_ones >= possible_ones) {
5092                        m->initial_exact++;
5093                    }
5094                }
5095            }
5096        }
5097    }
5098
5099    /* The first byte is separate for speed */
5100    m->first_byte_mask = byte_mask[0];
5101    m->first_byte_anded = byte_anded[0];
5102
5103    /* Then pack up to the next 4 bytes into a word */
5104    m->mask32 = m->anded32 = 0;
5105    for (i = 1; i < MIN(m->min_length, 5); i++) {
5106        U8 which = i;
5107        U8 shift = (which - 1) * 8;
5108        m->mask32  |= (U32) byte_mask[i]  << shift;
5109        m->anded32 |= (U32) byte_anded[i] << shift;
5110    }
5111
5112    /* Finally, take the match strings and place them sequentially into a
5113     * one-dimensional array.  (This is done to save significant space in the
5114     * structure.) Sort so the longest (presumably the least likely) is last.
5115     * XXX When this gets moved to regcomp, may want to fully sort shortest
5116     * first, but above we generally used the folded code point first, and
5117     * those tend to be no longer than their upper case values, so this is
5118     * already pretty well sorted by size.
5119     *
5120     * If the asserts fail, it's most likely because a new version of the
5121     * Unicode standard requires more space; simply increase the declaration
5122     * size. */
5123    {
5124        U8 cur_pos = 0;
5125        U8 output_index = 0;
5126
5127        if (m->count > 1) { /* No need to sort a single entry */
5128            for (i = 0; i < (PERL_UINT_FAST8_T) m->count; i++) {
5129
5130                /* Keep the same order for all but the longest.  (If the
5131                 * asserts fail, it could be because m->matches is declared too
5132                 * short, either because of a new Unicode release, or an
5133                 * overlooked test case, or it could be a bug.) */
5134                if (i != index_of_longest) {
5135                    assert(cur_pos + lengths[i] <= C_ARRAY_LENGTH(m->matches));
5136                    Copy(matches[i], m->matches + cur_pos, lengths[i], U8);
5137                    cur_pos += lengths[i];
5138                    m->lengths[output_index++] = lengths[i];
5139                }
5140            }
5141        }
5142
5143        assert(cur_pos + lengths[index_of_longest] <= C_ARRAY_LENGTH(m->matches));
5144        Copy(matches[index_of_longest], m->matches + cur_pos,
5145             lengths[index_of_longest], U8);
5146
5147        /* Place the longest match last */
5148        m->lengths[output_index] = lengths[index_of_longest];
5149    }
5150
5151
5152    return TRUE;
5153}
5154
5155PERL_STATIC_FORCE_INLINE    /* We want speed at the expense of size */
5156bool
5157S_test_EXACTISH_ST(const char * loc,
5158                   struct next_matchable_info info)
5159{
5160    /* This function uses the data set up in setup_EXACTISH_ST() to see if the
5161     * bytes starting at 'loc' can match based on 'next_matchable_info' */
5162
5163    U32 input32 = 0;
5164
5165    /* Check the first byte */
5166    if (((U8) loc[0] & info.first_byte_mask) != info.first_byte_anded)
5167        return FALSE;
5168
5169    /* Pack the next up-to-4 bytes into a 32 bit word */
5170    switch (info.min_length) {
5171        default:
5172            input32 |= (U32) ((U8) loc[4]) << 3 * 8;
5173            /* FALLTHROUGH */
5174        case 4:
5175            input32 |= (U8) loc[3] << 2 * 8;
5176            /* FALLTHROUGH */
5177        case 3:
5178            input32 |= (U8) loc[2] << 1 * 8;
5179            /* FALLTHROUGH */
5180        case 2:
5181            input32 |= (U8) loc[1];
5182            break;
5183        case 1:
5184            return TRUE;    /* We already tested and passed the 0th byte */
5185        case 0:
5186            ASSUME(0);
5187    }
5188
5189    /* And AND that with the mask and compare that with the assembled ANDED
5190     * values */
5191    return (input32 & info.mask32) == info.anded32;
5192}
5193
5194STATIC bool
5195S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strbeg, const U8 * const curpos, const bool utf8_target)
5196{
5197    /* returns a boolean indicating if there is a Grapheme Cluster Boundary
5198     * between the inputs.  See https://www.unicode.org/reports/tr29/. */
5199
5200    PERL_ARGS_ASSERT_ISGCB;
5201
5202    switch (GCB_table[before][after]) {
5203        case GCB_BREAKABLE:
5204            return TRUE;
5205
5206        case GCB_NOBREAK:
5207            return FALSE;
5208
5209        case GCB_RI_then_RI:
5210            {
5211                int RI_count = 1;
5212                U8 * temp_pos = (U8 *) curpos;
5213
5214                /* Do not break within emoji flag sequences. That is, do not
5215                 * break between regional indicator (RI) symbols if there is an
5216                 * odd number of RI characters before the break point.
5217                 *  GB12   sot (RI RI)* RI �� RI
5218                 *  GB13 [^RI] (RI RI)* RI �� RI */
5219
5220                while (backup_one_GCB(strbeg,
5221                                    &temp_pos,
5222                                    utf8_target) == GCB_Regional_Indicator)
5223                {
5224                    RI_count++;
5225                }
5226
5227                return RI_count % 2 != 1;
5228            }
5229
5230        case GCB_EX_then_EM:
5231
5232            /* GB10  ( E_Base | E_Base_GAZ ) Extend* ��  E_Modifier */
5233            {
5234                U8 * temp_pos = (U8 *) curpos;
5235                GCB_enum prev;
5236
5237                do {
5238                    prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
5239                }
5240                while (prev == GCB_Extend);
5241
5242                return prev != GCB_E_Base && prev != GCB_E_Base_GAZ;
5243            }
5244
5245        case GCB_Maybe_Emoji_NonBreak:
5246
5247            {
5248
5249            /* Do not break within emoji modifier sequences or emoji zwj sequences.
5250              GB11 \p{Extended_Pictographic} Extend* ZWJ �� \p{Extended_Pictographic}
5251              */
5252                U8 * temp_pos = (U8 *) curpos;
5253                GCB_enum prev;
5254
5255                do {
5256                    prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
5257                }
5258                while (prev == GCB_Extend);
5259
5260                return prev != GCB_ExtPict_XX;
5261            }
5262
5263        default:
5264            break;
5265    }
5266
5267#ifdef DEBUGGING
5268    Perl_re_printf( aTHX_  "Unhandled GCB pair: GCB_table[%d, %d] = %d\n",
5269                                  before, after, GCB_table[before][after]);
5270    assert(0);
5271#endif
5272    return TRUE;
5273}
5274
5275STATIC GCB_enum
5276S_backup_one_GCB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5277{
5278    GCB_enum gcb;
5279
5280    PERL_ARGS_ASSERT_BACKUP_ONE_GCB;
5281
5282    if (*curpos < strbeg) {
5283        return GCB_EDGE;
5284    }
5285
5286    if (utf8_target) {
5287        U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5288        U8 * prev_prev_char_pos;
5289
5290        if (! prev_char_pos) {
5291            return GCB_EDGE;
5292        }
5293
5294        if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
5295            gcb = getGCB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5296            *curpos = prev_char_pos;
5297            prev_char_pos = prev_prev_char_pos;
5298        }
5299        else {
5300            *curpos = (U8 *) strbeg;
5301            return GCB_EDGE;
5302        }
5303    }
5304    else {
5305        if (*curpos - 2 < strbeg) {
5306            *curpos = (U8 *) strbeg;
5307            return GCB_EDGE;
5308        }
5309        (*curpos)--;
5310        gcb = getGCB_VAL_CP(*(*curpos - 1));
5311    }
5312
5313    return gcb;
5314}
5315
5316/* Combining marks attach to most classes that precede them, but this defines
5317 * the exceptions (from TR14) */
5318#define LB_CM_ATTACHES_TO(prev) ( ! (   prev == LB_EDGE                 \
5319                                     || prev == LB_Mandatory_Break      \
5320                                     || prev == LB_Carriage_Return      \
5321                                     || prev == LB_Line_Feed            \
5322                                     || prev == LB_Next_Line            \
5323                                     || prev == LB_Space                \
5324                                     || prev == LB_ZWSpace))
5325
5326STATIC bool
5327S_isLB(pTHX_ LB_enum before,
5328             LB_enum after,
5329             const U8 * const strbeg,
5330             const U8 * const curpos,
5331             const U8 * const strend,
5332             const bool utf8_target)
5333{
5334    U8 * temp_pos = (U8 *) curpos;
5335    LB_enum prev = before;
5336
5337    /* Is the boundary between 'before' and 'after' line-breakable?
5338     * Most of this is just a table lookup of a generated table from Unicode
5339     * rules.  But some rules require context to decide, and so have to be
5340     * implemented in code */
5341
5342    PERL_ARGS_ASSERT_ISLB;
5343
5344    /* Rule numbers in the comments below are as of Unicode 9.0 */
5345
5346  redo:
5347    before = prev;
5348    switch (LB_table[before][after]) {
5349        case LB_BREAKABLE:
5350            return TRUE;
5351
5352        case LB_NOBREAK:
5353        case LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
5354            return FALSE;
5355
5356        case LB_SP_foo + LB_BREAKABLE:
5357        case LB_SP_foo + LB_NOBREAK:
5358        case LB_SP_foo + LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
5359
5360            /* When we have something following a SP, we have to look at the
5361             * context in order to know what to do.
5362             *
5363             * SP SP should not reach here because LB7: Do not break before
5364             * spaces.  (For two spaces in a row there is nothing that
5365             * overrides that) */
5366            assert(after != LB_Space);
5367
5368            /* Here we have a space followed by a non-space.  Mostly this is a
5369             * case of LB18: "Break after spaces".  But there are complications
5370             * as the handling of spaces is somewhat tricky.  They are in a
5371             * number of rules, which have to be applied in priority order, but
5372             * something earlier in the string can cause a rule to be skipped
5373             * and a lower priority rule invoked.  A prime example is LB7 which
5374             * says don't break before a space.  But rule LB8 (lower priority)
5375             * says that the first break opportunity after a ZW is after any
5376             * span of spaces immediately after it.  If a ZW comes before a SP
5377             * in the input, rule LB8 applies, and not LB7.  Other such rules
5378             * involve combining marks which are rules 9 and 10, but they may
5379             * override higher priority rules if they come earlier in the
5380             * string.  Since we're doing random access into the middle of the
5381             * string, we have to look for rules that should get applied based
5382             * on both string position and priority.  Combining marks do not
5383             * attach to either ZW nor SP, so we don't have to consider them
5384             * until later.
5385             *
5386             * To check for LB8, we have to find the first non-space character
5387             * before this span of spaces */
5388            do {
5389                prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
5390            }
5391            while (prev == LB_Space);
5392
5393            /* LB8 Break before any character following a zero-width space,
5394             * even if one or more spaces intervene.
5395             *      ZW SP* ��
5396             * So if we have a ZW just before this span, and to get here this
5397             * is the final space in the span. */
5398            if (prev == LB_ZWSpace) {
5399                return TRUE;
5400            }
5401
5402            /* Here, not ZW SP+.  There are several rules that have higher
5403             * priority than LB18 and can be resolved now, as they don't depend
5404             * on anything earlier in the string (except ZW, which we have
5405             * already handled).  One of these rules is LB11 Do not break
5406             * before Word joiner, but we have specially encoded that in the
5407             * lookup table so it is caught by the single test below which
5408             * catches the other ones. */
5409            if (LB_table[LB_Space][after] - LB_SP_foo
5410                                            == LB_NOBREAK_EVEN_WITH_SP_BETWEEN)
5411            {
5412                return FALSE;
5413            }
5414
5415            /* If we get here, we have to XXX consider combining marks. */
5416            if (prev == LB_Combining_Mark) {
5417
5418                /* What happens with these depends on the character they
5419                 * follow.  */
5420                do {
5421                    prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
5422                }
5423                while (prev == LB_Combining_Mark);
5424
5425                /* Most times these attach to and inherit the characteristics
5426                 * of that character, but not always, and when not, they are to
5427                 * be treated as AL by rule LB10. */
5428                if (! LB_CM_ATTACHES_TO(prev)) {
5429                    prev = LB_Alphabetic;
5430                }
5431            }
5432
5433            /* Here, we have the character preceding the span of spaces all set
5434             * up.  We follow LB18: "Break after spaces" unless the table shows
5435             * that is overridden */
5436            return LB_table[prev][after] != LB_NOBREAK_EVEN_WITH_SP_BETWEEN;
5437
5438        case LB_CM_ZWJ_foo:
5439
5440            /* We don't know how to treat the CM except by looking at the first
5441             * non-CM character preceding it.  ZWJ is treated as CM */
5442            do {
5443                prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
5444            }
5445            while (prev == LB_Combining_Mark || prev == LB_ZWJ);
5446
5447            /* Here, 'prev' is that first earlier non-CM character.  If the CM
5448             * attaches to it, then it inherits the behavior of 'prev'.  If it
5449             * doesn't attach, it is to be treated as an AL */
5450            if (! LB_CM_ATTACHES_TO(prev)) {
5451                prev = LB_Alphabetic;
5452            }
5453
5454            goto redo;
5455
5456        case LB_HY_or_BA_then_foo + LB_BREAKABLE:
5457        case LB_HY_or_BA_then_foo + LB_NOBREAK:
5458
5459            /* LB21a Don't break after Hebrew + Hyphen.
5460             * HL (HY | BA) �� */
5461
5462            if (backup_one_LB(strbeg, &temp_pos, utf8_target)
5463                                                          == LB_Hebrew_Letter)
5464            {
5465                return FALSE;
5466            }
5467
5468            return LB_table[prev][after] - LB_HY_or_BA_then_foo == LB_BREAKABLE;
5469
5470        case LB_PR_or_PO_then_OP_or_HY + LB_BREAKABLE:
5471        case LB_PR_or_PO_then_OP_or_HY + LB_NOBREAK:
5472
5473            /* LB25a (PR | PO) �� ( OP | HY )? NU */
5474            if (advance_one_LB(&temp_pos, strend, utf8_target) == LB_Numeric) {
5475                return FALSE;
5476            }
5477
5478            return LB_table[prev][after] - LB_PR_or_PO_then_OP_or_HY
5479                                                                == LB_BREAKABLE;
5480
5481        case LB_SY_or_IS_then_various + LB_BREAKABLE:
5482        case LB_SY_or_IS_then_various + LB_NOBREAK:
5483        {
5484            /* LB25d NU (SY | IS)* �� (NU | SY | IS | CL | CP ) */
5485
5486            LB_enum temp = prev;
5487            do {
5488                temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
5489            }
5490            while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric);
5491            if (temp == LB_Numeric) {
5492                return FALSE;
5493            }
5494
5495            return LB_table[prev][after] - LB_SY_or_IS_then_various
5496                                                               == LB_BREAKABLE;
5497        }
5498
5499        case LB_various_then_PO_or_PR + LB_BREAKABLE:
5500        case LB_various_then_PO_or_PR + LB_NOBREAK:
5501        {
5502            /* LB25e NU (SY | IS)* (CL | CP)? �� (PO | PR) */
5503
5504            LB_enum temp = prev;
5505            if (temp == LB_Close_Punctuation || temp == LB_Close_Parenthesis)
5506            {
5507                temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
5508            }
5509            while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric) {
5510                temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
5511            }
5512            if (temp == LB_Numeric) {
5513                return FALSE;
5514            }
5515            return LB_various_then_PO_or_PR;
5516        }
5517
5518        case LB_RI_then_RI + LB_NOBREAK:
5519        case LB_RI_then_RI + LB_BREAKABLE:
5520            {
5521                int RI_count = 1;
5522
5523                /* LB30a Break between two regional indicator symbols if and
5524                 * only if there are an even number of regional indicators
5525                 * preceding the position of the break.
5526                 *
5527                 *    sot (RI RI)* RI �� RI
5528                 *  [^RI] (RI RI)* RI �� RI */
5529
5530                while (backup_one_LB(strbeg,
5531                                     &temp_pos,
5532                                     utf8_target) == LB_Regional_Indicator)
5533                {
5534                    RI_count++;
5535                }
5536
5537                return RI_count % 2 == 0;
5538            }
5539
5540        default:
5541            break;
5542    }
5543
5544#ifdef DEBUGGING
5545    Perl_re_printf( aTHX_  "Unhandled LB pair: LB_table[%d, %d] = %d\n",
5546                                  before, after, LB_table[before][after]);
5547    assert(0);
5548#endif
5549    return TRUE;
5550}
5551
5552STATIC LB_enum
5553S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
5554{
5555
5556    LB_enum lb;
5557
5558    PERL_ARGS_ASSERT_ADVANCE_ONE_LB;
5559
5560    if (*curpos >= strend) {
5561        return LB_EDGE;
5562    }
5563
5564    if (utf8_target) {
5565        *curpos += UTF8SKIP(*curpos);
5566        if (*curpos >= strend) {
5567            return LB_EDGE;
5568        }
5569        lb = getLB_VAL_UTF8(*curpos, strend);
5570    }
5571    else {
5572        (*curpos)++;
5573        if (*curpos >= strend) {
5574            return LB_EDGE;
5575        }
5576        lb = getLB_VAL_CP(**curpos);
5577    }
5578
5579    return lb;
5580}
5581
5582STATIC LB_enum
5583S_backup_one_LB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5584{
5585    LB_enum lb;
5586
5587    PERL_ARGS_ASSERT_BACKUP_ONE_LB;
5588
5589    if (*curpos < strbeg) {
5590        return LB_EDGE;
5591    }
5592
5593    if (utf8_target) {
5594        U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5595        U8 * prev_prev_char_pos;
5596
5597        if (! prev_char_pos) {
5598            return LB_EDGE;
5599        }
5600
5601        if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
5602            lb = getLB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5603            *curpos = prev_char_pos;
5604            prev_char_pos = prev_prev_char_pos;
5605        }
5606        else {
5607            *curpos = (U8 *) strbeg;
5608            return LB_EDGE;
5609        }
5610    }
5611    else {
5612        if (*curpos - 2 < strbeg) {
5613            *curpos = (U8 *) strbeg;
5614            return LB_EDGE;
5615        }
5616        (*curpos)--;
5617        lb = getLB_VAL_CP(*(*curpos - 1));
5618    }
5619
5620    return lb;
5621}
5622
5623STATIC bool
5624S_isSB(pTHX_ SB_enum before,
5625             SB_enum after,
5626             const U8 * const strbeg,
5627             const U8 * const curpos,
5628             const U8 * const strend,
5629             const bool utf8_target)
5630{
5631    /* returns a boolean indicating if there is a Sentence Boundary Break
5632     * between the inputs.  See https://www.unicode.org/reports/tr29/ */
5633
5634    U8 * lpos = (U8 *) curpos;
5635    bool has_para_sep = FALSE;
5636    bool has_sp = FALSE;
5637
5638    PERL_ARGS_ASSERT_ISSB;
5639
5640    /* Break at the start and end of text.
5641        SB1.  sot  ��
5642        SB2.  ��  eot
5643      But unstated in Unicode is don't break if the text is empty */
5644    if (before == SB_EDGE || after == SB_EDGE) {
5645        return before != after;
5646    }
5647
5648    /* SB 3: Do not break within CRLF. */
5649    if (before == SB_CR && after == SB_LF) {
5650        return FALSE;
5651    }
5652
5653    /* Break after paragraph separators.  CR and LF are considered
5654     * so because Unicode views text as like word processing text where there
5655     * are no newlines except between paragraphs, and the word processor takes
5656     * care of wrapping without there being hard line-breaks in the text *./
5657       SB4.  Sep | CR | LF  �� */
5658    if (before == SB_Sep || before == SB_CR || before == SB_LF) {
5659        return TRUE;
5660    }
5661
5662    /* Ignore Format and Extend characters, except after sot, Sep, CR, or LF.
5663     * (See Section 6.2, Replacing Ignore Rules.)
5664        SB5.  X (Extend | Format)*  ���  X */
5665    if (after == SB_Extend || after == SB_Format) {
5666
5667        /* Implied is that the these characters attach to everything
5668         * immediately prior to them except for those separator-type
5669         * characters.  And the rules earlier have already handled the case
5670         * when one of those immediately precedes the extend char */
5671        return FALSE;
5672    }
5673
5674    if (before == SB_Extend || before == SB_Format) {
5675        U8 * temp_pos = lpos;
5676        const SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
5677        if (   backup != SB_EDGE
5678            && backup != SB_Sep
5679            && backup != SB_CR
5680            && backup != SB_LF)
5681        {
5682            before = backup;
5683            lpos = temp_pos;
5684        }
5685
5686        /* Here, both 'before' and 'backup' are these types; implied is that we
5687         * don't break between them */
5688        if (backup == SB_Extend || backup == SB_Format) {
5689            return FALSE;
5690        }
5691    }
5692
5693    /* Do not break after ambiguous terminators like period, if they are
5694     * immediately followed by a number or lowercase letter, if they are
5695     * between uppercase letters, if the first following letter (optionally
5696     * after certain punctuation) is lowercase, or if they are followed by
5697     * "continuation" punctuation such as comma, colon, or semicolon. For
5698     * example, a period may be an abbreviation or numeric period, and thus may
5699     * not mark the end of a sentence.
5700
5701     * SB6. ATerm  ��  Numeric */
5702    if (before == SB_ATerm && after == SB_Numeric) {
5703        return FALSE;
5704    }
5705
5706    /* SB7.  (Upper | Lower) ATerm  ��  Upper */
5707    if (before == SB_ATerm && after == SB_Upper) {
5708        U8 * temp_pos = lpos;
5709        SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
5710        if (backup == SB_Upper || backup == SB_Lower) {
5711            return FALSE;
5712        }
5713    }
5714
5715    /* The remaining rules that aren't the final one, all require an STerm or
5716     * an ATerm after having backed up over some Close* Sp*, and in one case an
5717     * optional Paragraph separator, although one rule doesn't have any Sp's in it.
5718     * So do that backup now, setting flags if either Sp or a paragraph
5719     * separator are found */
5720
5721    if (before == SB_Sep || before == SB_CR || before == SB_LF) {
5722        has_para_sep = TRUE;
5723        before = backup_one_SB(strbeg, &lpos, utf8_target);
5724    }
5725
5726    if (before == SB_Sp) {
5727        has_sp = TRUE;
5728        do {
5729            before = backup_one_SB(strbeg, &lpos, utf8_target);
5730        }
5731        while (before == SB_Sp);
5732    }
5733
5734    while (before == SB_Close) {
5735        before = backup_one_SB(strbeg, &lpos, utf8_target);
5736    }
5737
5738    /* The next few rules apply only when the backed-up-to is an ATerm, and in
5739     * most cases an STerm */
5740    if (before == SB_STerm || before == SB_ATerm) {
5741
5742        /* So, here the lhs matches
5743         *      (STerm | ATerm) Close* Sp* (Sep | CR | LF)?
5744         * and we have set flags if we found an Sp, or the optional Sep,CR,LF.
5745         * The rules that apply here are:
5746         *
5747         * SB8    ATerm Close* Sp*  ��  ( ��(OLetter | Upper | Lower | Sep | CR
5748                                           | LF | STerm | ATerm) )* Lower
5749           SB8a  (STerm | ATerm) Close* Sp*  ��  (SContinue | STerm | ATerm)
5750           SB9   (STerm | ATerm) Close*  ��  (Close | Sp | Sep | CR | LF)
5751           SB10  (STerm | ATerm) Close* Sp*  ��  (Sp | Sep | CR | LF)
5752           SB11  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  ��
5753         */
5754
5755        /* And all but SB11 forbid having seen a paragraph separator */
5756        if (! has_para_sep) {
5757            if (before == SB_ATerm) {          /* SB8 */
5758                U8 * rpos = (U8 *) curpos;
5759                SB_enum later = after;
5760
5761                while (    later != SB_OLetter
5762                        && later != SB_Upper
5763                        && later != SB_Lower
5764                        && later != SB_Sep
5765                        && later != SB_CR
5766                        && later != SB_LF
5767                        && later != SB_STerm
5768                        && later != SB_ATerm
5769                        && later != SB_EDGE)
5770                {
5771                    later = advance_one_SB(&rpos, strend, utf8_target);
5772                }
5773                if (later == SB_Lower) {
5774                    return FALSE;
5775                }
5776            }
5777
5778            if (   after == SB_SContinue    /* SB8a */
5779                || after == SB_STerm
5780                || after == SB_ATerm)
5781            {
5782                return FALSE;
5783            }
5784
5785            if (! has_sp) {     /* SB9 applies only if there was no Sp* */
5786                if (   after == SB_Close
5787                    || after == SB_Sp
5788                    || after == SB_Sep
5789                    || after == SB_CR
5790                    || after == SB_LF)
5791                {
5792                    return FALSE;
5793                }
5794            }
5795
5796            /* SB10.  This and SB9 could probably be combined some way, but khw
5797             * has decided to follow the Unicode rule book precisely for
5798             * simplified maintenance */
5799            if (   after == SB_Sp
5800                || after == SB_Sep
5801                || after == SB_CR
5802                || after == SB_LF)
5803            {
5804                return FALSE;
5805            }
5806        }
5807
5808        /* SB11.  */
5809        return TRUE;
5810    }
5811
5812    /* Otherwise, do not break.
5813    SB12.  Any  ��  Any */
5814
5815    return FALSE;
5816}
5817
5818STATIC SB_enum
5819S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
5820{
5821    SB_enum sb;
5822
5823    PERL_ARGS_ASSERT_ADVANCE_ONE_SB;
5824
5825    if (*curpos >= strend) {
5826        return SB_EDGE;
5827    }
5828
5829    if (utf8_target) {
5830        do {
5831            *curpos += UTF8SKIP(*curpos);
5832            if (*curpos >= strend) {
5833                return SB_EDGE;
5834            }
5835            sb = getSB_VAL_UTF8(*curpos, strend);
5836        } while (sb == SB_Extend || sb == SB_Format);
5837    }
5838    else {
5839        do {
5840            (*curpos)++;
5841            if (*curpos >= strend) {
5842                return SB_EDGE;
5843            }
5844            sb = getSB_VAL_CP(**curpos);
5845        } while (sb == SB_Extend || sb == SB_Format);
5846    }
5847
5848    return sb;
5849}
5850
5851STATIC SB_enum
5852S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5853{
5854    SB_enum sb;
5855
5856    PERL_ARGS_ASSERT_BACKUP_ONE_SB;
5857
5858    if (*curpos < strbeg) {
5859        return SB_EDGE;
5860    }
5861
5862    if (utf8_target) {
5863        U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5864        if (! prev_char_pos) {
5865            return SB_EDGE;
5866        }
5867
5868        /* Back up over Extend and Format.  curpos is always just to the right
5869         * of the character whose value we are getting */
5870        do {
5871            U8 * prev_prev_char_pos;
5872            if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1,
5873                                                                      strbeg)))
5874            {
5875                sb = getSB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5876                *curpos = prev_char_pos;
5877                prev_char_pos = prev_prev_char_pos;
5878            }
5879            else {
5880                *curpos = (U8 *) strbeg;
5881                return SB_EDGE;
5882            }
5883        } while (sb == SB_Extend || sb == SB_Format);
5884    }
5885    else {
5886        do {
5887            if (*curpos - 2 < strbeg) {
5888                *curpos = (U8 *) strbeg;
5889                return SB_EDGE;
5890            }
5891            (*curpos)--;
5892            sb = getSB_VAL_CP(*(*curpos - 1));
5893        } while (sb == SB_Extend || sb == SB_Format);
5894    }
5895
5896    return sb;
5897}
5898
5899STATIC bool
5900S_isWB(pTHX_ WB_enum previous,
5901             WB_enum before,
5902             WB_enum after,
5903             const U8 * const strbeg,
5904             const U8 * const curpos,
5905             const U8 * const strend,
5906             const bool utf8_target)
5907{
5908    /*  Return a boolean as to if the boundary between 'before' and 'after' is
5909     *  a Unicode word break, using their published algorithm, but tailored for
5910     *  Perl by treating spans of white space as one unit.  Context may be
5911     *  needed to make this determination.  If the value for the character
5912     *  before 'before' is known, it is passed as 'previous'; otherwise that
5913     *  should be set to WB_UNKNOWN.  The other input parameters give the
5914     *  boundaries and current position in the matching of the string.  That
5915     *  is, 'curpos' marks the position where the character whose wb value is
5916     *  'after' begins.  See http://www.unicode.org/reports/tr29/ */
5917
5918    U8 * before_pos = (U8 *) curpos;
5919    U8 * after_pos = (U8 *) curpos;
5920    WB_enum prev = before;
5921    WB_enum next;
5922
5923    PERL_ARGS_ASSERT_ISWB;
5924
5925    /* Rule numbers in the comments below are as of Unicode 9.0 */
5926
5927  redo:
5928    before = prev;
5929    switch (WB_table[before][after]) {
5930        case WB_BREAKABLE:
5931            return TRUE;
5932
5933        case WB_NOBREAK:
5934            return FALSE;
5935
5936        case WB_hs_then_hs:     /* 2 horizontal spaces in a row */
5937            next = advance_one_WB(&after_pos, strend, utf8_target,
5938                                 FALSE /* Don't skip Extend nor Format */ );
5939            /* A space immediately preceding an Extend or Format is attached
5940             * to by them, and hence gets separated from previous spaces.
5941             * Otherwise don't break between horizontal white space */
5942            return next == WB_Extend || next == WB_Format;
5943
5944        /* WB4 Ignore Format and Extend characters, except when they appear at
5945         * the beginning of a region of text.  This code currently isn't
5946         * general purpose, but it works as the rules are currently and likely
5947         * to be laid out.  The reason it works is that when 'they appear at
5948         * the beginning of a region of text', the rule is to break before
5949         * them, just like any other character.  Therefore, the default rule
5950         * applies and we don't have to look in more depth.  Should this ever
5951         * change, we would have to have 2 'case' statements, like in the rules
5952         * below, and backup a single character (not spacing over the extend
5953         * ones) and then see if that is one of the region-end characters and
5954         * go from there */
5955        case WB_Ex_or_FO_or_ZWJ_then_foo:
5956            prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5957            goto redo;
5958
5959        case WB_DQ_then_HL + WB_BREAKABLE:
5960        case WB_DQ_then_HL + WB_NOBREAK:
5961
5962            /* WB7c  Hebrew_Letter Double_Quote  ��  Hebrew_Letter */
5963
5964            if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5965                                                            == WB_Hebrew_Letter)
5966            {
5967                return FALSE;
5968            }
5969
5970             return WB_table[before][after] - WB_DQ_then_HL == WB_BREAKABLE;
5971
5972        case WB_HL_then_DQ + WB_BREAKABLE:
5973        case WB_HL_then_DQ + WB_NOBREAK:
5974
5975            /* WB7b  Hebrew_Letter  ��  Double_Quote Hebrew_Letter */
5976
5977            if (advance_one_WB(&after_pos, strend, utf8_target,
5978                                       TRUE /* Do skip Extend and Format */ )
5979                                                            == WB_Hebrew_Letter)
5980            {
5981                return FALSE;
5982            }
5983
5984            return WB_table[before][after] - WB_HL_then_DQ == WB_BREAKABLE;
5985
5986        case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_NOBREAK:
5987        case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_BREAKABLE:
5988
5989            /* WB6  (ALetter | Hebrew_Letter)  ��  (MidLetter | MidNumLet
5990             *       | Single_Quote) (ALetter | Hebrew_Letter) */
5991
5992            next = advance_one_WB(&after_pos, strend, utf8_target,
5993                                       TRUE /* Do skip Extend and Format */ );
5994
5995            if (next == WB_ALetter || next == WB_Hebrew_Letter)
5996            {
5997                return FALSE;
5998            }
5999
6000            return WB_table[before][after]
6001                            - WB_LE_or_HL_then_MB_or_ML_or_SQ == WB_BREAKABLE;
6002
6003        case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_NOBREAK:
6004        case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_BREAKABLE:
6005
6006            /* WB7  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet
6007             *       | Single_Quote)  ��  (ALetter | Hebrew_Letter) */
6008
6009            prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
6010            if (prev == WB_ALetter || prev == WB_Hebrew_Letter)
6011            {
6012                return FALSE;
6013            }
6014
6015            return WB_table[before][after]
6016                            - WB_MB_or_ML_or_SQ_then_LE_or_HL == WB_BREAKABLE;
6017
6018        case WB_MB_or_MN_or_SQ_then_NU + WB_NOBREAK:
6019        case WB_MB_or_MN_or_SQ_then_NU + WB_BREAKABLE:
6020
6021            /* WB11  Numeric (MidNum | (MidNumLet | Single_Quote))  ��  Numeric
6022             * */
6023
6024            if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
6025                                                            == WB_Numeric)
6026            {
6027                return FALSE;
6028            }
6029
6030            return WB_table[before][after]
6031                                - WB_MB_or_MN_or_SQ_then_NU == WB_BREAKABLE;
6032
6033        case WB_NU_then_MB_or_MN_or_SQ + WB_NOBREAK:
6034        case WB_NU_then_MB_or_MN_or_SQ + WB_BREAKABLE:
6035
6036            /* WB12  Numeric  ��  (MidNum | MidNumLet | Single_Quote) Numeric */
6037
6038            if (advance_one_WB(&after_pos, strend, utf8_target,
6039                                       TRUE /* Do skip Extend and Format */ )
6040                                                            == WB_Numeric)
6041            {
6042                return FALSE;
6043            }
6044
6045            return WB_table[before][after]
6046                                - WB_NU_then_MB_or_MN_or_SQ == WB_BREAKABLE;
6047
6048        case WB_RI_then_RI + WB_NOBREAK:
6049        case WB_RI_then_RI + WB_BREAKABLE:
6050            {
6051                int RI_count = 1;
6052
6053                /* Do not break within emoji flag sequences. That is, do not
6054                 * break between regional indicator (RI) symbols if there is an
6055                 * odd number of RI characters before the potential break
6056                 * point.
6057                 *
6058                 * WB15   sot (RI RI)* RI �� RI
6059                 * WB16 [^RI] (RI RI)* RI �� RI */
6060
6061                while (backup_one_WB(&previous,
6062                                     strbeg,
6063                                     &before_pos,
6064                                     utf8_target) == WB_Regional_Indicator)
6065                {
6066                    RI_count++;
6067                }
6068
6069                return RI_count % 2 != 1;
6070            }
6071
6072        default:
6073            break;
6074    }
6075
6076#ifdef DEBUGGING
6077    Perl_re_printf( aTHX_  "Unhandled WB pair: WB_table[%d, %d] = %d\n",
6078                                  before, after, WB_table[before][after]);
6079    assert(0);
6080#endif
6081    return TRUE;
6082}
6083
6084STATIC WB_enum
6085S_advance_one_WB(pTHX_ U8 ** curpos,
6086                       const U8 * const strend,
6087                       const bool utf8_target,
6088                       const bool skip_Extend_Format)
6089{
6090    WB_enum wb;
6091
6092    PERL_ARGS_ASSERT_ADVANCE_ONE_WB;
6093
6094    if (*curpos >= strend) {
6095        return WB_EDGE;
6096    }
6097
6098    if (utf8_target) {
6099
6100        /* Advance over Extend and Format */
6101        do {
6102            *curpos += UTF8SKIP(*curpos);
6103            if (*curpos >= strend) {
6104                return WB_EDGE;
6105            }
6106            wb = getWB_VAL_UTF8(*curpos, strend);
6107        } while (    skip_Extend_Format
6108                 && (wb == WB_Extend || wb == WB_Format));
6109    }
6110    else {
6111        do {
6112            (*curpos)++;
6113            if (*curpos >= strend) {
6114                return WB_EDGE;
6115            }
6116            wb = getWB_VAL_CP(**curpos);
6117        } while (    skip_Extend_Format
6118                 && (wb == WB_Extend || wb == WB_Format));
6119    }
6120
6121    return wb;
6122}
6123
6124STATIC WB_enum
6125S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
6126{
6127    WB_enum wb;
6128
6129    PERL_ARGS_ASSERT_BACKUP_ONE_WB;
6130
6131    /* If we know what the previous character's break value is, don't have
6132        * to look it up */
6133    if (*previous != WB_UNKNOWN) {
6134        wb = *previous;
6135
6136        /* But we need to move backwards by one */
6137        if (utf8_target) {
6138            *curpos = reghopmaybe3(*curpos, -1, strbeg);
6139            if (! *curpos) {
6140                *previous = WB_EDGE;
6141                *curpos = (U8 *) strbeg;
6142            }
6143            else {
6144                *previous = WB_UNKNOWN;
6145            }
6146        }
6147        else {
6148            (*curpos)--;
6149            *previous = (*curpos <= strbeg) ? WB_EDGE : WB_UNKNOWN;
6150        }
6151
6152        /* And we always back up over these three types */
6153        if (wb != WB_Extend && wb != WB_Format && wb != WB_ZWJ) {
6154            return wb;
6155        }
6156    }
6157
6158    if (*curpos < strbeg) {
6159        return WB_EDGE;
6160    }
6161
6162    if (utf8_target) {
6163        U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
6164        if (! prev_char_pos) {
6165            return WB_EDGE;
6166        }
6167
6168        /* Back up over Extend and Format.  curpos is always just to the right
6169         * of the character whose value we are getting */
6170        do {
6171            U8 * prev_prev_char_pos;
6172            if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos,
6173                                                   -1,
6174                                                   strbeg)))
6175            {
6176                wb = getWB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
6177                *curpos = prev_char_pos;
6178                prev_char_pos = prev_prev_char_pos;
6179            }
6180            else {
6181                *curpos = (U8 *) strbeg;
6182                return WB_EDGE;
6183            }
6184        } while (wb == WB_Extend || wb == WB_Format || wb == WB_ZWJ);
6185    }
6186    else {
6187        do {
6188            if (*curpos - 2 < strbeg) {
6189                *curpos = (U8 *) strbeg;
6190                return WB_EDGE;
6191            }
6192            (*curpos)--;
6193            wb = getWB_VAL_CP(*(*curpos - 1));
6194        } while (wb == WB_Extend || wb == WB_Format);
6195    }
6196
6197    return wb;
6198}
6199
6200/* Macros for regmatch(), using its internal variables */
6201#define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
6202#define NEXTCHR_IS_EOS (nextbyte < 0)
6203
6204#define SET_nextchr \
6205    nextbyte = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
6206
6207#define SET_locinput(p) \
6208    locinput = (p);  \
6209    SET_nextchr
6210
6211#define sayYES goto yes
6212#define sayNO goto no
6213#define sayNO_SILENT goto no_silent
6214
6215/* we don't use STMT_START/END here because it leads to
6216   "unreachable code" warnings, which are bogus, but distracting. */
6217#define CACHEsayNO \
6218    if (ST.cache_mask) \
6219       reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
6220    sayNO
6221
6222#define EVAL_CLOSE_PAREN_IS(st,expr)                        \
6223(                                                           \
6224    (   ( st )                                         ) && \
6225    (   ( st )->u.eval.close_paren                     ) && \
6226    ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) )    \
6227)
6228
6229#define EVAL_CLOSE_PAREN_IS_TRUE(st,expr)                   \
6230(                                                           \
6231    (   ( st )                                         ) && \
6232    (   ( st )->u.eval.close_paren                     ) && \
6233    (   ( expr )                                       ) && \
6234    ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) )    \
6235)
6236
6237
6238#define EVAL_CLOSE_PAREN_SET(st,expr) \
6239    (st)->u.eval.close_paren = ( (expr) + 1 )
6240
6241#define EVAL_CLOSE_PAREN_CLEAR(st) \
6242    (st)->u.eval.close_paren = 0
6243
6244/* push a new state then goto it */
6245
6246#define PUSH_STATE_GOTO(state, node, input, eol, sr0)       \
6247    pushinput = input; \
6248    pusheol = eol; \
6249    pushsr0 = sr0; \
6250    scan = node; \
6251    st->resume_state = state; \
6252    goto push_state;
6253
6254/* push a new state with success backtracking, then goto it */
6255
6256#define PUSH_YES_STATE_GOTO(state, node, input, eol, sr0)   \
6257    pushinput = input; \
6258    pusheol = eol;     \
6259    pushsr0 = sr0; \
6260    scan = node; \
6261    st->resume_state = state; \
6262    goto push_yes_state;
6263
6264#define DEBUG_STATE_pp(pp)                                  \
6265    DEBUG_STATE_r({                                         \
6266        DUMP_EXEC_POS(locinput, scan, utf8_target,depth);   \
6267        Perl_re_printf( aTHX_                               \
6268            "%*s" pp " %s%s%s%s%s\n",                       \
6269            INDENT_CHARS(depth), "",                        \
6270            REGNODE_NAME(st->resume_state),                  \
6271            ((st==yes_state||st==mark_state) ? "[" : ""),   \
6272            ((st==yes_state) ? "Y" : ""),                   \
6273            ((st==mark_state) ? "M" : ""),                  \
6274            ((st==yes_state||st==mark_state) ? "]" : "")    \
6275        );                                                  \
6276    });
6277
6278/*
6279
6280regmatch() - main matching routine
6281
6282This is basically one big switch statement in a loop. We execute an op,
6283set 'next' to point the next op, and continue. If we come to a point which
6284we may need to backtrack to on failure such as (A|B|C), we push a
6285backtrack state onto the backtrack stack. On failure, we pop the top
6286state, and re-enter the loop at the state indicated. If there are no more
6287states to pop, we return failure.
6288
6289Sometimes we also need to backtrack on success; for example /A+/, where
6290after successfully matching one A, we need to go back and try to
6291match another one; similarly for lookahead assertions: if the assertion
6292completes successfully, we backtrack to the state just before the assertion
6293and then carry on.  In these cases, the pushed state is marked as
6294'backtrack on success too'. This marking is in fact done by a chain of
6295pointers, each pointing to the previous 'yes' state. On success, we pop to
6296the nearest yes state, discarding any intermediate failure-only states.
6297Sometimes a yes state is pushed just to force some cleanup code to be
6298called at the end of a successful match or submatch; e.g. (??{$re}) uses
6299it to free the inner regex.
6300
6301Note that failure backtracking rewinds the cursor position, while
6302success backtracking leaves it alone.
6303
6304A pattern is complete when the END op is executed, while a subpattern
6305such as (?=foo) is complete when the SUCCESS op is executed. Both of these
6306ops trigger the "pop to last yes state if any, otherwise return true"
6307behaviour.
6308
6309A common convention in this function is to use A and B to refer to the two
6310subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
6311the subpattern to be matched possibly multiple times, while B is the entire
6312rest of the pattern. Variable and state names reflect this convention.
6313
6314The states in the main switch are the union of ops and failure/success of
6315substates associated with that op.  For example, IFMATCH is the op
6316that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
6317'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
6318successfully matched A and IFMATCH_A_fail is a state saying that we have
6319just failed to match A. Resume states always come in pairs. The backtrack
6320state we push is marked as 'IFMATCH_A', but when that is popped, we resume
6321at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
6322on success or failure.
6323
6324The struct that holds a backtracking state is actually a big union, with
6325one variant for each major type of op. The variable st points to the
6326top-most backtrack struct. To make the code clearer, within each
6327block of code we #define ST to alias the relevant union.
6328
6329Here's a concrete example of a (vastly oversimplified) IFMATCH
6330implementation:
6331
6332    switch (state) {
6333    ....
6334
6335#define ST st->u.ifmatch
6336
6337    case IFMATCH: // we are executing the IFMATCH op, (?=A)B
6338        ST.foo = ...; // some state we wish to save
6339        ...
6340        // push a yes backtrack state with a resume value of
6341        // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
6342        // first node of A:
6343        PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
6344        // NOTREACHED
6345
6346    case IFMATCH_A: // we have successfully executed A; now continue with B
6347        next = B;
6348        bar = ST.foo; // do something with the preserved value
6349        break;
6350
6351    case IFMATCH_A_fail: // A failed, so the assertion failed
6352        ...;   // do some housekeeping, then ...
6353        sayNO; // propagate the failure
6354
6355#undef ST
6356
6357    ...
6358    }
6359
6360For any old-timers reading this who are familiar with the old recursive
6361approach, the code above is equivalent to:
6362
6363    case IFMATCH: // we are executing the IFMATCH op, (?=A)B
6364    {
6365        int foo = ...
6366        ...
6367        if (regmatch(A)) {
6368            next = B;
6369            bar = foo;
6370            break;
6371        }
6372        ...;   // do some housekeeping, then ...
6373        sayNO; // propagate the failure
6374    }
6375
6376The topmost backtrack state, pointed to by st, is usually free. If you
6377want to claim it, populate any ST.foo fields in it with values you wish to
6378save, then do one of
6379
6380        PUSH_STATE_GOTO(resume_state, node, newinput, new_eol);
6381        PUSH_YES_STATE_GOTO(resume_state, node, newinput, new_eol);
6382
6383which sets that backtrack state's resume value to 'resume_state', pushes a
6384new free entry to the top of the backtrack stack, then goes to 'node'.
6385On backtracking, the free slot is popped, and the saved state becomes the
6386new free state. An ST.foo field in this new top state can be temporarily
6387accessed to retrieve values, but once the main loop is re-entered, it
6388becomes available for reuse.
6389
6390Note that the depth of the backtrack stack constantly increases during the
6391left-to-right execution of the pattern, rather than going up and down with
6392the pattern nesting. For example the stack is at its maximum at Z at the
6393end of the pattern, rather than at X in the following:
6394
6395    /(((X)+)+)+....(Y)+....Z/
6396
6397The only exceptions to this are lookahead/behind assertions and the cut,
6398(?>A), which pop all the backtrack states associated with A before
6399continuing.
6400
6401Backtrack state structs are allocated in slabs of about 4K in size.
6402PL_regmatch_state and st always point to the currently active state,
6403and PL_regmatch_slab points to the slab currently containing
6404PL_regmatch_state.  The first time regmatch() is called, the first slab is
6405allocated, and is never freed until interpreter destruction. When the slab
6406is full, a new one is allocated and chained to the end. At exit from
6407regmatch(), slabs allocated since entry are freed.
6408
6409In order to work with variable length lookbehinds, an upper limit is placed on
6410lookbehinds which is set to where the match position is at the end of where the
6411lookbehind would get to.  Nothing in the lookbehind should match above that,
6412except we should be able to look beyond if for things like \b, which need the
6413next character in the string to be able to determine if this is a boundary or
6414not.  We also can't match the end of string/line unless we are also at the end
6415of the entire string, so NEXTCHR_IS_EOS remains the same, and for those OPs
6416that match a width, we have to add a condition that they are within the legal
6417bounds of our window into the string.
6418
6419*/
6420
6421/* returns -1 on failure, $+[0] on success */
6422STATIC SSize_t
6423S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
6424{
6425    const bool utf8_target = reginfo->is_utf8_target;
6426    const U32 uniflags = UTF8_ALLOW_DEFAULT;
6427    REGEXP *rex_sv = reginfo->prog;
6428    regexp *rex = ReANY(rex_sv);
6429    RXi_GET_DECL(rex,rexi);
6430    /* the current state. This is a cached copy of PL_regmatch_state */
6431    regmatch_state *st;
6432    /* cache heavy used fields of st in registers */
6433    regnode *scan;
6434    regnode *next;
6435    U32 n = 0;	/* general value; init to avoid compiler warning */
6436    U32 utmp = 0;  /* tmp variable - valid for at most one opcode */
6437    SSize_t ln = 0; /* len or last;  init to avoid compiler warning */
6438    SSize_t endref = 0; /* offset of end of backref when ln is start */
6439    char *locinput = startpos;
6440    char *loceol = reginfo->strend;
6441    char *pushinput; /* where to continue after a PUSH */
6442    char *pusheol;   /* where to stop matching (loceol) after a PUSH */
6443    U8   *pushsr0;   /* save starting pos of script run */
6444    PERL_INT_FAST16_T nextbyte;   /* is always set to UCHARAT(locinput), or -1
6445                                     at EOS */
6446
6447    bool result = 0;	    /* return value of S_regmatch */
6448    U32 depth = 0;            /* depth of backtrack stack */
6449    U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
6450    const U32 max_nochange_depth =
6451        (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
6452        3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
6453    regmatch_state *yes_state = NULL; /* state to pop to on success of
6454                                                            subpattern */
6455    /* mark_state piggy backs on the yes_state logic so that when we unwind
6456       the stack on success we can update the mark_state as we go */
6457    regmatch_state *mark_state = NULL; /* last mark state we have seen */
6458    regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
6459    struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
6460    U32 state_num;
6461    bool no_final = 0;      /* prevent failure from backtracking? */
6462    bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
6463    char *startpoint = locinput;
6464    SV *popmark = NULL;     /* are we looking for a mark? */
6465    SV *sv_commit = NULL;   /* last mark name seen in failure */
6466    SV *sv_yes_mark = NULL; /* last mark name we have seen
6467                               during a successful match */
6468    U32 lastopen = 0;       /* last open we saw */
6469    bool has_cutgroup = RXp_HAS_CUTGROUP(rex) ? 1 : 0;
6470    SV* const oreplsv = GvSVn(PL_replgv);
6471    /* these three flags are set by various ops to signal information to
6472     * the very next op. They have a useful lifetime of exactly one loop
6473     * iteration, and are not preserved or restored by state pushes/pops
6474     */
6475    bool sw = 0;	    /* the condition value in (?(cond)a|b) */
6476    bool minmod = 0;	    /* the next "{n,m}" is a "{n,m}?" */
6477    int logical = 0;	    /* the following EVAL is:
6478                                0: (?{...})
6479                                1: (?(?{...})X|Y)
6480                                2: (??{...})
6481                               or the following IFMATCH/UNLESSM is:
6482                                false: plain (?=foo)
6483                                true:  used as a condition: (?(?=foo))
6484                            */
6485    PAD* last_pad = NULL;
6486    dMULTICALL;
6487    U8 gimme = G_SCALAR;
6488    CV *caller_cv = NULL;	/* who called us */
6489    CV *last_pushed_cv = NULL;	/* most recently called (?{}) CV */
6490    U32 maxopenparen = 0;       /* max '(' index seen so far */
6491    int to_complement;  /* Invert the result? */
6492    char_class_number_ classnum;
6493    bool is_utf8_pat = reginfo->is_utf8_pat;
6494    bool match = FALSE;
6495    I32 orig_savestack_ix = PL_savestack_ix;
6496    U8 * script_run_begin = NULL;
6497    char *match_end= NULL; /* where a match MUST end to be considered successful */
6498    bool is_accepted = FALSE; /* have we hit an ACCEPT opcode? */
6499    re_fold_t folder = NULL;  /* used by various EXACTish regops */
6500    const U8 * fold_array = NULL; /* used by various EXACTish regops */
6501
6502/* Solaris Studio 12.3 messes up fetching PL_charclass['\n'] */
6503#if (defined(__SUNPRO_C) && (__SUNPRO_C == 0x5120) && defined(__x86_64) && defined(USE_64_BIT_ALL))
6504#  define SOLARIS_BAD_OPTIMIZER
6505    const U32 *pl_charclass_dup = PL_charclass;
6506#  define PL_charclass pl_charclass_dup
6507#endif
6508
6509#ifdef DEBUGGING
6510    DECLARE_AND_GET_RE_DEBUG_FLAGS;
6511#endif
6512
6513    /* protect against undef(*^R) */
6514    SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv));
6515
6516    /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
6517    multicall_oldcatch = 0;
6518    PERL_UNUSED_VAR(multicall_cop);
6519
6520    PERL_ARGS_ASSERT_REGMATCH;
6521
6522    st = PL_regmatch_state;
6523
6524    /* Note that nextbyte is a byte even in UTF */
6525    SET_nextchr;
6526    scan = prog;
6527
6528    DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
6529            DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
6530            Perl_re_printf( aTHX_ "regmatch start\n" );
6531    }));
6532
6533    while (scan != NULL) {
6534        next = scan + NEXT_OFF(scan);
6535        if (next == scan)
6536            next = NULL;
6537        state_num = OP(scan);
6538
6539      reenter_switch:
6540        DEBUG_EXECUTE_r(
6541            if (state_num <= REGNODE_MAX) {
6542                SV * const prop = sv_newmortal();
6543                regnode *rnext = regnext(scan);
6544
6545                DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
6546                regprop(rex, prop, scan, reginfo, NULL);
6547                Perl_re_printf( aTHX_
6548                    "%*s%" IVdf ":%s(%" IVdf ")\n",
6549                    INDENT_CHARS(depth), "",
6550                    (IV)(scan - rexi->program),
6551                    SvPVX_const(prop),
6552                    (REGNODE_TYPE(OP(scan)) == END || !rnext) ?
6553                        0 : (IV)(rnext - rexi->program));
6554            }
6555        );
6556
6557        to_complement = 0;
6558
6559        SET_nextchr;
6560        assert(nextbyte < 256 && (nextbyte >= 0 || nextbyte == NEXTCHR_EOS));
6561
6562        switch (state_num) {
6563            SV * anyofh_list;
6564
6565        case SBOL: /*  /^../ and /\A../  */
6566            if (locinput == reginfo->strbeg)
6567                break;
6568            sayNO;
6569
6570        case MBOL: /*  /^../m  */
6571            if (locinput == reginfo->strbeg ||
6572                (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
6573            {
6574                break;
6575            }
6576            sayNO;
6577
6578        case GPOS: /*  \G  */
6579            if (locinput == reginfo->ganch)
6580                break;
6581            sayNO;
6582
6583        case KEEPS: /*   \K  */
6584            /* update the startpoint */
6585            st->u.keeper.val = RXp_OFFS_START(rex,0);
6586            RXp_OFFSp(rex)[0].start = locinput - reginfo->strbeg;
6587            PUSH_STATE_GOTO(KEEPS_next, next, locinput, loceol,
6588                            script_run_begin);
6589            NOT_REACHED; /* NOTREACHED */
6590
6591        case KEEPS_next_fail:
6592            /* rollback the start point change */
6593            RXp_OFFSp(rex)[0].start = st->u.keeper.val;
6594            sayNO_SILENT;
6595            NOT_REACHED; /* NOTREACHED */
6596
6597        case MEOL: /* /..$/m  */
6598            if (!NEXTCHR_IS_EOS && nextbyte != '\n')
6599                sayNO;
6600            break;
6601
6602        case SEOL: /* /..$/  */
6603            if (!NEXTCHR_IS_EOS && nextbyte != '\n')
6604                sayNO;
6605            if (reginfo->strend - locinput > 1)
6606                sayNO;
6607            break;
6608
6609        case EOS: /*  \z  */
6610            if (!NEXTCHR_IS_EOS)
6611                sayNO;
6612            break;
6613
6614        case SANY: /*  /./s  */
6615            if (NEXTCHR_IS_EOS || locinput >= loceol)
6616                sayNO;
6617            goto increment_locinput;
6618
6619        case REG_ANY: /*  /./  */
6620            if (   NEXTCHR_IS_EOS
6621                || locinput >= loceol
6622                || nextbyte == '\n')
6623            {
6624                sayNO;
6625            }
6626            goto increment_locinput;
6627
6628
6629#undef  ST
6630#define ST st->u.trie
6631        case TRIEC: /* (ab|cd) with known charclass */
6632            /* In this case the charclass data is available inline so
6633               we can fail fast without a lot of extra overhead.
6634             */
6635            if ( !   NEXTCHR_IS_EOS
6636                &&   locinput < loceol
6637                && ! ANYOF_BITMAP_TEST(scan, nextbyte))
6638            {
6639                DEBUG_EXECUTE_r(
6640                    Perl_re_exec_indentf( aTHX_  "%sTRIE: failed to match trie start class...%s\n",
6641                              depth, PL_colors[4], PL_colors[5])
6642                );
6643                sayNO_SILENT;
6644                NOT_REACHED; /* NOTREACHED */
6645            }
6646            /* FALLTHROUGH */
6647        case TRIE:  /* (ab|cd)  */
6648            /* the basic plan of execution of the trie is:
6649             * At the beginning, run though all the states, and
6650             * find the longest-matching word. Also remember the position
6651             * of the shortest matching word. For example, this pattern:
6652             *    1  2 3 4    5
6653             *    ab|a|x|abcd|abc
6654             * when matched against the string "abcde", will generate
6655             * accept states for all words except 3, with the longest
6656             * matching word being 4, and the shortest being 2 (with
6657             * the position being after char 1 of the string).
6658             *
6659             * Then for each matching word, in word order (i.e. 1,2,4,5),
6660             * we run the remainder of the pattern; on each try setting
6661             * the current position to the character following the word,
6662             * returning to try the next word on failure.
6663             *
6664             * We avoid having to build a list of words at runtime by
6665             * using a compile-time structure, wordinfo[].prev, which
6666             * gives, for each word, the previous accepting word (if any).
6667             * In the case above it would contain the mappings 1->2, 2->0,
6668             * 3->0, 4->5, 5->1.  We can use this table to generate, from
6669             * the longest word (4 above), a list of all words, by
6670             * following the list of prev pointers; this gives us the
6671             * unordered list 4,5,1,2. Then given the current word we have
6672             * just tried, we can go through the list and find the
6673             * next-biggest word to try (so if we just failed on word 2,
6674             * the next in the list is 4).
6675             *
6676             * Since at runtime we don't record the matching position in
6677             * the string for each word, we have to work that out for
6678             * each word we're about to process. The wordinfo table holds
6679             * the character length of each word; given that we recorded
6680             * at the start: the position of the shortest word and its
6681             * length in chars, we just need to move the pointer the
6682             * difference between the two char lengths. Depending on
6683             * Unicode status and folding, that's cheap or expensive.
6684             *
6685             * This algorithm is optimised for the case where are only a
6686             * small number of accept states, i.e. 0,1, or maybe 2.
6687             * With lots of accepts states, and having to try all of them,
6688             * it becomes quadratic on number of accept states to find all
6689             * the next words.
6690             */
6691
6692            {
6693                /* what type of TRIE am I? (utf8 makes this contextual) */
6694                DECL_TRIE_TYPE(scan);
6695
6696                /* what trie are we using right now */
6697                reg_trie_data * const trie
6698                    = (reg_trie_data*)rexi->data->data[ ARG1u( scan ) ];
6699                ST.before_paren = trie->before_paren;
6700                ST.after_paren = trie->after_paren;
6701                assert(ST.before_paren<=rex->nparens);
6702                assert(ST.after_paren<=rex->nparens);
6703
6704                HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG1u( scan ) + 1 ]);
6705                U32 state = trie->startstate;
6706
6707                if (FLAGS(scan) == EXACTL || FLAGS(scan) == EXACTFLU8) {
6708                    CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
6709                    if (utf8_target
6710                        && ! NEXTCHR_IS_EOS
6711                        && UTF8_IS_ABOVE_LATIN1(nextbyte)
6712                        && FLAGS(scan) == EXACTL)
6713                    {
6714                        /* We only output for EXACTL, as we let the folder
6715                         * output this message for EXACTFLU8 to avoid
6716                         * duplication */
6717                        _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
6718                                                               reginfo->strend);
6719                    }
6720                }
6721                if (   trie->bitmap
6722                    && (     NEXTCHR_IS_EOS
6723                        ||   locinput >= loceol
6724                        || ! TRIE_BITMAP_TEST(trie, nextbyte)))
6725                {
6726                    if (trie->states[ state ].wordnum) {
6727                         DEBUG_EXECUTE_r(
6728                            Perl_re_exec_indentf( aTHX_  "%sTRIE: matched empty string...%s\n",
6729                                          depth, PL_colors[4], PL_colors[5])
6730                        );
6731                        if (!trie->jump)
6732                            break;
6733                    } else {
6734                        DEBUG_EXECUTE_r(
6735                            Perl_re_exec_indentf( aTHX_  "%sTRIE: failed to match trie start class...%s\n",
6736                                          depth, PL_colors[4], PL_colors[5])
6737                        );
6738                        sayNO_SILENT;
6739                   }
6740                }
6741
6742            {
6743                U8 *uc = ( U8* )locinput;
6744
6745                STRLEN len = 0;
6746                STRLEN foldlen = 0;
6747                U8 *uscan = (U8*)NULL;
6748                U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
6749                U32 charcount = 0; /* how many input chars we have matched */
6750                U32 accepted = 0; /* have we seen any accepting states? */
6751
6752                ST.jump = trie->jump;
6753                ST.j_before_paren = trie->j_before_paren;
6754                ST.j_after_paren= trie->j_after_paren;
6755                ST.me = scan;
6756                ST.firstpos = NULL;
6757                ST.longfold = FALSE; /* char longer if folded => it's harder */
6758                ST.nextword = 0;
6759
6760                /* fully traverse the TRIE; note the position of the
6761                   shortest accept state and the wordnum of the longest
6762                   accept state */
6763
6764                while ( state && uc <= (U8*)(loceol) ) {
6765                    U32 base = trie->states[ state ].trans.base;
6766                    UV uvc = 0;
6767                    U16 charid = 0;
6768                    U16 wordnum;
6769                    wordnum = trie->states[ state ].wordnum;
6770
6771                    if (wordnum) { /* it's an accept state */
6772                        if (!accepted) {
6773                            accepted = 1;
6774                            /* record first match position */
6775                            if (ST.longfold) {
6776                                ST.firstpos = (U8*)locinput;
6777                                ST.firstchars = 0;
6778                            }
6779                            else {
6780                                ST.firstpos = uc;
6781                                ST.firstchars = charcount;
6782                            }
6783                        }
6784                        if (!ST.nextword || wordnum < ST.nextword)
6785                            ST.nextword = wordnum;
6786                        ST.topword = wordnum;
6787                    }
6788
6789                    DEBUG_TRIE_EXECUTE_r({
6790                                DUMP_EXEC_POS( (char *)uc, scan, utf8_target, depth );
6791                                /* HERE */
6792                                PerlIO_printf( Perl_debug_log,
6793                                    "%*s%sTRIE: State: %4" UVxf " Accepted: %c ",
6794                                    INDENT_CHARS(depth), "", PL_colors[4],
6795                                    (UV)state, (accepted ? 'Y' : 'N'));
6796                    });
6797
6798                    /* read a char and goto next state */
6799                    if ( base && (foldlen || uc < (U8*)(loceol))) {
6800                        I32 offset;
6801                        REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
6802                                             (U8 *) loceol, uscan,
6803                                             len, uvc, charid, foldlen,
6804                                             foldbuf, uniflags);
6805                        charcount++;
6806                        if (foldlen>0)
6807                            ST.longfold = TRUE;
6808                        if (charid &&
6809                             ( ((offset =
6810                              base + charid - 1 - trie->uniquecharcount)) >= 0)
6811
6812                             && ((U32)offset < trie->lasttrans)
6813                             && trie->trans[offset].check == state)
6814                        {
6815                            state = trie->trans[offset].next;
6816                        }
6817                        else {
6818                            state = 0;
6819                        }
6820                        uc += len;
6821
6822                    }
6823                    else {
6824                        state = 0;
6825                    }
6826                    DEBUG_TRIE_EXECUTE_r(
6827                        Perl_re_printf( aTHX_
6828                            "TRIE: Charid:%3x CP:%4" UVxf " After State: %4" UVxf "%s\n",
6829                            charid, uvc, (UV)state, PL_colors[5] );
6830                    );
6831                }
6832                if (!accepted)
6833                   sayNO;
6834
6835                /* calculate total number of accept states */
6836                {
6837                    U16 w = ST.topword;
6838                    accepted = 0;
6839                    while (w) {
6840                        w = trie->wordinfo[w].prev;
6841                        accepted++;
6842                    }
6843                    ST.accepted = accepted;
6844                }
6845
6846                DEBUG_EXECUTE_r(
6847                    Perl_re_exec_indentf( aTHX_  "%sTRIE: got %" IVdf " possible matches%s\n",
6848                        depth,
6849                        PL_colors[4], (IV)ST.accepted, PL_colors[5] );
6850                );
6851                goto trie_first_try; /* jump into the fail handler */
6852            }}
6853            NOT_REACHED; /* NOTREACHED */
6854
6855        case TRIE_next_fail: /* we failed - try next alternative */
6856        {
6857            U8 *uc;
6858            if (RE_PESSIMISTIC_PARENS) {
6859                REGCP_UNWIND(ST.lastcp);
6860                regcppop(rex,&maxopenparen);
6861            }
6862            if ( ST.jump ) {
6863                /* undo any captures done in the tail part of a branch,
6864                 * e.g.
6865                 *    /(?:X(.)(.)|Y(.)).../
6866                 * where the trie just matches X then calls out to do the
6867                 * rest of the branch */
6868                REGCP_UNWIND(ST.cp);
6869                UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6870                if (ST.after_paren) {
6871                    assert(ST.before_paren<=rex->nparens && ST.after_paren<=rex->nparens);
6872                    CAPTURE_CLEAR(ST.before_paren+1, ST.after_paren, "TRIE_next_fail");
6873                }
6874            }
6875            if (!--ST.accepted) {
6876                DEBUG_EXECUTE_r({
6877                    Perl_re_exec_indentf( aTHX_  "%sTRIE failed...%s\n",
6878                        depth,
6879                        PL_colors[4],
6880                        PL_colors[5] );
6881                });
6882                sayNO_SILENT;
6883            }
6884            {
6885                /* Find next-highest word to process.  Note that this code
6886                 * is O(N^2) per trie run (O(N) per branch), so keep tight */
6887                U16 min = 0;
6888                U16 word;
6889                U16 const nextword = ST.nextword;
6890                reg_trie_wordinfo * const wordinfo
6891                    = ((reg_trie_data*)rexi->data->data[ARG1u(ST.me)])->wordinfo;
6892                for (word=ST.topword; word; word=wordinfo[word].prev) {
6893                    if (word > nextword && (!min || word < min))
6894                        min = word;
6895                }
6896                ST.nextword = min;
6897            }
6898
6899          trie_first_try:
6900            if (do_cutgroup) {
6901                do_cutgroup = 0;
6902                no_final = 0;
6903            }
6904
6905            if ( ST.jump ) {
6906                ST.lastparen = RXp_LASTPAREN(rex);
6907                ST.lastcloseparen = RXp_LASTCLOSEPAREN(rex);
6908                REGCP_SET(ST.cp);
6909            }
6910
6911            /* find start char of end of current word */
6912            {
6913                U32 chars; /* how many chars to skip */
6914                reg_trie_data * const trie
6915                    = (reg_trie_data*)rexi->data->data[ARG1u(ST.me)];
6916
6917                assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
6918                            >=  ST.firstchars);
6919                chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
6920                            - ST.firstchars;
6921                uc = ST.firstpos;
6922
6923                if (ST.longfold) {
6924                    /* the hard option - fold each char in turn and find
6925                     * its folded length (which may be different */
6926                    U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
6927                    STRLEN foldlen;
6928                    STRLEN len;
6929                    UV uvc;
6930                    U8 *uscan;
6931
6932                    while (chars) {
6933                        if (utf8_target) {
6934                            /* XXX This assumes the length is well-formed, as
6935                             * does the UTF8SKIP below */
6936                            uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
6937                                                    uniflags);
6938                            uc += len;
6939                        }
6940                        else {
6941                            uvc = *uc;
6942                            uc++;
6943                        }
6944                        uvc = to_uni_fold(uvc, foldbuf, &foldlen);
6945                        uscan = foldbuf;
6946                        while (foldlen) {
6947                            if (!--chars)
6948                                break;
6949                            uvc = utf8n_to_uvchr(uscan, foldlen, &len,
6950                                                 uniflags);
6951                            uscan += len;
6952                            foldlen -= len;
6953                        }
6954                    }
6955                }
6956                else {
6957                    if (utf8_target)
6958                        uc = utf8_hop(uc, chars);
6959                    else
6960                        uc += chars;
6961                }
6962            }
6963            if (ST.jump && ST.jump[ST.nextword]) {
6964                scan = ST.me + ST.jump[ST.nextword];
6965                ST.before_paren = ST.j_before_paren[ST.nextword];
6966                assert(ST.before_paren <= rex->nparens);
6967                ST.after_paren = ST.j_after_paren[ST.nextword];
6968                assert(ST.after_paren <= rex->nparens);
6969            } else {
6970                scan = ST.me + NEXT_OFF(ST.me);
6971            }
6972
6973
6974            DEBUG_EXECUTE_r({
6975                Perl_re_exec_indentf( aTHX_  "%sTRIE matched word #%d, continuing%s\n",
6976                    depth,
6977                    PL_colors[4],
6978                    ST.nextword,
6979                    PL_colors[5]
6980                    );
6981            });
6982
6983            if ( ST.accepted > 1 || has_cutgroup || ST.jump ) {
6984                if (RE_PESSIMISTIC_PARENS) {
6985                    (void)regcppush(rex, 0, maxopenparen);
6986                    REGCP_SET(ST.lastcp);
6987                }
6988                PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc, loceol,
6989                                script_run_begin);
6990                NOT_REACHED; /* NOTREACHED */
6991            }
6992            /* only one choice left - just continue */
6993            DEBUG_EXECUTE_r({
6994                AV *const trie_words
6995                    = MUTABLE_AV(rexi->data->data[ARG1u(ST.me)+TRIE_WORDS_OFFSET]);
6996                SV ** const tmp = trie_words
6997                        ? av_fetch(trie_words, ST.nextword - 1, 0) : NULL;
6998                SV *sv= tmp ? sv_newmortal() : NULL;
6999
7000                Perl_re_exec_indentf( aTHX_  "%sTRIE: only one match left, short-circuiting: #%d <%s>%s\n",
7001                    depth, PL_colors[4],
7002                    ST.nextword,
7003                    tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
7004                            PL_colors[0], PL_colors[1],
7005                            (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
7006                        )
7007                    : "not compiled under -Dr",
7008                    PL_colors[5] );
7009            });
7010
7011            locinput = (char*)uc;
7012            continue; /* execute rest of RE */
7013            /* NOTREACHED */
7014        }
7015#undef  ST
7016
7017        case LEXACT_REQ8:
7018            if (! utf8_target) {
7019                sayNO;
7020            }
7021            /* FALLTHROUGH */
7022
7023        case LEXACT:
7024        {
7025            char *s;
7026
7027            s = STRINGl(scan);
7028            ln = STR_LENl(scan);
7029            goto join_short_long_exact;
7030
7031        case EXACTL:             /*  /abc/l       */
7032            CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
7033
7034            /* Complete checking would involve going through every character
7035             * matched by the string to see if any is above latin1.  But the
7036             * comparison otherwise might very well be a fast assembly
7037             * language routine, and I (khw) don't think slowing things down
7038             * just to check for this warning is worth it.  So this just checks
7039             * the first character */
7040            if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) {
7041                _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
7042            }
7043            goto do_exact;
7044        case EXACT_REQ8:
7045            if (! utf8_target) {
7046                sayNO;
7047            }
7048            /* FALLTHROUGH */
7049
7050        case EXACT:             /*  /abc/        */
7051          do_exact:
7052            s = STRINGs(scan);
7053            ln = STR_LENs(scan);
7054
7055          join_short_long_exact:
7056            if (utf8_target != is_utf8_pat) {
7057                /* The target and the pattern have differing utf8ness. */
7058                char *l = locinput;
7059                const char * const e = s + ln;
7060
7061                if (utf8_target) {
7062                    /* The target is utf8, the pattern is not utf8.
7063                     * Above-Latin1 code points can't match the pattern;
7064                     * invariants match exactly, and the other Latin1 ones need
7065                     * to be downgraded to a single byte in order to do the
7066                     * comparison.  (If we could be confident that the target
7067                     * is not malformed, this could be refactored to have fewer
7068                     * tests by just assuming that if the first bytes match, it
7069                     * is an invariant, but there are tests in the test suite
7070                     * dealing with (??{...}) which violate this) */
7071                    while (s < e) {
7072                        if (   l >= loceol
7073                            || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
7074                        {
7075                            sayNO;
7076                        }
7077                        if (UTF8_IS_INVARIANT(*(U8*)l)) {
7078                            if (*l != *s) {
7079                                sayNO;
7080                            }
7081                            l++;
7082                        }
7083                        else {
7084                            if (EIGHT_BIT_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
7085                            {
7086                                sayNO;
7087                            }
7088                            l += 2;
7089                        }
7090                        s++;
7091                    }
7092                }
7093                else {
7094                    /* The target is not utf8, the pattern is utf8. */
7095                    while (s < e) {
7096                        if (   l >= loceol
7097                            || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
7098                        {
7099                            sayNO;
7100                        }
7101                        if (UTF8_IS_INVARIANT(*(U8*)s)) {
7102                            if (*s != *l) {
7103                                sayNO;
7104                            }
7105                            s++;
7106                        }
7107                        else {
7108                            if (EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
7109                            {
7110                                sayNO;
7111                            }
7112                            s += 2;
7113                        }
7114                        l++;
7115                    }
7116                }
7117                locinput = l;
7118            }
7119            else {
7120                /* The target and the pattern have the same utf8ness. */
7121                /* Inline the first character, for speed. */
7122                if (   loceol - locinput < ln
7123                    || UCHARAT(s) != nextbyte
7124                    || (ln > 1 && memNE(s, locinput, ln)))
7125                {
7126                    sayNO;
7127                }
7128                locinput += ln;
7129            }
7130            break;
7131            }
7132
7133        case EXACTFL:            /*  /abc/il      */
7134          {
7135            const char * s;
7136            U32 fold_utf8_flags;
7137
7138            CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
7139            folder = Perl_foldEQ_locale;
7140            fold_array = PL_fold_locale;
7141            fold_utf8_flags = FOLDEQ_LOCALE;
7142            goto do_exactf;
7143
7144        case EXACTFLU8:           /*  /abc/il; but all 'abc' are above 255, so
7145                                      is effectively /u; hence to match, target
7146                                      must be UTF-8. */
7147            if (! utf8_target) {
7148                sayNO;
7149            }
7150            fold_utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
7151                                             | FOLDEQ_S2_FOLDS_SANE;
7152            folder = S_foldEQ_latin1_s2_folded;
7153            fold_array = PL_fold_latin1;
7154            goto do_exactf;
7155
7156        case EXACTFU_REQ8:      /* /abc/iu with something in /abc/ > 255 */
7157            if (! utf8_target) {
7158                sayNO;
7159            }
7160            assert(is_utf8_pat);
7161            fold_utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
7162#ifdef DEBUGGING
7163            /* this is only used in an assert check, so we restrict it to DEBUGGING mode.
7164             * In theory neither of these variables should be used in this mode. */
7165            folder = NULL;
7166            fold_array = NULL;
7167#endif
7168            goto do_exactf;
7169
7170        case EXACTFUP:          /*  /foo/iu, and something is problematic in
7171                                    'foo' so can't take shortcuts. */
7172            assert(! is_utf8_pat);
7173            folder = Perl_foldEQ_latin1;
7174            fold_array = PL_fold_latin1;
7175            fold_utf8_flags = 0;
7176            goto do_exactf;
7177
7178        case EXACTFU:            /*  /abc/iu      */
7179            folder = S_foldEQ_latin1_s2_folded;
7180            fold_array = PL_fold_latin1;
7181            fold_utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
7182            goto do_exactf;
7183
7184        case EXACTFAA_NO_TRIE:   /* This node only generated for non-utf8
7185                                   patterns */
7186            assert(! is_utf8_pat);
7187            /* FALLTHROUGH */
7188        case EXACTFAA:            /*  /abc/iaa     */
7189            folder = S_foldEQ_latin1_s2_folded;
7190            fold_array = PL_fold_latin1;
7191            fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7192            if (is_utf8_pat || ! utf8_target) {
7193
7194                /* The possible presence of a MICRO SIGN in the pattern forbids
7195                 * us to view a non-UTF-8 pattern as folded when there is a
7196                 * UTF-8 target */
7197                fold_utf8_flags |= FOLDEQ_S2_ALREADY_FOLDED
7198                                  |FOLDEQ_S2_FOLDS_SANE;
7199            }
7200            goto do_exactf;
7201
7202
7203        case EXACTF:             /*  /abc/i    This node only generated for
7204                                               non-utf8 patterns */
7205            assert(! is_utf8_pat);
7206            folder = Perl_foldEQ;
7207            fold_array = PL_fold;
7208            fold_utf8_flags = 0;
7209
7210          do_exactf:
7211            s = STRINGs(scan);
7212            ln = STR_LENs(scan);
7213
7214            if (   utf8_target
7215                || is_utf8_pat
7216                || state_num == EXACTFUP
7217                || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE))
7218            {
7219              /* Either target or the pattern are utf8, or has the issue where
7220               * the fold lengths may differ. */
7221                const char * const l = locinput;
7222                char *e = loceol;
7223
7224                if (! foldEQ_utf8_flags(l, &e, 0,  utf8_target,
7225                                        s, 0,  ln, is_utf8_pat,fold_utf8_flags))
7226                {
7227                    sayNO;
7228                }
7229                locinput = e;
7230                break;
7231            }
7232
7233            /* Neither the target nor the pattern are utf8 */
7234            assert(fold_array);
7235            if (UCHARAT(s) != nextbyte
7236                && !NEXTCHR_IS_EOS
7237                && UCHARAT(s) != fold_array[nextbyte])
7238            {
7239                sayNO;
7240            }
7241            if (loceol - locinput < ln)
7242                sayNO;
7243            assert(folder);
7244            if (ln > 1 && ! folder(aTHX_ locinput, s, ln))
7245                sayNO;
7246            locinput += ln;
7247            break;
7248        }
7249
7250        case NBOUNDL: /*  /\B/l  */
7251            to_complement = 1;
7252            /* FALLTHROUGH */
7253
7254        case BOUNDL:  /*  /\b/l  */
7255        {
7256            bool b1, b2;
7257            CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
7258
7259            if (FLAGS(scan) != TRADITIONAL_BOUND) {
7260                CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
7261                goto boundu;
7262            }
7263
7264            if (utf8_target) {
7265                if (locinput == reginfo->strbeg)
7266                    b1 = isWORDCHAR_LC('\n');
7267                else {
7268                    U8 *p = reghop3((U8*)locinput, -1,
7269                                    (U8*)(reginfo->strbeg));
7270                    b1 = isWORDCHAR_LC_utf8_safe(p, (U8*)(reginfo->strend));
7271                }
7272                b2 = (NEXTCHR_IS_EOS)
7273                    ? isWORDCHAR_LC('\n')
7274                    : isWORDCHAR_LC_utf8_safe((U8*) locinput,
7275                                              (U8*) reginfo->strend);
7276            }
7277            else { /* Here the string isn't utf8 */
7278                b1 = (locinput == reginfo->strbeg)
7279                     ? isWORDCHAR_LC('\n')
7280                     : isWORDCHAR_LC(UCHARAT(locinput - 1));
7281                b2 = (NEXTCHR_IS_EOS)
7282                    ? isWORDCHAR_LC('\n')
7283                    : isWORDCHAR_LC(nextbyte);
7284            }
7285            if (to_complement ^ (b1 == b2)) {
7286                sayNO;
7287            }
7288            break;
7289        }
7290
7291        case NBOUND:  /*  /\B/   */
7292            to_complement = 1;
7293            /* FALLTHROUGH */
7294
7295        case BOUND:   /*  /\b/   */
7296            if (utf8_target) {
7297                goto bound_utf8;
7298            }
7299            goto bound_ascii_match_only;
7300
7301        case NBOUNDA: /*  /\B/a  */
7302            to_complement = 1;
7303            /* FALLTHROUGH */
7304
7305        case BOUNDA:  /*  /\b/a  */
7306        {
7307            bool b1, b2;
7308
7309          bound_ascii_match_only:
7310            /* Here the string isn't utf8, or is utf8 and only ascii characters
7311             * are to match \w.  In the latter case looking at the byte just
7312             * prior to the current one may be just the final byte of a
7313             * multi-byte character.  This is ok.  There are two cases:
7314             * 1) it is a single byte character, and then the test is doing
7315             *    just what it's supposed to.
7316             * 2) it is a multi-byte character, in which case the final byte is
7317             *    never mistakable for ASCII, and so the test will say it is
7318             *    not a word character, which is the correct answer. */
7319            b1 = (locinput == reginfo->strbeg)
7320                 ? isWORDCHAR_A('\n')
7321                 : isWORDCHAR_A(UCHARAT(locinput - 1));
7322            b2 = (NEXTCHR_IS_EOS)
7323                ? isWORDCHAR_A('\n')
7324                : isWORDCHAR_A(nextbyte);
7325            if (to_complement ^ (b1 == b2)) {
7326                sayNO;
7327            }
7328            break;
7329        }
7330
7331        case NBOUNDU: /*  /\B/u  */
7332            to_complement = 1;
7333            /* FALLTHROUGH */
7334
7335        case BOUNDU:  /*  /\b/u  */
7336
7337          boundu:
7338            if (UNLIKELY(reginfo->strbeg >= reginfo->strend)) {
7339                match = FALSE;
7340            }
7341            else if (utf8_target) {
7342              bound_utf8:
7343                switch((bound_type) FLAGS(scan)) {
7344                    case TRADITIONAL_BOUND:
7345                    {
7346                        bool b1, b2;
7347                        if (locinput == reginfo->strbeg) {
7348                            b1 = 0 /* isWORDCHAR_L1('\n') */;
7349                        }
7350                        else {
7351                            U8 *p = reghop3((U8*)locinput, -1,
7352                                            (U8*)(reginfo->strbeg));
7353
7354                            b1 = isWORDCHAR_utf8_safe(p, (U8*) reginfo->strend);
7355                        }
7356                        b2 = (NEXTCHR_IS_EOS)
7357                            ? 0 /* isWORDCHAR_L1('\n') */
7358                            : isWORDCHAR_utf8_safe((U8*)locinput,
7359                                                   (U8*) reginfo->strend);
7360                        match = cBOOL(b1 != b2);
7361                        break;
7362                    }
7363                    case GCB_BOUND:
7364                        if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7365                            match = TRUE; /* GCB always matches at begin and
7366                                             end */
7367                        }
7368                        else {
7369                            /* Find the gcb values of previous and current
7370                             * chars, then see if is a break point */
7371                            match = isGCB(getGCB_VAL_UTF8(
7372                                                reghop3((U8*)locinput,
7373                                                        -1,
7374                                                        (U8*)(reginfo->strbeg)),
7375                                                (U8*) reginfo->strend),
7376                                          getGCB_VAL_UTF8((U8*) locinput,
7377                                                        (U8*) reginfo->strend),
7378                                          (U8*) reginfo->strbeg,
7379                                          (U8*) locinput,
7380                                          utf8_target);
7381                        }
7382                        break;
7383
7384                    case LB_BOUND:
7385                        if (locinput == reginfo->strbeg) {
7386                            match = FALSE;
7387                        }
7388                        else if (NEXTCHR_IS_EOS) {
7389                            match = TRUE;
7390                        }
7391                        else {
7392                            match = isLB(getLB_VAL_UTF8(
7393                                                reghop3((U8*)locinput,
7394                                                        -1,
7395                                                        (U8*)(reginfo->strbeg)),
7396                                                (U8*) reginfo->strend),
7397                                          getLB_VAL_UTF8((U8*) locinput,
7398                                                        (U8*) reginfo->strend),
7399                                          (U8*) reginfo->strbeg,
7400                                          (U8*) locinput,
7401                                          (U8*) reginfo->strend,
7402                                          utf8_target);
7403                        }
7404                        break;
7405
7406                    case SB_BOUND: /* Always matches at begin and end */
7407                        if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7408                            match = TRUE;
7409                        }
7410                        else {
7411                            match = isSB(getSB_VAL_UTF8(
7412                                                reghop3((U8*)locinput,
7413                                                        -1,
7414                                                        (U8*)(reginfo->strbeg)),
7415                                                (U8*) reginfo->strend),
7416                                          getSB_VAL_UTF8((U8*) locinput,
7417                                                        (U8*) reginfo->strend),
7418                                          (U8*) reginfo->strbeg,
7419                                          (U8*) locinput,
7420                                          (U8*) reginfo->strend,
7421                                          utf8_target);
7422                        }
7423                        break;
7424
7425                    case WB_BOUND:
7426                        if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7427                            match = TRUE;
7428                        }
7429                        else {
7430                            match = isWB(WB_UNKNOWN,
7431                                         getWB_VAL_UTF8(
7432                                                reghop3((U8*)locinput,
7433                                                        -1,
7434                                                        (U8*)(reginfo->strbeg)),
7435                                                (U8*) reginfo->strend),
7436                                          getWB_VAL_UTF8((U8*) locinput,
7437                                                        (U8*) reginfo->strend),
7438                                          (U8*) reginfo->strbeg,
7439                                          (U8*) locinput,
7440                                          (U8*) reginfo->strend,
7441                                          utf8_target);
7442                        }
7443                        break;
7444                }
7445            }
7446            else {  /* Not utf8 target */
7447                switch((bound_type) FLAGS(scan)) {
7448                    case TRADITIONAL_BOUND:
7449                    {
7450                        bool b1, b2;
7451                        b1 = (locinput == reginfo->strbeg)
7452                            ? 0 /* isWORDCHAR_L1('\n') */
7453                            : isWORDCHAR_L1(UCHARAT(locinput - 1));
7454                        b2 = (NEXTCHR_IS_EOS)
7455                            ? 0 /* isWORDCHAR_L1('\n') */
7456                            : isWORDCHAR_L1(nextbyte);
7457                        match = cBOOL(b1 != b2);
7458                        break;
7459                    }
7460
7461                    case GCB_BOUND:
7462                        if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7463                            match = TRUE; /* GCB always matches at begin and
7464                                             end */
7465                        }
7466                        else {  /* Only CR-LF combo isn't a GCB in 0-255
7467                                   range */
7468                            match =    UCHARAT(locinput - 1) != '\r'
7469                                    || UCHARAT(locinput) != '\n';
7470                        }
7471                        break;
7472
7473                    case LB_BOUND:
7474                        if (locinput == reginfo->strbeg) {
7475                            match = FALSE;
7476                        }
7477                        else if (NEXTCHR_IS_EOS) {
7478                            match = TRUE;
7479                        }
7480                        else {
7481                            match = isLB(getLB_VAL_CP(UCHARAT(locinput -1)),
7482                                         getLB_VAL_CP(UCHARAT(locinput)),
7483                                         (U8*) reginfo->strbeg,
7484                                         (U8*) locinput,
7485                                         (U8*) reginfo->strend,
7486                                         utf8_target);
7487                        }
7488                        break;
7489
7490                    case SB_BOUND: /* Always matches at begin and end */
7491                        if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7492                            match = TRUE;
7493                        }
7494                        else {
7495                            match = isSB(getSB_VAL_CP(UCHARAT(locinput -1)),
7496                                         getSB_VAL_CP(UCHARAT(locinput)),
7497                                         (U8*) reginfo->strbeg,
7498                                         (U8*) locinput,
7499                                         (U8*) reginfo->strend,
7500                                         utf8_target);
7501                        }
7502                        break;
7503
7504                    case WB_BOUND:
7505                        if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7506                            match = TRUE;
7507                        }
7508                        else {
7509                            match = isWB(WB_UNKNOWN,
7510                                         getWB_VAL_CP(UCHARAT(locinput -1)),
7511                                         getWB_VAL_CP(UCHARAT(locinput)),
7512                                         (U8*) reginfo->strbeg,
7513                                         (U8*) locinput,
7514                                         (U8*) reginfo->strend,
7515                                         utf8_target);
7516                        }
7517                        break;
7518                }
7519            }
7520
7521            if (to_complement ^ ! match) {
7522                sayNO;
7523            }
7524            break;
7525
7526        case ANYOFPOSIXL:
7527        case ANYOFL:  /*  /[abc]/l      */
7528            CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
7529            CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(scan);
7530
7531            /* FALLTHROUGH */
7532        case ANYOFD:  /*   /[abc]/d       */
7533        case ANYOF:  /*   /[abc]/       */
7534            if (NEXTCHR_IS_EOS || locinput >= loceol)
7535                sayNO;
7536            if (  (! utf8_target || UTF8_IS_INVARIANT(*locinput))
7537                && ! ANYOF_FLAGS(scan)
7538                && ANYOF_MATCHES_NONE_OUTSIDE_BITMAP(scan))
7539            {
7540                if (! ANYOF_BITMAP_TEST(scan, * (U8 *) (locinput))) {
7541                    sayNO;
7542                }
7543                locinput++;
7544            }
7545            else {
7546                if (!reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
7547                                                                   utf8_target))
7548                {
7549                    sayNO;
7550                }
7551                goto increment_locinput;
7552            }
7553            break;
7554
7555        case ANYOFM:
7556            if (   NEXTCHR_IS_EOS
7557                || (UCHARAT(locinput) & FLAGS(scan)) != ARG1u(scan)
7558                || locinput >= loceol)
7559            {
7560                sayNO;
7561            }
7562            locinput++; /* ANYOFM is always single byte */
7563            break;
7564
7565        case NANYOFM:
7566            if (   NEXTCHR_IS_EOS
7567                || (UCHARAT(locinput) & FLAGS(scan)) == ARG1u(scan)
7568                || locinput >= loceol)
7569            {
7570                sayNO;
7571            }
7572            goto increment_locinput;
7573            break;
7574
7575        case ANYOFH:
7576            if (   ! utf8_target
7577                ||   NEXTCHR_IS_EOS
7578                ||   ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8(*locinput)
7579                || ! (anyofh_list = GET_ANYOFH_INVLIST(rex, scan))
7580                || ! _invlist_contains_cp(anyofh_list,
7581                                          utf8_to_uvchr_buf((U8 *) locinput,
7582                                                            (U8 *) loceol,
7583                                                            NULL)))
7584            {
7585                sayNO;
7586            }
7587            goto increment_locinput;
7588            break;
7589
7590        case ANYOFHb:
7591            if (   ! utf8_target
7592                ||   NEXTCHR_IS_EOS
7593                ||   ANYOF_FLAGS(scan) != (U8) *locinput
7594                || ! (anyofh_list = GET_ANYOFH_INVLIST(rex, scan))
7595                || ! _invlist_contains_cp(anyofh_list,
7596                                          utf8_to_uvchr_buf((U8 *) locinput,
7597                                                            (U8 *) loceol,
7598                                                            NULL)))
7599            {
7600                sayNO;
7601            }
7602            goto increment_locinput;
7603            break;
7604
7605        case ANYOFHbbm:
7606            if (   ! utf8_target
7607                ||   NEXTCHR_IS_EOS
7608                ||   ANYOF_FLAGS(scan) != (U8) locinput[0]
7609                ||   locinput >= reginfo->strend
7610                || ! BITMAP_TEST(( (struct regnode_bbm *) scan)->bitmap,
7611                                   (U8) locinput[1] & UTF_CONTINUATION_MASK))
7612            {
7613                sayNO;
7614            }
7615            goto increment_locinput;
7616            break;
7617
7618        case ANYOFHr:
7619            if (   ! utf8_target
7620                ||   NEXTCHR_IS_EOS
7621                || ! inRANGE((U8) NATIVE_UTF8_TO_I8(*locinput),
7622                             LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan)),
7623                             HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan)))
7624                || ! (anyofh_list = GET_ANYOFH_INVLIST(rex, scan))
7625                || ! _invlist_contains_cp(anyofh_list,
7626                                          utf8_to_uvchr_buf((U8 *) locinput,
7627                                                            (U8 *) loceol,
7628                                                            NULL)))
7629            {
7630                sayNO;
7631            }
7632            goto increment_locinput;
7633            break;
7634
7635        case ANYOFHs:
7636            if (   ! utf8_target
7637                ||   NEXTCHR_IS_EOS
7638                ||   loceol - locinput < FLAGS(scan)
7639                ||   memNE(locinput, ((struct regnode_anyofhs *) scan)->string, FLAGS(scan))
7640                || ! (anyofh_list = GET_ANYOFH_INVLIST(rex, scan))
7641                || ! _invlist_contains_cp(anyofh_list,
7642                                          utf8_to_uvchr_buf((U8 *) locinput,
7643                                                            (U8 *) loceol,
7644                                                            NULL)))
7645            {
7646                sayNO;
7647            }
7648            goto increment_locinput;
7649            break;
7650
7651        case ANYOFR:
7652            if (NEXTCHR_IS_EOS) {
7653                sayNO;
7654            }
7655
7656            if (utf8_target) {
7657                if (    ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8(*locinput)
7658                   || ! withinCOUNT(utf8_to_uvchr_buf((U8 *) locinput,
7659                                                (U8 *) reginfo->strend,
7660                                                NULL),
7661                                    ANYOFRbase(scan), ANYOFRdelta(scan)))
7662                {
7663                    sayNO;
7664                }
7665            }
7666            else {
7667                if (! withinCOUNT((U8) *locinput,
7668                                  ANYOFRbase(scan), ANYOFRdelta(scan)))
7669                {
7670                    sayNO;
7671                }
7672            }
7673            goto increment_locinput;
7674            break;
7675
7676        case ANYOFRb:
7677            if (NEXTCHR_IS_EOS) {
7678                sayNO;
7679            }
7680
7681            if (utf8_target) {
7682                if (     ANYOF_FLAGS(scan) != (U8) *locinput
7683                    || ! withinCOUNT(utf8_to_uvchr_buf((U8 *) locinput,
7684                                                (U8 *) reginfo->strend,
7685                                                NULL),
7686                                     ANYOFRbase(scan), ANYOFRdelta(scan)))
7687                {
7688                    sayNO;
7689                }
7690            }
7691            else {
7692                if (! withinCOUNT((U8) *locinput,
7693                                  ANYOFRbase(scan), ANYOFRdelta(scan)))
7694                {
7695                    sayNO;
7696                }
7697            }
7698            goto increment_locinput;
7699            break;
7700
7701        /* The argument (FLAGS) to all the POSIX node types is the class number
7702         * */
7703
7704        case NPOSIXL:   /* \W or [:^punct:] etc. under /l */
7705            to_complement = 1;
7706            /* FALLTHROUGH */
7707
7708        case POSIXL:    /* \w or [:punct:] etc. under /l */
7709            CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
7710            if (NEXTCHR_IS_EOS || locinput >= loceol)
7711                sayNO;
7712
7713            /* Use isFOO_lc() for characters within Latin1.  (Note that
7714             * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
7715             * wouldn't be invariant) */
7716            if (UTF8_IS_INVARIANT(nextbyte) || ! utf8_target) {
7717                if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextbyte)))) {
7718                    sayNO;
7719                }
7720
7721                locinput++;
7722                break;
7723            }
7724
7725            if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
7726                /* An above Latin-1 code point, or malformed */
7727                _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
7728                                                       reginfo->strend);
7729                goto utf8_posix_above_latin1;
7730            }
7731
7732            /* Here is a UTF-8 variant code point below 256 and the target is
7733             * UTF-8 */
7734            if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
7735                                            EIGHT_BIT_UTF8_TO_NATIVE(nextbyte,
7736                                            *(locinput + 1))))))
7737            {
7738                sayNO;
7739            }
7740
7741            goto increment_locinput;
7742
7743        case NPOSIXD:   /* \W or [:^punct:] etc. under /d */
7744            to_complement = 1;
7745            /* FALLTHROUGH */
7746
7747        case POSIXD:    /* \w or [:punct:] etc. under /d */
7748            if (utf8_target) {
7749                goto utf8_posix;
7750            }
7751            goto posixa;
7752
7753        case NPOSIXA:   /* \W or [:^punct:] etc. under /a */
7754
7755            if (NEXTCHR_IS_EOS || locinput >= loceol) {
7756                sayNO;
7757            }
7758
7759            /* All UTF-8 variants match */
7760            if (! UTF8_IS_INVARIANT(nextbyte)) {
7761                goto increment_locinput;
7762            }
7763
7764            to_complement = 1;
7765            goto join_nposixa;
7766
7767        case POSIXA:    /* \w or [:punct:] etc. under /a */
7768
7769          posixa:
7770            /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
7771             * UTF-8, and also from NPOSIXA even in UTF-8 when the current
7772             * character is a single byte */
7773
7774            if (NEXTCHR_IS_EOS || locinput >= loceol) {
7775                sayNO;
7776            }
7777
7778          join_nposixa:
7779
7780            if (! (to_complement ^ cBOOL(generic_isCC_A_(nextbyte,
7781                                                                FLAGS(scan)))))
7782            {
7783                sayNO;
7784            }
7785
7786            /* Here we are either not in utf8, or we matched a utf8-invariant,
7787             * so the next char is the next byte */
7788            locinput++;
7789            break;
7790
7791        case NPOSIXU:   /* \W or [:^punct:] etc. under /u */
7792            to_complement = 1;
7793            /* FALLTHROUGH */
7794
7795        case POSIXU:    /* \w or [:punct:] etc. under /u */
7796          utf8_posix:
7797            if (NEXTCHR_IS_EOS || locinput >= loceol) {
7798                sayNO;
7799            }
7800
7801            /* Use generic_isCC_() for characters within Latin1.  (Note that
7802             * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
7803             * wouldn't be invariant) */
7804            if (UTF8_IS_INVARIANT(nextbyte) || ! utf8_target) {
7805                if (! (to_complement ^ cBOOL(generic_isCC_(nextbyte,
7806                                                           FLAGS(scan)))))
7807                {
7808                    sayNO;
7809                }
7810                locinput++;
7811            }
7812            else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
7813                if (! (to_complement
7814                       ^ cBOOL(generic_isCC_(EIGHT_BIT_UTF8_TO_NATIVE(nextbyte,
7815                                                               *(locinput + 1)),
7816                                             FLAGS(scan)))))
7817                {
7818                    sayNO;
7819                }
7820                locinput += 2;
7821            }
7822            else {  /* Handle above Latin-1 code points */
7823              utf8_posix_above_latin1:
7824                classnum = (char_class_number_) FLAGS(scan);
7825                switch (classnum) {
7826                    default:
7827                        if (! (to_complement
7828                           ^ cBOOL(_invlist_contains_cp(
7829                                      PL_XPosix_ptrs[classnum],
7830                                      utf8_to_uvchr_buf((U8 *) locinput,
7831                                                        (U8 *) reginfo->strend,
7832                                                        NULL)))))
7833                        {
7834                            sayNO;
7835                        }
7836                        break;
7837                    case CC_ENUM_SPACE_:
7838                        if (! (to_complement
7839                                    ^ cBOOL(is_XPERLSPACE_high(locinput))))
7840                        {
7841                            sayNO;
7842                        }
7843                        break;
7844                    case CC_ENUM_BLANK_:
7845                        if (! (to_complement
7846                                        ^ cBOOL(is_HORIZWS_high(locinput))))
7847                        {
7848                            sayNO;
7849                        }
7850                        break;
7851                    case CC_ENUM_XDIGIT_:
7852                        if (! (to_complement
7853                                        ^ cBOOL(is_XDIGIT_high(locinput))))
7854                        {
7855                            sayNO;
7856                        }
7857                        break;
7858                    case CC_ENUM_VERTSPACE_:
7859                        if (! (to_complement
7860                                        ^ cBOOL(is_VERTWS_high(locinput))))
7861                        {
7862                            sayNO;
7863                        }
7864                        break;
7865                    case CC_ENUM_CNTRL_:    /* These can't match above Latin1 */
7866                    case CC_ENUM_ASCII_:
7867                        if (! to_complement) {
7868                            sayNO;
7869                        }
7870                        break;
7871                }
7872                locinput += UTF8_SAFE_SKIP(locinput, reginfo->strend);
7873            }
7874            break;
7875
7876        case CLUMP: /* Match \X: logical Unicode character.  This is defined as
7877                       a Unicode extended Grapheme Cluster */
7878            if (NEXTCHR_IS_EOS || locinput >= loceol)
7879                sayNO;
7880            if  (! utf8_target) {
7881
7882                /* Match either CR LF  or '.', as all the other possibilities
7883                 * require utf8 */
7884                locinput++;	    /* Match the . or CR */
7885                if (nextbyte == '\r' /* And if it was CR, and the next is LF,
7886                                       match the LF */
7887                    && locinput <  loceol
7888                    && UCHARAT(locinput) == '\n')
7889                {
7890                    locinput++;
7891                }
7892            }
7893            else {
7894
7895                /* Get the gcb type for the current character */
7896                GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput,
7897                                                       (U8*) reginfo->strend);
7898
7899                /* Then scan through the input until we get to the first
7900                 * character whose type is supposed to be a gcb with the
7901                 * current character.  (There is always a break at the
7902                 * end-of-input) */
7903                locinput += UTF8SKIP(locinput);
7904                while (locinput < loceol) {
7905                    GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
7906                                                         (U8*) reginfo->strend);
7907                    if (isGCB(prev_gcb, cur_gcb,
7908                              (U8*) reginfo->strbeg, (U8*) locinput,
7909                              utf8_target))
7910                    {
7911                        break;
7912                    }
7913
7914                    prev_gcb = cur_gcb;
7915                    locinput += UTF8SKIP(locinput);
7916                }
7917
7918
7919            }
7920            break;
7921
7922        case REFFLN:  /*  /\g{name}/il  */
7923        {   /* The capture buffer cases.  The ones beginning with N for the
7924               named buffers just convert to the equivalent numbered and
7925               pretend they were called as the corresponding numbered buffer
7926               op.  */
7927            /* don't initialize these in the declaration, it makes C++
7928               unhappy */
7929            const char *s;
7930            char type;
7931            re_fold_t folder;
7932            const U8 *fold_array;
7933            UV utf8_fold_flags;
7934
7935            CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
7936            folder = Perl_foldEQ_locale;
7937            fold_array = PL_fold_locale;
7938            type = REFFL;
7939            utf8_fold_flags = FOLDEQ_LOCALE;
7940            goto do_nref;
7941
7942        case REFFAN:  /*  /\g{name}/iaa  */
7943            folder = Perl_foldEQ_latin1;
7944            fold_array = PL_fold_latin1;
7945            type = REFFA;
7946            utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7947            goto do_nref;
7948
7949        case REFFUN:  /*  /\g{name}/iu  */
7950            folder = Perl_foldEQ_latin1;
7951            fold_array = PL_fold_latin1;
7952            type = REFFU;
7953            utf8_fold_flags = 0;
7954            goto do_nref;
7955
7956        case REFFN:  /*  /\g{name}/i  */
7957            folder = Perl_foldEQ;
7958            fold_array = PL_fold;
7959            type = REFF;
7960            utf8_fold_flags = 0;
7961            goto do_nref;
7962
7963        case REFN:  /*  /\g{name}/   */
7964            type = REF;
7965            folder = NULL;
7966            fold_array = NULL;
7967            utf8_fold_flags = 0;
7968          do_nref:
7969
7970            /* For the named back references, find the corresponding buffer
7971             * number */
7972            n = reg_check_named_buff_matched(rex,scan);
7973
7974            if ( ! n ) {
7975                sayNO;
7976            }
7977            goto do_nref_ref_common;
7978
7979        case REFFL:  /*  /\1/il  */
7980            CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
7981            folder = Perl_foldEQ_locale;
7982            fold_array = PL_fold_locale;
7983            utf8_fold_flags = FOLDEQ_LOCALE;
7984            goto do_ref;
7985
7986        case REFFA:  /*  /\1/iaa  */
7987            folder = Perl_foldEQ_latin1;
7988            fold_array = PL_fold_latin1;
7989            utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7990            goto do_ref;
7991
7992        case REFFU:  /*  /\1/iu  */
7993            folder = Perl_foldEQ_latin1;
7994            fold_array = PL_fold_latin1;
7995            utf8_fold_flags = 0;
7996            goto do_ref;
7997
7998        case REFF:  /*  /\1/i  */
7999            folder = Perl_foldEQ;
8000            fold_array = PL_fold;
8001            utf8_fold_flags = 0;
8002            goto do_ref;
8003
8004#undef  ST
8005#define ST st->u.backref
8006        case REF:  /*  /\1/    */
8007            folder = NULL;
8008            fold_array = NULL;
8009            utf8_fold_flags = 0;
8010
8011          do_ref:
8012            type = OP(scan);
8013            n = ARG1u(scan);  /* which paren pair */
8014            if (rex->logical_to_parno) {
8015                n = rex->logical_to_parno[n];
8016                do {
8017                    if ( RXp_LASTPAREN(rex) < n ||
8018                         RXp_OFFS_START(rex,n) == -1 ||
8019                         RXp_OFFS_END(rex,n) == -1
8020                    ) {
8021                        n = rex->parno_to_logical_next[n];
8022                    }
8023                    else {
8024                        break;
8025                    }
8026                } while(n);
8027
8028                if (!n) /* this means there is nothing that matched */
8029                    sayNO;
8030            }
8031
8032          do_nref_ref_common:
8033            reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
8034            if (RXp_LASTPAREN(rex) < n)
8035                sayNO;
8036
8037            ln = RXp_OFFSp(rex)[n].start;
8038            endref = RXp_OFFSp(rex)[n].end;
8039            if (ln == -1 || endref == -1)
8040                sayNO;			/* Do not match unless seen CLOSEn. */
8041
8042            if (ln == endref)
8043                goto ref_yes;
8044
8045            s = reginfo->strbeg + ln;
8046            if (type != REF	/* REF can do byte comparison */
8047                && (utf8_target || type == REFFU || type == REFFL))
8048            {
8049                char * limit = loceol;
8050
8051                /* This call case insensitively compares the entire buffer
8052                    * at s, with the current input starting at locinput, but
8053                    * not going off the end given by loceol, and
8054                    * returns in <limit> upon success, how much of the
8055                    * current input was matched */
8056                if (! foldEQ_utf8_flags(s, NULL, endref - ln, utf8_target,
8057                                    locinput, &limit, 0, utf8_target, utf8_fold_flags))
8058                {
8059                    sayNO;
8060                }
8061                locinput = limit;
8062                goto ref_yes;
8063            }
8064
8065            /* Not utf8:  Inline the first character, for speed. */
8066            if ( ! NEXTCHR_IS_EOS
8067                && locinput < loceol
8068                && UCHARAT(s) != nextbyte
8069                && (   type == REF
8070                    || UCHARAT(s) != fold_array[nextbyte]))
8071            {
8072                sayNO;
8073            }
8074            ln = endref - ln;
8075            if (locinput + ln > loceol)
8076                sayNO;
8077            if (ln > 1 && (type == REF
8078                           ? memNE(s, locinput, ln)
8079                           : ! folder(aTHX_ locinput, s, ln)))
8080                sayNO;
8081            locinput += ln;
8082        }
8083        ref_yes:
8084            if (FLAGS(scan)) { /* == VOLATILE_REF but only other value is 0 */
8085                ST.cp = regcppush(rex, ARG2u(scan) - 1, maxopenparen);
8086                REGCP_SET(ST.lastcp);
8087                PUSH_STATE_GOTO(REF_next, next, locinput, loceol,
8088                                script_run_begin);
8089            }
8090            break;
8091            NOT_REACHED; /* NOTREACHED */
8092
8093        case REF_next:
8094            sayYES;
8095            break;
8096
8097        case REF_next_fail:
8098            REGCP_UNWIND(ST.lastcp);
8099            regcppop(rex, &maxopenparen);
8100            sayNO;
8101            break;
8102
8103        case NOTHING: /* null op; e.g. the 'nothing' following
8104                       * the '*' in m{(a+|b)*}' */
8105            break;
8106        case TAIL: /* placeholder while compiling (A|B|C) */
8107            break;
8108
8109#undef  ST
8110#define ST st->u.eval
8111#define CUR_EVAL cur_eval->u.eval
8112
8113        {
8114            SV *ret;
8115            REGEXP *re_sv;
8116            regexp *re;
8117            regexp_internal *rei;
8118            regnode *startpoint;
8119            U32 arg;
8120
8121        case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
8122            arg = ARG1u(scan);
8123            if (cur_eval && cur_eval->locinput == locinput) {
8124                if ( ++nochange_depth > max_nochange_depth )
8125                    Perl_croak(aTHX_
8126                        "Pattern subroutine nesting without pos change"
8127                        " exceeded limit in regex");
8128            } else {
8129                nochange_depth = 0;
8130            }
8131            re_sv = rex_sv;
8132            re = rex;
8133            rei = rexi;
8134            startpoint = scan + ARG2i(scan);
8135            EVAL_CLOSE_PAREN_SET( st, arg );
8136            /* Detect infinite recursion
8137             *
8138             * A pattern like /(?R)foo/ or /(?<x>(?&y)foo)(?<y>(?&x)bar)/
8139             * or "a"=~/(.(?2))((?<=(?=(?1)).))/ could recurse forever.
8140             * So we track the position in the string we are at each time
8141             * we recurse and if we try to enter the same routine twice from
8142             * the same position we throw an error.
8143             */
8144            if ( rex->recurse_locinput[arg] == locinput ) {
8145                /* FIXME: we should show the regop that is failing as part
8146                 * of the error message. */
8147                Perl_croak(aTHX_ "Infinite recursion in regex");
8148            } else {
8149                ST.prev_recurse_locinput= rex->recurse_locinput[arg];
8150                rex->recurse_locinput[arg]= locinput;
8151
8152                DEBUG_r({
8153                    DECLARE_AND_GET_RE_DEBUG_FLAGS;
8154                    DEBUG_STACK_r({
8155                        Perl_re_exec_indentf( aTHX_
8156                            "entering GOSUB, prev_recurse_locinput=%p recurse_locinput[%d]=%p\n",
8157                            depth, ST.prev_recurse_locinput, arg, rex->recurse_locinput[arg]
8158                        );
8159                    });
8160                });
8161            }
8162
8163            /* Save all the positions seen so far. */
8164            ST.cp = regcppush(rex, 0, maxopenparen);
8165            REGCP_SET(ST.lastcp);
8166
8167            /* and then jump to the code we share with EVAL */
8168            goto eval_recurse_doit;
8169            /* NOTREACHED */
8170
8171        case EVAL:  /*   /(?{...})B/   /(??{A})B/  and  /(?(?{...})X|Y)B/   */
8172            if (logical == 2 && cur_eval && cur_eval->locinput==locinput) {
8173                if ( ++nochange_depth > max_nochange_depth )
8174                    Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
8175            } else {
8176                nochange_depth = 0;
8177            }
8178            {
8179                /* execute the code in the {...} */
8180
8181                dSP;
8182                IV before;
8183                OP * const oop = PL_op;
8184                COP * const ocurcop = PL_curcop;
8185                OP *nop;
8186                CV *newcv;
8187
8188                /* save *all* paren positions */
8189                ST.cp = regcppush(rex, 0, maxopenparen);
8190                REGCP_SET(ST.lastcp);
8191
8192                if (!caller_cv)
8193                    caller_cv = find_runcv(NULL);
8194
8195                n = ARG1u(scan);
8196
8197                if (rexi->data->what[n] == 'r') { /* code from an external qr */
8198                    newcv = (ReANY(
8199                                    (REGEXP*)(rexi->data->data[n])
8200                            ))->qr_anoncv;
8201                    nop = (OP*)rexi->data->data[n+1];
8202                }
8203                else if (rexi->data->what[n] == 'l') { /* literal code */
8204                    newcv = caller_cv;
8205                    nop = (OP*)rexi->data->data[n];
8206                    assert(CvDEPTH(newcv));
8207                }
8208                else {
8209                    /* literal with own CV */
8210                    assert(rexi->data->what[n] == 'L');
8211                    newcv = rex->qr_anoncv;
8212                    nop = (OP*)rexi->data->data[n];
8213                }
8214
8215                /* Some notes about MULTICALL and the context and save stacks.
8216                 *
8217                 * In something like
8218                 *   /...(?{ my $x)}...(?{ my $y)}...(?{ my $z)}.../
8219                 * since codeblocks don't introduce a new scope (so that
8220                 * local() etc accumulate), at the end of a successful
8221                 * match there will be a SAVEt_CLEARSV on the savestack
8222                 * for each of $x, $y, $z. If the three code blocks above
8223                 * happen to have come from different CVs (e.g. via
8224                 * embedded qr//s), then we must ensure that during any
8225                 * savestack unwinding, PL_comppad always points to the
8226                 * right pad at each moment. We achieve this by
8227                 * interleaving SAVEt_COMPPAD's on the savestack whenever
8228                 * there is a change of pad.
8229                 * In theory whenever we call a code block, we should
8230                 * push a CXt_SUB context, then pop it on return from
8231                 * that code block. This causes a bit of an issue in that
8232                 * normally popping a context also clears the savestack
8233                 * back to cx->blk_oldsaveix, but here we specifically
8234                 * don't want to clear the save stack on exit from the
8235                 * code block.
8236                 * Also for efficiency we don't want to keep pushing and
8237                 * popping the single SUB context as we backtrack etc.
8238                 * So instead, we push a single context the first time
8239                 * we need, it, then hang onto it until the end of this
8240                 * function. Whenever we encounter a new code block, we
8241                 * update the CV etc if that's changed. During the times
8242                 * in this function where we're not executing a code
8243                 * block, having the SUB context still there is a bit
8244                 * naughty - but we hope that no-one notices.
8245                 * When the SUB context is initially pushed, we fake up
8246                 * cx->blk_oldsaveix to be as if we'd pushed this context
8247                 * on first entry to S_regmatch rather than at some random
8248                 * point during the regexe execution. That way if we
8249                 * croak, popping the context stack will ensure that
8250                 * *everything* SAVEd by this function is undone and then
8251                 * the context popped, rather than e.g., popping the
8252                 * context (and restoring the original PL_comppad) then
8253                 * popping more of the savestack and restoring a bad
8254                 * PL_comppad.
8255                 */
8256
8257                /* If this is the first EVAL, push a MULTICALL. On
8258                 * subsequent calls, if we're executing a different CV, or
8259                 * if PL_comppad has got messed up from backtracking
8260                 * through SAVECOMPPADs, then refresh the context.
8261                 */
8262                if (newcv != last_pushed_cv || PL_comppad != last_pad)
8263                {
8264                    U8 flags = (CXp_SUB_RE |
8265                                ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
8266                    SAVECOMPPAD();
8267                    if (last_pushed_cv) {
8268                        CHANGE_MULTICALL_FLAGS(newcv, flags);
8269                    }
8270                    else {
8271                        PUSH_MULTICALL_FLAGS(newcv, flags);
8272                    }
8273                    /* see notes above */
8274                    CX_CUR()->blk_oldsaveix = orig_savestack_ix;
8275
8276                    last_pushed_cv = newcv;
8277                }
8278                else {
8279                    /* these assignments are just to silence compiler
8280                     * warnings */
8281                    multicall_cop = NULL;
8282                }
8283                last_pad = PL_comppad;
8284
8285                /* the initial nextstate you would normally execute
8286                 * at the start of an eval (which would cause error
8287                 * messages to come from the eval), may be optimised
8288                 * away from the execution path in the regex code blocks;
8289                 * so manually set PL_curcop to it initially */
8290                {
8291                    OP *o = cUNOPx(nop)->op_first;
8292                    assert(o->op_type == OP_NULL);
8293                    if (o->op_targ == OP_SCOPE) {
8294                        o = cUNOPo->op_first;
8295                    }
8296                    else {
8297                        assert(o->op_targ == OP_LEAVE);
8298                        o = cUNOPo->op_first;
8299                        assert(o->op_type == OP_ENTER);
8300                        o = OpSIBLING(o);
8301                    }
8302
8303                    if (o->op_type != OP_STUB) {
8304                        assert(    o->op_type == OP_NEXTSTATE
8305                                || o->op_type == OP_DBSTATE
8306                                || (o->op_type == OP_NULL
8307                                    &&  (  o->op_targ == OP_NEXTSTATE
8308                                        || o->op_targ == OP_DBSTATE
8309                                        )
8310                                    )
8311                        );
8312                        PL_curcop = (COP*)o;
8313                    }
8314                }
8315                nop = nop->op_next;
8316
8317                DEBUG_STATE_r( Perl_re_printf( aTHX_
8318                    "  re EVAL PL_op=0x%" UVxf "\n", PTR2UV(nop)) );
8319
8320                RXp_OFFSp(rex)[0].end = locinput - reginfo->strbeg;
8321                if (reginfo->info_aux_eval->pos_magic)
8322                    MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
8323                                  reginfo->sv, reginfo->strbeg,
8324                                  locinput - reginfo->strbeg);
8325
8326                if (sv_yes_mark) {
8327                    SV *sv_mrk = get_sv("REGMARK", 1);
8328                    sv_setsv(sv_mrk, sv_yes_mark);
8329                }
8330
8331                /* we don't use MULTICALL here as we want to call the
8332                 * first op of the block of interest, rather than the
8333                 * first op of the sub. Also, we don't want to free
8334                 * the savestack frame */
8335                before = (IV)(SP-PL_stack_base);
8336                PL_op = nop;
8337                CALLRUNOPS(aTHX);			/* Scalar context. */
8338                SPAGAIN;
8339                if ((IV)(SP-PL_stack_base) == before)
8340                    ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
8341                else {
8342                    ret = POPs;
8343                    PUTBACK;
8344                }
8345
8346                /* before restoring everything, evaluate the returned
8347                 * value, so that 'uninit' warnings don't use the wrong
8348                 * PL_op or pad. Also need to process any magic vars
8349                 * (e.g. $1) *before* parentheses are restored */
8350
8351                PL_op = NULL;
8352
8353                re_sv = NULL;
8354                if (logical == 0) {       /* /(?{ ... })/ and /(*{ ... })/ */
8355                    SV *replsv = save_scalar(PL_replgv);
8356                    sv_setsv(replsv, ret); /* $^R */
8357                    SvSETMAGIC(replsv);
8358                }
8359                else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
8360                    sw = cBOOL(SvTRUE_NN(ret));
8361                    logical = 0;
8362                }
8363                else {                   /*  /(??{ ... })  */
8364                    /*  if its overloaded, let the regex compiler handle
8365                     *  it; otherwise extract regex, or stringify  */
8366                    if (SvGMAGICAL(ret))
8367                        ret = sv_mortalcopy(ret);
8368                    if (!SvAMAGIC(ret)) {
8369                        SV *sv = ret;
8370                        if (SvROK(sv))
8371                            sv = SvRV(sv);
8372                        if (SvTYPE(sv) == SVt_REGEXP)
8373                            re_sv = (REGEXP*) sv;
8374                        else if (SvSMAGICAL(ret)) {
8375                            MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
8376                            if (mg)
8377                                re_sv = (REGEXP *) mg->mg_obj;
8378                        }
8379
8380                        /* force any undef warnings here */
8381                        if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
8382                            ret = sv_mortalcopy(ret);
8383                            (void) SvPV_force_nolen(ret);
8384                        }
8385                    }
8386
8387                }
8388
8389                /* *** Note that at this point we don't restore
8390                 * PL_comppad, (or pop the CxSUB) on the assumption it may
8391                 * be used again soon. This is safe as long as nothing
8392                 * in the regexp code uses the pad ! */
8393                PL_op = oop;
8394                PL_curcop = ocurcop;
8395                regcp_restore(rex, ST.lastcp, &maxopenparen);
8396                PL_curpm_under = PL_curpm;
8397                PL_curpm = PL_reg_curpm;
8398
8399                if (logical != 2) {
8400                    PUSH_STATE_GOTO(EVAL_B, next, locinput, loceol,
8401                                    script_run_begin);
8402                    /* NOTREACHED */
8403                }
8404            }
8405
8406                /* only /(??{ ... })/  from now on */
8407                logical = 0;
8408                {
8409                    /* extract RE object from returned value; compiling if
8410                     * necessary */
8411
8412                    if (re_sv) {
8413                        re_sv = reg_temp_copy(NULL, re_sv);
8414                    }
8415                    else {
8416                        U32 pm_flags = 0;
8417
8418                        if (SvUTF8(ret) && IN_BYTES) {
8419                            /* In use 'bytes': make a copy of the octet
8420                             * sequence, but without the flag on */
8421                            STRLEN len;
8422                            const char *const p = SvPV(ret, len);
8423                            ret = newSVpvn_flags(p, len, SVs_TEMP);
8424                        }
8425                        if (rex->intflags & PREGf_USE_RE_EVAL)
8426                            pm_flags |= PMf_USE_RE_EVAL;
8427
8428                        /* if we got here, it should be an engine which
8429                         * supports compiling code blocks and stuff */
8430                        assert(rex->engine && rex->engine->op_comp);
8431                        assert(!(FLAGS(scan) & ~RXf_PMf_COMPILETIME));
8432                        re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
8433                                    rex->engine, NULL, NULL,
8434                                    /* copy /msixn etc to inner pattern */
8435                                    ARG2i(scan),
8436                                    pm_flags);
8437
8438                        if (!(SvFLAGS(ret)
8439                              & (SVs_TEMP | SVs_GMG | SVf_ROK))
8440                         && (!SvPADTMP(ret) || SvREADONLY(ret))) {
8441                            /* This isn't a first class regexp. Instead, it's
8442                               caching a regexp onto an existing, Perl visible
8443                               scalar.  */
8444                            sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
8445                        }
8446                    }
8447                    SAVEFREESV(re_sv);
8448                    re = ReANY(re_sv);
8449                }
8450                RXp_MATCH_COPIED_off(re);
8451                RXp_SUBBEG(re) = RXp_SUBBEG(rex);
8452                RXp_SUBLEN(re) = RXp_SUBLEN(rex);
8453                RXp_SUBOFFSET(re) = RXp_SUBOFFSET(rex);
8454                RXp_SUBCOFFSET(re) = RXp_SUBCOFFSET(rex);
8455                RXp_LASTPAREN(re) = 0;
8456                RXp_LASTCLOSEPAREN(re) = 0;
8457                rei = RXi_GET(re);
8458                DEBUG_EXECUTE_r(
8459                    debug_start_match(re_sv, utf8_target, locinput,
8460                                    reginfo->strend, "EVAL/GOSUB: Matching embedded");
8461                );
8462                startpoint = rei->program + 1;
8463                EVAL_CLOSE_PAREN_CLEAR(st); /* ST.close_paren = 0;
8464                                             * close_paren only for GOSUB */
8465                ST.prev_recurse_locinput= NULL; /* only used for GOSUB */
8466
8467                /* note we saved the paren state earlier:
8468                ST.cp = regcppush(rex, 0, maxopenparen);
8469                REGCP_SET(ST.lastcp);
8470                */
8471                /* and set maxopenparen to 0, since we are starting a "fresh" match */
8472                maxopenparen = 0;
8473                /* run the pattern returned from (??{...}) */
8474
8475              eval_recurse_doit: /* Share code with GOSUB below this line
8476                            * At this point we expect the stack context to be
8477                            * set up correctly */
8478
8479                /* invalidate the S-L poscache. We're now executing a
8480                 * different set of WHILEM ops (and their associated
8481                 * indexes) against the same string, so the bits in the
8482                 * cache are meaningless. Setting maxiter to zero forces
8483                 * the cache to be invalidated and zeroed before reuse.
8484                 * XXX This is too dramatic a measure. Ideally we should
8485                 * save the old cache and restore when running the outer
8486                 * pattern again */
8487                reginfo->poscache_maxiter = 0;
8488
8489                /* the new regexp might have a different is_utf8_pat than we do */
8490                is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
8491
8492                ST.prev_rex = rex_sv;
8493                ST.prev_curlyx = cur_curlyx;
8494                rex_sv = re_sv;
8495                SET_reg_curpm(rex_sv);
8496                rex = re;
8497                rexi = rei;
8498                cur_curlyx = NULL;
8499                ST.B = next;
8500                ST.prev_eval = cur_eval;
8501                cur_eval = st;
8502                /* now continue from first node in postoned RE */
8503                PUSH_YES_STATE_GOTO(EVAL_postponed_AB, startpoint, locinput,
8504                                    loceol, script_run_begin);
8505                NOT_REACHED; /* NOTREACHED */
8506        }
8507
8508        case EVAL_postponed_AB: /* cleanup after a successful (??{A})B */
8509            /* note: this is called twice; first after popping B, then A */
8510            DEBUG_STACK_r({
8511                Perl_re_exec_indentf( aTHX_  "EVAL_AB cur_eval=%p prev_eval=%p\n",
8512                    depth, cur_eval, ST.prev_eval);
8513            });
8514
8515#define SET_RECURSE_LOCINPUT(STR,VAL)\
8516            if ( cur_eval && CUR_EVAL.close_paren ) {\
8517                DEBUG_STACK_r({ \
8518                    Perl_re_exec_indentf( aTHX_  STR " GOSUB%d ce=%p recurse_locinput=%p\n",\
8519                        depth,    \
8520                        CUR_EVAL.close_paren - 1,\
8521                        cur_eval, \
8522                        VAL);     \
8523                });               \
8524                rex->recurse_locinput[CUR_EVAL.close_paren - 1] = VAL;\
8525            }
8526
8527            SET_RECURSE_LOCINPUT("EVAL_AB[before]", CUR_EVAL.prev_recurse_locinput);
8528
8529            rex_sv = ST.prev_rex;
8530            is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
8531            SET_reg_curpm(rex_sv);
8532            rex = ReANY(rex_sv);
8533            rexi = RXi_GET(rex);
8534            {
8535                /* preserve $^R across LEAVE's. See Bug 121070. */
8536                SV *save_sv= GvSV(PL_replgv);
8537                SV *replsv;
8538                SvREFCNT_inc(save_sv);
8539                regcpblow(ST.cp); /* LEAVE in disguise */
8540                /* don't move this initialization up */
8541                replsv = GvSV(PL_replgv);
8542                sv_setsv(replsv, save_sv);
8543                SvSETMAGIC(replsv);
8544                SvREFCNT_dec(save_sv);
8545            }
8546            cur_eval = ST.prev_eval;
8547            cur_curlyx = ST.prev_curlyx;
8548
8549            /* Invalidate cache. See "invalidate" comment above. */
8550            reginfo->poscache_maxiter = 0;
8551            if ( nochange_depth )
8552                nochange_depth--;
8553
8554            SET_RECURSE_LOCINPUT("EVAL_AB[after]", cur_eval->locinput);
8555            sayYES;
8556
8557
8558        case EVAL_B_fail: /* unsuccessful B in (?{...})B */
8559            REGCP_UNWIND(ST.lastcp);
8560            regcppop(rex, &maxopenparen);
8561            sayNO;
8562
8563        case EVAL_postponed_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
8564            /* note: this is called twice; first after popping B, then A */
8565            DEBUG_STACK_r({
8566                Perl_re_exec_indentf( aTHX_  "EVAL_AB_fail cur_eval=%p prev_eval=%p\n",
8567                    depth, cur_eval, ST.prev_eval);
8568            });
8569
8570            SET_RECURSE_LOCINPUT("EVAL_AB_fail[before]", CUR_EVAL.prev_recurse_locinput);
8571
8572            rex_sv = ST.prev_rex;
8573            is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
8574            SET_reg_curpm(rex_sv);
8575            rex = ReANY(rex_sv);
8576            rexi = RXi_GET(rex);
8577
8578            REGCP_UNWIND(ST.lastcp);
8579            regcppop(rex, &maxopenparen);
8580            cur_eval = ST.prev_eval;
8581            cur_curlyx = ST.prev_curlyx;
8582
8583            /* Invalidate cache. See "invalidate" comment above. */
8584            reginfo->poscache_maxiter = 0;
8585            if ( nochange_depth )
8586                nochange_depth--;
8587
8588            SET_RECURSE_LOCINPUT("EVAL_AB_fail[after]", cur_eval->locinput);
8589            sayNO_SILENT;
8590#undef ST
8591
8592        case OPEN: /*  (  */
8593            n = PARNO(scan);  /* which paren pair */
8594            RXp_OFFSp(rex)[n].start_tmp = locinput - reginfo->strbeg;
8595            if (n > maxopenparen)
8596                maxopenparen = n;
8597            DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
8598                "OPEN: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf " tmp; maxopenparen=%" UVuf "\n",
8599                depth,
8600                PTR2UV(rex),
8601                PTR2UV(RXp_OFFSp(rex)),
8602                (UV)n,
8603                (IV)RXp_OFFSp(rex)[n].start_tmp,
8604                (UV)maxopenparen
8605            ));
8606            lastopen = n;
8607            break;
8608
8609        case SROPEN: /*  (*SCRIPT_RUN:  */
8610            script_run_begin = (U8 *) locinput;
8611            break;
8612
8613
8614        case CLOSE:  /*  )  */
8615            n = PARNO(scan);  /* which paren pair */
8616            CLOSE_CAPTURE(rex, n, RXp_OFFSp(rex)[n].start_tmp,
8617                             locinput - reginfo->strbeg);
8618            if ( EVAL_CLOSE_PAREN_IS( cur_eval, n ) )
8619                goto fake_end;
8620
8621            break;
8622
8623        case SRCLOSE:  /*  (*SCRIPT_RUN: ... )   */
8624
8625            if (! isSCRIPT_RUN(script_run_begin, (U8 *) locinput, utf8_target))
8626            {
8627                sayNO;
8628            }
8629
8630            break;
8631
8632
8633        case ACCEPT:  /*  (*ACCEPT)  */
8634            is_accepted = true;
8635            if (FLAGS(scan))
8636                sv_yes_mark = MUTABLE_SV(rexi->data->data[ ARG1u( scan ) ]);
8637            utmp = ARG2u(scan);
8638
8639            if ( utmp ) {
8640                regnode *cursor;
8641                for (
8642                    cursor = scan;
8643                    cursor && ( OP(cursor) != END );
8644                    cursor = (
8645                               REGNODE_TYPE( OP(cursor) ) == END
8646                               || REGNODE_TYPE( OP(cursor) ) == WHILEM
8647                             )
8648                             ? REGNODE_AFTER(cursor)
8649                             : regnext(cursor)
8650                ){
8651                    if ( OP(cursor) != CLOSE )
8652                        continue;
8653
8654                    n = PARNO(cursor);
8655
8656                    if ( n > lastopen ) /* might be OPEN/CLOSE in the way */
8657                        continue;       /* so skip this one */
8658
8659                    CLOSE_CAPTURE(rex, n, RXp_OFFSp(rex)[n].start_tmp,
8660                                     locinput - reginfo->strbeg);
8661
8662                    if ( n == utmp || EVAL_CLOSE_PAREN_IS(cur_eval, n) )
8663                        break;
8664                }
8665            }
8666            goto fake_end;
8667            /* NOTREACHED */
8668
8669        case GROUPP:  /*  (?(1))  */
8670            n = ARG1u(scan);  /* which paren pair */
8671            sw = cBOOL(RXp_LASTPAREN(rex) >= n && RXp_OFFS_END(rex,n) != -1);
8672            break;
8673
8674        case GROUPPN:  /*  (?(<name>))  */
8675            /* reg_check_named_buff_matched returns 0 for no match */
8676            sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
8677            break;
8678
8679        case INSUBP:   /*  (?(R))  */
8680            n = ARG1u(scan);
8681            /* this does not need to use EVAL_CLOSE_PAREN macros, as the arg
8682             * of SCAN is already set up as matches a eval.close_paren */
8683            sw = cur_eval && (n == 0 || CUR_EVAL.close_paren == n);
8684            break;
8685
8686        case DEFINEP:  /*  (?(DEFINE))  */
8687            sw = 0;
8688            break;
8689
8690        case IFTHEN:   /*  (?(cond)A|B)  */
8691            reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
8692            if (sw)
8693                next = REGNODE_AFTER_type(scan,tregnode_IFTHEN);
8694            else {
8695                next = scan + ARG1u(scan);
8696                if (OP(next) == IFTHEN) /* Fake one. */
8697                    next = REGNODE_AFTER_type(next,tregnode_IFTHEN);
8698            }
8699            break;
8700
8701        case LOGICAL:  /* modifier for EVAL and IFMATCH */
8702            logical = FLAGS(scan) & EVAL_FLAGS_MASK; /* reserve a bit for optimistic eval */
8703            break;
8704
8705/*******************************************************************
8706
8707The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
8708pattern, where A and B are subpatterns. (For simple A, CURLYM or
8709STAR/PLUS/CURLY/CURLYN are used instead.)
8710
8711A*B is compiled as <CURLYX><A><WHILEM><B>
8712
8713On entry to the subpattern, CURLYX is called. This pushes a CURLYX
8714state, which contains the current count, initialised to -1. It also sets
8715cur_curlyx to point to this state, with any previous value saved in the
8716state block.
8717
8718CURLYX then jumps straight to the WHILEM op, rather than executing A,
8719since the pattern may possibly match zero times (i.e. it's a while {} loop
8720rather than a do {} while loop).
8721
8722Each entry to WHILEM represents a successful match of A. The count in the
8723CURLYX block is incremented, another WHILEM state is pushed, and execution
8724passes to A or B depending on greediness and the current count.
8725
8726For example, if matching against the string a1a2a3b (where the aN are
8727substrings that match /A/), then the match progresses as follows: (the
8728pushed states are interspersed with the bits of strings matched so far):
8729
8730    <CURLYX cnt=-1>
8731    <CURLYX cnt=0><WHILEM>
8732    <CURLYX cnt=1><WHILEM> a1 <WHILEM>
8733    <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
8734    <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
8735    <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
8736
8737(Contrast this with something like CURLYM, which maintains only a single
8738backtrack state:
8739
8740    <CURLYM cnt=0> a1
8741    a1 <CURLYM cnt=1> a2
8742    a1 a2 <CURLYM cnt=2> a3
8743    a1 a2 a3 <CURLYM cnt=3> b
8744)
8745
8746Each WHILEM state block marks a point to backtrack to upon partial failure
8747of A or B, and also contains some minor state data related to that
8748iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
8749overall state, such as the count, and pointers to the A and B ops.
8750
8751This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
8752must always point to the *current* CURLYX block, the rules are:
8753
8754When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
8755and set cur_curlyx to point the new block.
8756
8757When popping the CURLYX block after a successful or unsuccessful match,
8758restore the previous cur_curlyx.
8759
8760When WHILEM is about to execute B, save the current cur_curlyx, and set it
8761to the outer one saved in the CURLYX block.
8762
8763When popping the WHILEM block after a successful or unsuccessful B match,
8764restore the previous cur_curlyx.
8765
8766Here's an example for the pattern (AI* BI)*BO
8767I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
8768
8769cur_
8770curlyx backtrack stack
8771------ ---------------
8772NULL
8773CO     <CO prev=NULL> <WO>
8774CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
8775CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
8776NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
8777
8778At this point the pattern succeeds, and we work back down the stack to
8779clean up, restoring as we go:
8780
8781CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
8782CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
8783CO     <CO prev=NULL> <WO>
8784NULL
8785
8786*******************************************************************/
8787
8788#define ST st->u.curlyx
8789
8790        case CURLYX:    /* start of /A*B/  (for complex A) */
8791        {
8792            /* No need to save/restore up to this paren */
8793            I32 parenfloor = FLAGS(scan);
8794
8795            assert(next); /* keep Coverity happy */
8796            if (OP(REGNODE_BEFORE(next)) == NOTHING) /* LONGJMP */
8797                next += ARG1u(next);
8798
8799            /* XXXX Probably it is better to teach regpush to support
8800               parenfloor > maxopenparen ... */
8801            if (parenfloor > (I32)RXp_LASTPAREN(rex))
8802                parenfloor = RXp_LASTPAREN(rex); /* Pessimization... */
8803
8804            ST.prev_curlyx= cur_curlyx;
8805            cur_curlyx = st;
8806            ST.cp = PL_savestack_ix;
8807
8808            /* these fields contain the state of the current curly.
8809             * they are accessed by subsequent WHILEMs */
8810            ST.parenfloor = parenfloor;
8811            ST.me = scan;
8812            ST.B = next;
8813            ST.minmod = minmod;
8814            minmod = 0;
8815            ST.count = -1;	/* this will be updated by WHILEM */
8816            ST.lastloc = NULL;  /* this will be updated by WHILEM */
8817
8818            PUSH_YES_STATE_GOTO(CURLYX_end, REGNODE_BEFORE(next), locinput, loceol,
8819                                script_run_begin);
8820            NOT_REACHED; /* NOTREACHED */
8821        }
8822
8823        case CURLYX_end: /* just finished matching all of A*B */
8824            cur_curlyx = ST.prev_curlyx;
8825            sayYES;
8826            NOT_REACHED; /* NOTREACHED */
8827
8828        case CURLYX_end_fail: /* just failed to match all of A*B */
8829            regcpblow(ST.cp);
8830            cur_curlyx = ST.prev_curlyx;
8831            sayNO;
8832            NOT_REACHED; /* NOTREACHED */
8833
8834
8835#undef ST
8836#define ST st->u.whilem
8837
8838        case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
8839        {
8840            /* see the discussion above about CURLYX/WHILEM */
8841            I32 n;
8842            int min, max;
8843            /* U16 first_paren, last_paren; */
8844            regnode *A;
8845
8846            assert(cur_curlyx); /* keep Coverity happy */
8847
8848            min = ARG1i(cur_curlyx->u.curlyx.me);
8849            max = ARG2i(cur_curlyx->u.curlyx.me);
8850            /* first_paren = ARG3a(cur_curlyx->u.curlyx.me); */
8851            /* last_paren = ARG3b(cur_curlyx->u.curlyx.me);  */
8852            A = REGNODE_AFTER(cur_curlyx->u.curlyx.me);
8853            n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
8854            ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
8855            ST.cache_offset = 0;
8856            ST.cache_mask = 0;
8857
8858            DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: matched %ld out of %d..%d\n",
8859                  depth, (long)n, min, max)
8860            );
8861
8862            /* First just match a string of min A's. */
8863            if (n < min) {
8864                ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, maxopenparen);
8865                cur_curlyx->u.curlyx.lastloc = locinput;
8866                REGCP_SET(ST.lastcp);
8867
8868                PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput, loceol,
8869                                script_run_begin);
8870                NOT_REACHED; /* NOTREACHED */
8871            }
8872
8873            /* If degenerate A matches "", assume A done. */
8874
8875            if (locinput == cur_curlyx->u.curlyx.lastloc) {
8876                DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: empty match detected, trying continuation...\n",
8877                   depth)
8878                );
8879                goto do_whilem_B_max;
8880            }
8881
8882            /* super-linear cache processing.
8883             *
8884             * The idea here is that for certain types of CURLYX/WHILEM -
8885             * principally those whose upper bound is infinity (and
8886             * excluding regexes that have things like \1 and other very
8887             * non-regular expressiony things), then if a pattern like
8888             * /....A*.../ fails and we backtrack to the WHILEM, then we
8889             * make a note that this particular WHILEM op was at string
8890             * position 47 (say) when the rest of pattern failed. Then, if
8891             * we ever find ourselves back at that WHILEM, and at string
8892             * position 47 again, we can just fail immediately rather than
8893             * running the rest of the pattern again.
8894             *
8895             * This is very handy when patterns start to go
8896             * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
8897             * with a combinatorial explosion of backtracking.
8898             *
8899             * The cache is implemented as a bit array, with one bit per
8900             * string byte position per WHILEM op (up to 16) - so its
8901             * between 0.25 and 2x the string size.
8902             *
8903             * To avoid allocating a poscache buffer every time, we do an
8904             * initially countdown; only after we have  executed a WHILEM
8905             * op (string-length x #WHILEMs) times do we allocate the
8906             * cache.
8907             *
8908             * The top 4 bits of FLAGS(scan) byte say how many different
8909             * relevant CURLLYX/WHILEM op pairs there are, while the
8910             * bottom 4-bits is the identifying index number of this
8911             * WHILEM.
8912             */
8913
8914            if (FLAGS(scan)) {
8915
8916                if (!reginfo->poscache_maxiter) {
8917                    /* start the countdown: Postpone detection until we
8918                     * know the match is not *that* much linear. */
8919                    reginfo->poscache_maxiter
8920                        =    (reginfo->strend - reginfo->strbeg + 1)
8921                           * (FLAGS(scan)>>4);
8922                    /* possible overflow for long strings and many CURLYX's */
8923                    if (reginfo->poscache_maxiter < 0)
8924                        reginfo->poscache_maxiter = I32_MAX;
8925                    reginfo->poscache_iter = reginfo->poscache_maxiter;
8926                }
8927
8928                if (reginfo->poscache_iter-- == 0) {
8929                    /* initialise cache */
8930                    const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
8931                    regmatch_info_aux *const aux = reginfo->info_aux;
8932                    if (aux->poscache) {
8933                        if ((SSize_t)reginfo->poscache_size < size) {
8934                            Renew(aux->poscache, size, char);
8935                            reginfo->poscache_size = size;
8936                        }
8937                        Zero(aux->poscache, size, char);
8938                    }
8939                    else {
8940                        reginfo->poscache_size = size;
8941                        Newxz(aux->poscache, size, char);
8942                    }
8943                    DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
8944      "%sWHILEM: Detected a super-linear match, switching on caching%s...\n",
8945                              PL_colors[4], PL_colors[5])
8946                    );
8947                }
8948
8949                if (reginfo->poscache_iter < 0) {
8950                    /* have we already failed at this position? */
8951                    SSize_t offset, mask;
8952
8953                    reginfo->poscache_iter = -1; /* stop eventual underflow */
8954                    offset  = (FLAGS(scan) & 0xf) - 1
8955                                +   (locinput - reginfo->strbeg)
8956                                  * (FLAGS(scan)>>4);
8957                    mask    = 1 << (offset % 8);
8958                    offset /= 8;
8959                    if (reginfo->info_aux->poscache[offset] & mask) {
8960                        DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: (cache) already tried at this position...\n",
8961                            depth)
8962                        );
8963                        cur_curlyx->u.curlyx.count--;
8964                        sayNO; /* cache records failure */
8965                    }
8966                    ST.cache_offset = offset;
8967                    ST.cache_mask   = mask;
8968                }
8969            }
8970
8971            /* Prefer B over A for minimal matching. */
8972
8973            if (cur_curlyx->u.curlyx.minmod) {
8974                ST.save_curlyx = cur_curlyx;
8975                cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
8976                PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
8977                                    locinput, loceol, script_run_begin);
8978                NOT_REACHED; /* NOTREACHED */
8979            }
8980
8981            /* Prefer A over B for maximal matching. */
8982
8983            if (n < max) { /* More greed allowed? */
8984                ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
8985                            maxopenparen);
8986                cur_curlyx->u.curlyx.lastloc = locinput;
8987                REGCP_SET(ST.lastcp);
8988                PUSH_STATE_GOTO(WHILEM_A_max, A, locinput, loceol,
8989                                script_run_begin);
8990                NOT_REACHED; /* NOTREACHED */
8991            }
8992            goto do_whilem_B_max;
8993        }
8994        NOT_REACHED; /* NOTREACHED */
8995
8996        case WHILEM_B_min: /* just matched B in a minimal match */
8997        case WHILEM_B_max: /* just matched B in a maximal match */
8998            cur_curlyx = ST.save_curlyx;
8999            sayYES;
9000            NOT_REACHED; /* NOTREACHED */
9001
9002        case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
9003            cur_curlyx = ST.save_curlyx;
9004            cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
9005            cur_curlyx->u.curlyx.count--;
9006            CACHEsayNO;
9007            NOT_REACHED; /* NOTREACHED */
9008
9009        case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
9010            /* FALLTHROUGH */
9011        case WHILEM_A_pre_fail: /* just failed to match even minimal A */
9012            REGCP_UNWIND(ST.lastcp);
9013            regcppop(rex, &maxopenparen);
9014            cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
9015            cur_curlyx->u.curlyx.count--;
9016            CACHEsayNO;
9017            NOT_REACHED; /* NOTREACHED */
9018
9019        case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
9020            REGCP_UNWIND(ST.lastcp);
9021            regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
9022            DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "WHILEM: failed, trying continuation...\n",
9023                depth)
9024            );
9025
9026          do_whilem_B_max:
9027            /* now try B */
9028            ST.save_curlyx = cur_curlyx;
9029            cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
9030            PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
9031                                locinput, loceol, script_run_begin);
9032            NOT_REACHED; /* NOTREACHED */
9033
9034        case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
9035            cur_curlyx = ST.save_curlyx;
9036
9037            if (cur_curlyx->u.curlyx.count >= /*max*/ARG2i(cur_curlyx->u.curlyx.me)) {
9038                /* Maximum greed exceeded */
9039                cur_curlyx->u.curlyx.count--;
9040                CACHEsayNO;
9041            }
9042
9043            DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "WHILEM: B min fail: trying longer...\n", depth)
9044            );
9045            /* Try grabbing another A and see if it helps. */
9046            cur_curlyx->u.curlyx.lastloc = locinput;
9047            ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, maxopenparen);
9048            REGCP_SET(ST.lastcp);
9049            PUSH_STATE_GOTO(WHILEM_A_min,
9050                /*A*/ REGNODE_AFTER(ST.save_curlyx->u.curlyx.me),
9051                locinput, loceol, script_run_begin);
9052            NOT_REACHED; /* NOTREACHED */
9053
9054#undef  ST
9055#define ST st->u.branch
9056
9057        case BRANCHJ:	    /*  /(...|A|...)/ with long next pointer */
9058            next = scan + ARG1u(scan);
9059            if (next == scan)
9060                next = NULL;
9061            ST.before_paren = ARG2a(scan);
9062            ST.after_paren = ARG2b(scan);
9063            goto branch_logic;
9064            NOT_REACHED; /* NOTREACHED */
9065
9066        case BRANCH:	    /*  /(...|A|...)/ */
9067            ST.before_paren = ARG1a(scan);
9068            ST.after_paren = ARG1b(scan);
9069          branch_logic:
9070            scan = REGNODE_AFTER_opcode(scan,state_num); /* scan now points to inner node */
9071            assert(scan);
9072            ST.lastparen = RXp_LASTPAREN(rex);
9073            ST.lastcloseparen = RXp_LASTCLOSEPAREN(rex);
9074            ST.next_branch = next;
9075            REGCP_SET(ST.cp);
9076            if (RE_PESSIMISTIC_PARENS) {
9077                regcppush(rex, 0, maxopenparen);
9078                REGCP_SET(ST.lastcp);
9079            }
9080
9081            /* Now go into the branch */
9082            if (has_cutgroup) {
9083                PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput, loceol,
9084                                    script_run_begin);
9085            } else {
9086                PUSH_STATE_GOTO(BRANCH_next, scan, locinput, loceol,
9087                                script_run_begin);
9088            }
9089            NOT_REACHED; /* NOTREACHED */
9090
9091        case CUTGROUP:  /*  /(*THEN)/  */
9092            sv_yes_mark = st->u.mark.mark_name = FLAGS(scan)
9093                ? MUTABLE_SV(rexi->data->data[ ARG1u( scan ) ])
9094                : NULL;
9095            PUSH_STATE_GOTO(CUTGROUP_next, next, locinput, loceol,
9096                            script_run_begin);
9097            NOT_REACHED; /* NOTREACHED */
9098
9099        case CUTGROUP_next_fail:
9100            do_cutgroup = 1;
9101            no_final = 1;
9102            if (st->u.mark.mark_name)
9103                sv_commit = st->u.mark.mark_name;
9104            sayNO;
9105            NOT_REACHED; /* NOTREACHED */
9106
9107        case BRANCH_next:
9108            sayYES;
9109            NOT_REACHED; /* NOTREACHED */
9110
9111        case BRANCH_next_fail: /* that branch failed; try the next, if any */
9112            if (do_cutgroup) {
9113                do_cutgroup = 0;
9114                no_final = 0;
9115            }
9116            if (RE_PESSIMISTIC_PARENS) {
9117                REGCP_UNWIND(ST.lastcp);
9118                regcppop(rex,&maxopenparen);
9119            }
9120            REGCP_UNWIND(ST.cp);
9121            UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
9122            CAPTURE_CLEAR(ST.before_paren+1, ST.after_paren, "BRANCH_next_fail");
9123            scan = ST.next_branch;
9124            /* no more branches? */
9125            if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
9126                DEBUG_EXECUTE_r({
9127                    Perl_re_exec_indentf( aTHX_  "%sBRANCH failed...%s\n",
9128                        depth,
9129                        PL_colors[4],
9130                        PL_colors[5] );
9131                });
9132                sayNO_SILENT;
9133            }
9134            continue; /* execute next BRANCH[J] op */
9135            /* NOTREACHED */
9136
9137        case MINMOD: /* next op will be non-greedy, e.g. A*?  */
9138            minmod = 1;
9139            break;
9140
9141#undef  ST
9142#define ST st->u.curlym
9143
9144        case CURLYM:	/* /A{m,n}B/ where A is fixed-length */
9145
9146            /* This is an optimisation of CURLYX that enables us to push
9147             * only a single backtracking state, no matter how many matches
9148             * there are in {m,n}. It relies on the pattern being constant
9149             * length, with no parens to influence future backrefs
9150             */
9151
9152            ST.me = scan;
9153            scan = REGNODE_AFTER_type(scan, tregnode_CURLYM);
9154
9155            ST.lastparen      = RXp_LASTPAREN(rex);
9156            ST.lastcloseparen = RXp_LASTCLOSEPAREN(rex);
9157
9158            /* if paren positive, emulate an OPEN/CLOSE around A */
9159            if (FLAGS(ST.me)) {
9160                U32 paren = FLAGS(ST.me);
9161                lastopen = paren;
9162                if (paren > maxopenparen)
9163                    maxopenparen = paren;
9164                scan += NEXT_OFF(scan); /* Skip former OPEN. */
9165            }
9166            ST.A = scan;
9167            ST.B = next;
9168            ST.alen = 0;
9169            ST.count = 0;
9170            ST.minmod = minmod;
9171            minmod = 0;
9172            ST.Binfo.count = -1;
9173            REGCP_SET(ST.cp);
9174
9175            if (!(ST.minmod ? ARG1i(ST.me) : ARG2i(ST.me))) /* min/max */
9176                goto curlym_do_B;
9177
9178          curlym_do_A: /* execute the A in /A{m,n}B/  */
9179            PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput, loceol, /* match A */
9180                                script_run_begin);
9181            NOT_REACHED; /* NOTREACHED */
9182
9183        case CURLYM_A: /* we've just matched an A */
9184            ST.count++;
9185            /* after first match, determine A's length: u.curlym.alen */
9186            if (ST.count == 1) {
9187                if (reginfo->is_utf8_target) {
9188                    char *s = st->locinput;
9189                    while (s < locinput) {
9190                        ST.alen++;
9191                        s += UTF8SKIP(s);
9192                    }
9193                }
9194                else {
9195                    ST.alen = locinput - st->locinput;
9196                }
9197                if (ST.alen == 0)
9198                    ST.count = ST.minmod ? ARG1i(ST.me) : ARG2i(ST.me);
9199            }
9200            DEBUG_EXECUTE_r(
9201                Perl_re_exec_indentf( aTHX_  "CURLYM now matched %" IVdf " times, len=%" IVdf "...\n",
9202                          depth, (IV) ST.count, (IV)ST.alen)
9203            );
9204
9205            if (FLAGS(ST.me)) {
9206                /* emulate CLOSE: mark current A as captured */
9207                U32 paren = (U32)FLAGS(ST.me);
9208                CLOSE_CAPTURE(rex, paren,
9209                    HOPc(locinput, -ST.alen) - reginfo->strbeg,
9210                    locinput - reginfo->strbeg);
9211            }
9212
9213            if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)FLAGS(ST.me)))
9214                goto fake_end;
9215
9216
9217            if (!is_accepted) {
9218                I32 max = (ST.minmod ? ARG1i(ST.me) : ARG2i(ST.me));
9219                if ( max == REG_INFTY || ST.count < max )
9220                    goto curlym_do_A; /* try to match another A */
9221            }
9222            goto curlym_do_B; /* try to match B */
9223
9224        case CURLYM_A_fail: /* just failed to match an A */
9225            REGCP_UNWIND(ST.cp);
9226
9227
9228            if (ST.minmod || ST.count < ARG1i(ST.me) /* min*/
9229                || EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)FLAGS(ST.me)))
9230                sayNO;
9231
9232          curlym_do_B: /* execute the B in /A{m,n}B/  */
9233            if (is_accepted)
9234                goto curlym_close_B;
9235
9236            if (ST.Binfo.count < 0) {
9237                /* calculate possible match of 1st char following curly */
9238                assert(ST.B);
9239                if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
9240                    regnode *text_node = ST.B;
9241                    if (! HAS_TEXT(text_node))
9242                        FIND_NEXT_IMPT(text_node);
9243                    if (REGNODE_TYPE(OP(text_node)) == EXACT) {
9244                        if (! S_setup_EXACTISH_ST(aTHX_ text_node,
9245                                                        &ST.Binfo, reginfo))
9246                        {
9247                            sayNO;
9248                        }
9249                    }
9250                }
9251            }
9252
9253            DEBUG_EXECUTE_r(
9254                Perl_re_exec_indentf( aTHX_  "CURLYM trying tail with matches=%" IVdf "...\n",
9255                    depth, (IV)ST.count)
9256            );
9257            if (! NEXTCHR_IS_EOS && ST.Binfo.count >= 0) {
9258                assert(ST.Binfo.count > 0);
9259
9260                /* Do a quick test to hopefully rule out most non-matches */
9261                if (     locinput + ST.Binfo.min_length > loceol
9262                    || ! S_test_EXACTISH_ST(locinput, ST.Binfo))
9263                {
9264                    DEBUG_OPTIMISE_r(
9265                        Perl_re_exec_indentf( aTHX_
9266                            "CURLYM Fast bail next target=0x%X anded==0x%X"
9267                                                                " mask=0x%X\n",
9268                            depth,
9269                            (int) nextbyte, ST.Binfo.first_byte_anded,
9270                                            ST.Binfo.first_byte_mask)
9271                    );
9272                    state_num = CURLYM_B_fail;
9273                    goto reenter_switch;
9274                }
9275            }
9276
9277          curlym_close_B:
9278            if (FLAGS(ST.me)) {
9279                /* emulate CLOSE: mark current A as captured */
9280                U32 paren = (U32)FLAGS(ST.me);
9281                if (ST.count || is_accepted) {
9282                    CLOSE_CAPTURE(rex, paren,
9283                        HOPc(locinput, -ST.alen) - reginfo->strbeg,
9284                        locinput - reginfo->strbeg);
9285                }
9286                else
9287                    RXp_OFFSp(rex)[paren].end = -1;
9288
9289                if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)FLAGS(ST.me)))
9290                {
9291                    if (ST.count || is_accepted)
9292                        goto fake_end;
9293                    else
9294                        sayNO;
9295                }
9296            }
9297
9298            if (is_accepted)
9299                goto fake_end;
9300
9301            PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput, loceol,   /* match B */
9302                            script_run_begin);
9303            NOT_REACHED; /* NOTREACHED */
9304
9305        case CURLYM_B_fail: /* just failed to match a B */
9306            REGCP_UNWIND(ST.cp);
9307            UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
9308            if (ST.minmod) {
9309                I32 max = ARG2i(ST.me);
9310                if (max != REG_INFTY && ST.count == max)
9311                    sayNO;
9312                goto curlym_do_A; /* try to match a further A */
9313            }
9314            /* backtrack one A */
9315            if (ST.count == ARG1i(ST.me) /* min */)
9316                sayNO;
9317            ST.count--;
9318            SET_locinput(HOPc(locinput, -ST.alen));
9319            goto curlym_do_B; /* try to match B */
9320
9321#undef ST
9322#define ST st->u.curly
9323
9324#define CURLY_SETPAREN(paren, success)                                      \
9325    if (paren) {                                                            \
9326        if (success) {                                                      \
9327            CLOSE_CAPTURE(rex, paren, HOPc(locinput, -1) - reginfo->strbeg, \
9328                                 locinput - reginfo->strbeg);               \
9329        }                                                                   \
9330        else {                                                              \
9331            RXp_OFFSp(rex)[paren].end = -1;                                 \
9332            RXp_LASTPAREN(rex)  = ST.lastparen;                             \
9333            RXp_LASTCLOSEPAREN(rex) = ST.lastcloseparen;                    \
9334        }                                                                   \
9335    }
9336
9337        case STAR:		/*  /A*B/ where A is width 1 char */
9338            ST.paren = 0;
9339            ST.min = 0;
9340            ST.max = REG_INFTY;
9341            scan = REGNODE_AFTER_type(scan,tregnode_STAR);
9342            goto repeat;
9343
9344        case PLUS:		/*  /A+B/ where A is width 1 char */
9345            ST.paren = 0;
9346            ST.min = 1;
9347            ST.max = REG_INFTY;
9348            scan = REGNODE_AFTER_type(scan,tregnode_PLUS);
9349            goto repeat;
9350
9351        case CURLYN:		/*  /(A){m,n}B/ where A is width 1 char */
9352            ST.paren = FLAGS(scan);     /* Which paren to set */
9353            ST.lastparen      = RXp_LASTPAREN(rex);
9354            ST.lastcloseparen = RXp_LASTCLOSEPAREN(rex);
9355            if (ST.paren > maxopenparen)
9356                maxopenparen = ST.paren;
9357            ST.min = ARG1i(scan);  /* min to match */
9358            ST.max = ARG2i(scan);  /* max to match */
9359            scan = regnext(REGNODE_AFTER_type(scan, tregnode_CURLYN));
9360
9361            /* handle the single-char capture called as a GOSUB etc */
9362            if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
9363            {
9364                char *li = locinput;
9365                if (!regrepeat(rex, &li, scan, loceol, reginfo, 1))
9366                    sayNO;
9367                SET_locinput(li);
9368                goto fake_end;
9369            }
9370
9371            goto repeat;
9372
9373        case CURLY:		/*  /A{m,n}B/ where A is width 1 char */
9374            ST.paren = 0;
9375            ST.min = ARG1i(scan);  /* min to match */
9376            ST.max = ARG2i(scan);  /* max to match */
9377            scan = REGNODE_AFTER_type(scan, tregnode_CURLY);
9378          repeat:
9379            /*
9380            * Lookahead to avoid useless match attempts
9381            * when we know what character comes next.
9382            *
9383            * Used to only do .*x and .*?x, but now it allows
9384            * for )'s, ('s and (?{ ... })'s to be in the way
9385            * of the quantifier and the EXACT-like node.  -- japhy
9386            */
9387
9388            assert(ST.min <= ST.max);
9389            if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
9390                ST.Binfo.count = 0;
9391            }
9392            else {
9393                regnode *text_node = next;
9394
9395                if (! HAS_TEXT(text_node))
9396                    FIND_NEXT_IMPT(text_node);
9397
9398                if (! HAS_TEXT(text_node))
9399                    ST.Binfo.count = 0;
9400                else {
9401                    if ( REGNODE_TYPE(OP(text_node)) != EXACT ) {
9402                        ST.Binfo.count = 0;
9403                    }
9404                    else {
9405                        if (! S_setup_EXACTISH_ST(aTHX_ text_node,
9406                                                        &ST.Binfo, reginfo))
9407                        {
9408                            sayNO;
9409                        }
9410                    }
9411                }
9412            }
9413
9414            ST.A = scan;
9415            ST.B = next;
9416            if (minmod) {
9417                char *li = locinput;
9418                minmod = 0;
9419                if (ST.min &&
9420                        regrepeat(rex, &li, ST.A, loceol, reginfo, ST.min)
9421                            < ST.min)
9422                    sayNO;
9423                SET_locinput(li);
9424                ST.count = ST.min;
9425                REGCP_SET(ST.cp);
9426
9427                if (ST.Binfo.count <= 0)
9428                    goto curly_try_B_min;
9429
9430                ST.oldloc = locinput;
9431
9432                /* set ST.maxpos to the furthest point along the
9433                 * string that could possibly match, i.e., that a match could
9434                 * start at. */
9435                if  (ST.max == REG_INFTY) {
9436                    ST.maxpos = loceol - 1;
9437                    if (utf8_target)
9438                        while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
9439                            ST.maxpos--;
9440                }
9441                else if (utf8_target) {
9442                    int m = ST.max - ST.min;
9443                    for (ST.maxpos = locinput;
9444                         m >0 && ST.maxpos <  loceol; m--)
9445                        ST.maxpos += UTF8SKIP(ST.maxpos);
9446                }
9447                else {
9448                    ST.maxpos = locinput + ST.max - ST.min;
9449                    if (ST.maxpos >=  loceol)
9450                        ST.maxpos =  loceol - 1;
9451                }
9452                goto curly_try_B_min_known;
9453
9454            }
9455            else {
9456                /* avoid taking address of locinput, so it can remain
9457                 * a register var */
9458                char *li = locinput;
9459                if (ST.max)
9460                    ST.count = regrepeat(rex, &li, ST.A, loceol, reginfo, ST.max);
9461                else
9462                    ST.count = 0;
9463                if (ST.count < ST.min)
9464                    sayNO;
9465                SET_locinput(li);
9466                if ((ST.count > ST.min)
9467                    && (REGNODE_TYPE(OP(ST.B)) == EOL) && (OP(ST.B) != MEOL))
9468                {
9469                    /* A{m,n} must come at the end of the string, there's
9470                     * no point in backing off ... */
9471                    ST.min = ST.count;
9472                    /* ...except that $ and \Z can match before *and* after
9473                       newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
9474                       We may back off by one in this case. */
9475                    if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
9476                        ST.min--;
9477                }
9478                REGCP_SET(ST.cp);
9479                goto curly_try_B_max;
9480            }
9481            NOT_REACHED; /* NOTREACHED */
9482
9483        case CURLY_B_min_fail:
9484            /* failed to find B in a non-greedy match. */
9485            if (RE_PESSIMISTIC_PARENS) {
9486                REGCP_UNWIND(ST.lastcp);
9487                regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
9488            }
9489            REGCP_UNWIND(ST.cp);
9490            if (ST.paren) {
9491                UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
9492            }
9493
9494            if (ST.Binfo.count == 0) {
9495                /* failed -- move forward one */
9496                char *li = locinput;
9497                if (!regrepeat(rex, &li, ST.A, loceol, reginfo, 1)) {
9498                    sayNO;
9499                }
9500                locinput = li;
9501                ST.count++;
9502                if (!(   ST.count <= ST.max
9503                        /* count overflow ? */
9504                     || (ST.max == REG_INFTY && ST.count > 0))
9505                )
9506                    sayNO;
9507            }
9508            else {
9509                int n;
9510                /* Couldn't or didn't -- move forward. */
9511                ST.oldloc = locinput;
9512                if (utf8_target)
9513                    locinput += UTF8SKIP(locinput);
9514                else
9515                    locinput++;
9516                ST.count++;
9517
9518              curly_try_B_min_known:
9519                /* find the next place where 'B' could work, then call B */
9520                if (locinput + ST.Binfo.initial_exact < loceol) {
9521                    if (ST.Binfo.initial_exact >= ST.Binfo.max_length) {
9522
9523                        /* Here, the mask is all 1's for the entire length of
9524                         * any possible match.  (That actually means that there
9525                         * is only one possible match.)  Look for the next
9526                         * occurrence */
9527                        locinput = ninstr(locinput, loceol,
9528                                        (char *) ST.Binfo.matches,
9529                                        (char *) ST.Binfo.matches
9530                                                    + ST.Binfo.initial_exact);
9531                        if (locinput == NULL) {
9532                            sayNO;
9533                        }
9534                    }
9535                    else do {
9536                        /* If the first byte(s) of the mask are all ones, it
9537                         * means those bytes must match identically, so can use
9538                         * ninstr() to find the next possible matchpoint */
9539                        if (ST.Binfo.initial_exact > 0) {
9540                            locinput = ninstr(locinput, loceol,
9541                                              (char *) ST.Binfo.matches,
9542                                              (char *) ST.Binfo.matches
9543                                                     + ST.Binfo.initial_exact);
9544                        }
9545                        else { /* Otherwise find the next byte that matches,
9546                                  masked */
9547                            locinput = (char *) find_next_masked(
9548                                                (U8 *) locinput, (U8 *) loceol,
9549                                                ST.Binfo.first_byte_anded,
9550                                                ST.Binfo.first_byte_mask);
9551                            /* Advance to the end of a multi-byte character */
9552                            if (utf8_target) {
9553                                while (   locinput < loceol
9554                                    && UTF8_IS_CONTINUATION(*locinput))
9555                                {
9556                                    locinput++;
9557                                }
9558                            }
9559                        }
9560                        if (   locinput == NULL
9561                            || locinput + ST.Binfo.min_length > loceol)
9562                        {
9563                            sayNO;
9564                        }
9565
9566                        /* Here, we have found a possible match point; if can't
9567                         * rule it out, quit the loop so can check fully */
9568                        if (S_test_EXACTISH_ST(locinput, ST.Binfo)) {
9569                            break;
9570                        }
9571
9572                        locinput += (utf8_target) ? UTF8SKIP(locinput) : 1;
9573
9574                    } while (locinput <= ST.maxpos);
9575                }
9576
9577                if (locinput > ST.maxpos)
9578                    sayNO;
9579
9580                n = (utf8_target)
9581                    ? utf8_length((U8 *) ST.oldloc, (U8 *) locinput)
9582                    : (STRLEN) (locinput - ST.oldloc);
9583
9584
9585                /* Here is at the beginning of a character that meets the mask
9586                 * criteria.  Need to make sure that some real possibility */
9587
9588                if (n) {
9589                    /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
9590                     * at what may be the beginning of b; check that everything
9591                     * between oldloc and locinput matches */
9592                    char *li = ST.oldloc;
9593                    ST.count += n;
9594                    if (regrepeat(rex, &li, ST.A, loceol, reginfo, n) < n)
9595                        sayNO;
9596                    assert(n == REG_INFTY || locinput == li);
9597                }
9598            }
9599
9600          curly_try_B_min:
9601            if (RE_PESSIMISTIC_PARENS) {
9602                (void)regcppush(rex, 0, maxopenparen);
9603                REGCP_SET(ST.lastcp);
9604            }
9605            CURLY_SETPAREN(ST.paren, ST.count);
9606            PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput, loceol,
9607                            script_run_begin);
9608            NOT_REACHED; /* NOTREACHED */
9609
9610
9611          curly_try_B_max:
9612            /* a successful greedy match: now try to match B */
9613            if (        ST.Binfo.count <= 0
9614                || (    ST.Binfo.count > 0
9615                    &&  locinput + ST.Binfo.min_length <= loceol
9616                    &&  S_test_EXACTISH_ST(locinput, ST.Binfo)))
9617            {
9618                if (RE_PESSIMISTIC_PARENS) {
9619                    (void)regcppush(rex, 0, maxopenparen);
9620                    REGCP_SET(ST.lastcp);
9621                }
9622                CURLY_SETPAREN(ST.paren, ST.count);
9623                PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput, loceol,
9624                                script_run_begin);
9625                NOT_REACHED; /* NOTREACHED */
9626            }
9627            goto CURLY_B_all_failed;
9628            NOT_REACHED; /* NOTREACHED */
9629
9630        case CURLY_B_max_fail:
9631            /* failed to find B in a greedy match */
9632
9633            if (RE_PESSIMISTIC_PARENS) {
9634                REGCP_UNWIND(ST.lastcp);
9635                regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
9636            }
9637          CURLY_B_all_failed:
9638            REGCP_UNWIND(ST.cp);
9639            if (ST.paren) {
9640                UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
9641            }
9642            /*  back up. */
9643            if (--ST.count < ST.min)
9644                sayNO;
9645            locinput = HOPc(locinput, -1);
9646            goto curly_try_B_max;
9647
9648#undef ST
9649
9650        case END: /*  last op of main pattern  */
9651          fake_end:
9652            if (cur_eval) {
9653                /* we've just finished A in /(??{A})B/; now continue with B */
9654                is_accepted= false;
9655                SET_RECURSE_LOCINPUT("FAKE-END[before]", CUR_EVAL.prev_recurse_locinput);
9656                st->u.eval.prev_rex = rex_sv;		/* inner */
9657
9658                /* Save *all* the positions. */
9659                st->u.eval.cp = regcppush(rex, 0, maxopenparen);
9660                rex_sv = CUR_EVAL.prev_rex;
9661                is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
9662                SET_reg_curpm(rex_sv);
9663                rex = ReANY(rex_sv);
9664                rexi = RXi_GET(rex);
9665
9666                st->u.eval.prev_curlyx = cur_curlyx;
9667                cur_curlyx = CUR_EVAL.prev_curlyx;
9668
9669                REGCP_SET(st->u.eval.lastcp);
9670
9671                /* Restore parens of the outer rex without popping the
9672                 * savestack */
9673                regcp_restore(rex, CUR_EVAL.lastcp, &maxopenparen);
9674
9675                st->u.eval.prev_eval = cur_eval;
9676                cur_eval = CUR_EVAL.prev_eval;
9677                DEBUG_EXECUTE_r(
9678                    Perl_re_exec_indentf( aTHX_  "END: EVAL trying tail ... (cur_eval=%p)\n",
9679                                      depth, cur_eval););
9680                if ( nochange_depth )
9681                    nochange_depth--;
9682
9683                SET_RECURSE_LOCINPUT("FAKE-END[after]", cur_eval->locinput);
9684
9685                PUSH_YES_STATE_GOTO(EVAL_postponed_AB,          /* match B */
9686                                    st->u.eval.prev_eval->u.eval.B,
9687                                    locinput, loceol, script_run_begin);
9688            }
9689
9690            if (locinput < reginfo->till) {
9691                DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
9692                                      "%sEND: Match possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
9693                                      PL_colors[4],
9694                                      (long)(locinput - startpos),
9695                                      (long)(reginfo->till - startpos),
9696                                      PL_colors[5]));
9697
9698                sayNO_SILENT;		/* Cannot match: too short. */
9699            }
9700            sayYES;			/* Success! */
9701
9702        case LOOKBEHIND_END: /* validate that *lookbehind* UNLESSM/IFMATCH
9703                                matches end at the right spot, required for
9704                                variable length matches. */
9705            if (match_end && locinput != match_end)
9706            {
9707                DEBUG_EXECUTE_r(
9708                Perl_re_exec_indentf( aTHX_
9709                    "%sLOOKBEHIND_END: subpattern failed...%s\n",
9710                    depth, PL_colors[4], PL_colors[5]));
9711                sayNO;            /* Variable length match didn't line up */
9712            }
9713            /* FALLTHROUGH */
9714
9715        case SUCCEED: /* successful SUSPEND/CURLYM and
9716                                            *lookahead* IFMATCH/UNLESSM*/
9717            DEBUG_EXECUTE_r(
9718            Perl_re_exec_indentf( aTHX_
9719                "%sSUCCEED: subpattern success...%s\n",
9720                depth, PL_colors[4], PL_colors[5]));
9721            sayYES;			/* Success! */
9722
9723#undef  ST
9724#define ST st->u.ifmatch
9725
9726        case SUSPEND:	/* (?>A) */
9727            ST.wanted = 1;
9728            ST.start = locinput;
9729            ST.end = loceol;
9730            ST.count = 1;
9731            goto do_ifmatch;
9732
9733        case UNLESSM:	/* -ve lookaround: (?!A), or with 'flags', (?<!A) */
9734            ST.wanted = 0;
9735            goto ifmatch_trivial_fail_test;
9736
9737        case IFMATCH:	/* +ve lookaround: (?=A), or with 'flags', (?<=A) */
9738            ST.wanted = 1;
9739          ifmatch_trivial_fail_test:
9740            ST.prev_match_end= match_end;
9741            ST.count = NEXT_OFF(scan) + 1; /* next_off repurposed to be
9742                                              lookbehind count, requires
9743                                              non-zero flags */
9744            if (! FLAGS(scan)) {    /* 'flags' zero means lookahed */
9745
9746                /* Lookahead starts here and ends at the normal place */
9747                ST.start = locinput;
9748                ST.end = loceol;
9749                match_end = NULL;
9750            }
9751            else {
9752                PERL_UINT_FAST8_T back_count = FLAGS(scan);
9753                char * s;
9754                match_end = locinput;
9755
9756                /* Lookbehind can look beyond the current position */
9757                ST.end = loceol;
9758
9759                /* ... and starts at the first place in the input that is in
9760                 * the range of the possible start positions */
9761                for (; ST.count > 0; ST.count--, back_count--) {
9762                    s = HOPBACKc(locinput, back_count);
9763                    if (s) {
9764                        ST.start = s;
9765                        goto do_ifmatch;
9766                    }
9767                }
9768
9769                /* If the lookbehind doesn't start in the actual string, is a
9770                 * trivial match failure */
9771                match_end = ST.prev_match_end;
9772                if (logical) {
9773                    logical = 0;
9774                    sw = 1 - cBOOL(ST.wanted);
9775                }
9776                else if (ST.wanted)
9777                    sayNO;
9778
9779                /* Here, we didn't want it to match, so is actually success */
9780                next = scan + ARG1u(scan);
9781                if (next == scan)
9782                    next = NULL;
9783                break;
9784            }
9785
9786          do_ifmatch:
9787            ST.me = scan;
9788            ST.logical = logical;
9789            logical = 0; /* XXX: reset state of logical once it has been saved into ST */
9790
9791            /* execute body of (?...A) */
9792            PUSH_YES_STATE_GOTO(IFMATCH_A, REGNODE_AFTER(scan), ST.start,
9793                                ST.end, script_run_begin);
9794            NOT_REACHED; /* NOTREACHED */
9795
9796        {
9797            bool matched;
9798
9799        case IFMATCH_A_fail: /* body of (?...A) failed */
9800            if (! ST.logical && ST.count > 1) {
9801
9802                /* It isn't a real failure until we've tried all starting
9803                 * positions.  Move to the next starting position and retry */
9804                ST.count--;
9805                ST.start = HOPc(ST.start, 1);
9806                scan = ST.me;
9807                logical = ST.logical;
9808                goto do_ifmatch;
9809            }
9810
9811            /* Here, all starting positions have been tried. */
9812            matched = FALSE;
9813            goto ifmatch_done;
9814
9815        case IFMATCH_A: /* body of (?...A) succeeded */
9816            matched = TRUE;
9817          ifmatch_done:
9818            sw = matched == ST.wanted;
9819            match_end = ST.prev_match_end;
9820            if (! ST.logical && !sw) {
9821                sayNO;
9822            }
9823
9824            if (OP(ST.me) != SUSPEND) {
9825                /* restore old position except for (?>...) */
9826                locinput = st->locinput;
9827                loceol = st->loceol;
9828                script_run_begin = st->sr0;
9829            }
9830            scan = ST.me + ARG1u(ST.me);
9831            if (scan == ST.me)
9832                scan = NULL;
9833            continue; /* execute B */
9834        }
9835
9836#undef ST
9837
9838        case LONGJMP: /*  alternative with many branches compiles to
9839                       * (BRANCHJ; EXACT ...; LONGJMP ) x N */
9840            next = scan + ARG1u(scan);
9841            if (next == scan)
9842                next = NULL;
9843            break;
9844
9845        case COMMIT:  /*  (*COMMIT)  */
9846            reginfo->cutpoint = loceol;
9847            /* FALLTHROUGH */
9848
9849        case PRUNE:   /*  (*PRUNE)   */
9850            if (FLAGS(scan))
9851                sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG1u( scan ) ]);
9852            PUSH_STATE_GOTO(COMMIT_next, next, locinput, loceol,
9853                            script_run_begin);
9854            NOT_REACHED; /* NOTREACHED */
9855
9856        case COMMIT_next_fail:
9857            no_final = 1;
9858            /* FALLTHROUGH */
9859            sayNO;
9860            NOT_REACHED; /* NOTREACHED */
9861
9862        case OPFAIL:   /* (*FAIL)  */
9863            if (FLAGS(scan))
9864                sv_commit = MUTABLE_SV(rexi->data->data[ ARG1u( scan ) ]);
9865            if (logical) {
9866                /* deal with (?(?!)X|Y) properly,
9867                 * make sure we trigger the no branch
9868                 * of the trailing IFTHEN structure*/
9869                sw= 0;
9870                break;
9871            } else {
9872                sayNO;
9873            }
9874            NOT_REACHED; /* NOTREACHED */
9875
9876#define ST st->u.mark
9877        case MARKPOINT: /*  (*MARK:foo)  */
9878            ST.prev_mark = mark_state;
9879            ST.mark_name = sv_commit = sv_yes_mark
9880                = MUTABLE_SV(rexi->data->data[ ARG1u( scan ) ]);
9881            mark_state = st;
9882            ST.mark_loc = locinput;
9883            PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput, loceol,
9884                                script_run_begin);
9885            NOT_REACHED; /* NOTREACHED */
9886
9887        case MARKPOINT_next:
9888            mark_state = ST.prev_mark;
9889            sayYES;
9890            NOT_REACHED; /* NOTREACHED */
9891
9892        case MARKPOINT_next_fail:
9893            if (popmark && sv_eq(ST.mark_name,popmark))
9894            {
9895                if (ST.mark_loc > startpoint)
9896                    reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
9897                popmark = NULL; /* we found our mark */
9898                sv_commit = ST.mark_name;
9899
9900                DEBUG_EXECUTE_r({
9901                        Perl_re_exec_indentf( aTHX_  "%sMARKPOINT: next fail: setting cutpoint to mark:%" SVf "...%s\n",
9902                            depth,
9903                            PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
9904                });
9905            }
9906            mark_state = ST.prev_mark;
9907            sv_yes_mark = mark_state ?
9908                mark_state->u.mark.mark_name : NULL;
9909            sayNO;
9910            NOT_REACHED; /* NOTREACHED */
9911
9912        case SKIP:  /*  (*SKIP)  */
9913            if (!FLAGS(scan)) {
9914                /* (*SKIP) : if we fail we cut here*/
9915                ST.mark_name = NULL;
9916                ST.mark_loc = locinput;
9917                PUSH_STATE_GOTO(SKIP_next,next, locinput, loceol,
9918                                script_run_begin);
9919            } else {
9920                /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
9921                   otherwise do nothing.  Meaning we need to scan
9922                 */
9923                regmatch_state *cur = mark_state;
9924                SV *find = MUTABLE_SV(rexi->data->data[ ARG1u( scan ) ]);
9925
9926                while (cur) {
9927                    if ( sv_eq( cur->u.mark.mark_name,
9928                                find ) )
9929                    {
9930                        ST.mark_name = find;
9931                        PUSH_STATE_GOTO( SKIP_next, next, locinput, loceol,
9932                                         script_run_begin);
9933                    }
9934                    cur = cur->u.mark.prev_mark;
9935                }
9936            }
9937            /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
9938            break;
9939
9940        case SKIP_next_fail:
9941            if (ST.mark_name) {
9942                /* (*CUT:NAME) - Set up to search for the name as we
9943                   collapse the stack*/
9944                popmark = ST.mark_name;
9945            } else {
9946                /* (*CUT) - No name, we cut here.*/
9947                if (ST.mark_loc > startpoint)
9948                    reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
9949                /* but we set sv_commit to latest mark_name if there
9950                   is one so they can test to see how things lead to this
9951                   cut */
9952                if (mark_state)
9953                    sv_commit=mark_state->u.mark.mark_name;
9954            }
9955            no_final = 1;
9956            sayNO;
9957            NOT_REACHED; /* NOTREACHED */
9958#undef ST
9959
9960        case LNBREAK: /* \R */
9961            if ((n=is_LNBREAK_safe(locinput, loceol, utf8_target))) {
9962                locinput += n;
9963            } else
9964                sayNO;
9965            break;
9966
9967        default:
9968            PerlIO_printf(Perl_error_log, "%" UVxf " %d\n",
9969                          PTR2UV(scan), OP(scan));
9970            Perl_croak(aTHX_ "regexp memory corruption");
9971
9972        /* this is a point to jump to in order to increment
9973         * locinput by one character */
9974          increment_locinput:
9975            assert(!NEXTCHR_IS_EOS);
9976            if (utf8_target) {
9977                locinput += PL_utf8skip[nextbyte];
9978                /* locinput is allowed to go 1 char off the end (signifying
9979                 * EOS), but not 2+ */
9980                if (locinput >  loceol)
9981                    sayNO;
9982            }
9983            else
9984                locinput++;
9985            break;
9986
9987        } /* end switch */
9988
9989        /* switch break jumps here */
9990        scan = next; /* prepare to execute the next op and ... */
9991        continue;    /* ... jump back to the top, reusing st */
9992        /* NOTREACHED */
9993
9994      push_yes_state:
9995        /* push a state that backtracks on success */
9996        st->u.yes.prev_yes_state = yes_state;
9997        yes_state = st;
9998        /* FALLTHROUGH */
9999      push_state:
10000        /* push a new regex state, then continue at scan  */
10001        {
10002            regmatch_state *newst;
10003            DECLARE_AND_GET_RE_DEBUG_FLAGS;
10004
10005            DEBUG_r( /* DEBUG_STACK_r */
10006              if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_EXTRA_STACK)) {
10007                regmatch_state *cur = st;
10008                regmatch_state *curyes = yes_state;
10009                U32 i;
10010                regmatch_slab *slab = PL_regmatch_slab;
10011                for (i = 0; i < 3 && i <= depth; cur--,i++) {
10012                    if (cur < SLAB_FIRST(slab)) {
10013                        slab = slab->prev;
10014                        cur = SLAB_LAST(slab);
10015                    }
10016                    Perl_re_exec_indentf( aTHX_ "%4s #%-3d %-10s %s\n",
10017                        depth,
10018                        i ? "    " : "push",
10019                        depth - i, REGNODE_NAME(cur->resume_state),
10020                        (curyes == cur) ? "yes" : ""
10021                    );
10022                    if (curyes == cur)
10023                        curyes = cur->u.yes.prev_yes_state;
10024                }
10025            } else {
10026                DEBUG_STATE_pp("push")
10027            });
10028            depth++;
10029            st->locinput = locinput;
10030            st->loceol = loceol;
10031            st->sr0 = script_run_begin;
10032            newst = st+1;
10033            if (newst >  SLAB_LAST(PL_regmatch_slab))
10034                newst = S_push_slab(aTHX);
10035            PL_regmatch_state = newst;
10036
10037            locinput = pushinput;
10038            loceol = pusheol;
10039            script_run_begin = pushsr0;
10040            st = newst;
10041            continue;
10042            /* NOTREACHED */
10043        }
10044    }
10045#ifdef SOLARIS_BAD_OPTIMIZER
10046#  undef PL_charclass
10047#endif
10048
10049    /*
10050    * We get here only if there's trouble -- normally "case END" is
10051    * the terminating point.
10052    */
10053    Perl_croak(aTHX_ "corrupted regexp pointers");
10054    NOT_REACHED; /* NOTREACHED */
10055
10056  yes:
10057    if (yes_state) {
10058        /* we have successfully completed a subexpression, but we must now
10059         * pop to the state marked by yes_state and continue from there */
10060        assert(st != yes_state);
10061#ifdef DEBUGGING
10062        while (st != yes_state) {
10063            st--;
10064            if (st < SLAB_FIRST(PL_regmatch_slab)) {
10065                PL_regmatch_slab = PL_regmatch_slab->prev;
10066                st = SLAB_LAST(PL_regmatch_slab);
10067            }
10068            DEBUG_STATE_r({
10069                if (no_final) {
10070                    DEBUG_STATE_pp("pop (no final)");
10071                } else {
10072                    DEBUG_STATE_pp("pop (yes)");
10073                }
10074            });
10075            depth--;
10076        }
10077#else
10078        while (yes_state < SLAB_FIRST(PL_regmatch_slab)
10079            || yes_state > SLAB_LAST(PL_regmatch_slab))
10080        {
10081            /* not in this slab, pop slab */
10082            depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
10083            PL_regmatch_slab = PL_regmatch_slab->prev;
10084            st = SLAB_LAST(PL_regmatch_slab);
10085        }
10086        depth -= (st - yes_state);
10087#endif
10088        st = yes_state;
10089        yes_state = st->u.yes.prev_yes_state;
10090        PL_regmatch_state = st;
10091
10092        if (no_final) {
10093            locinput= st->locinput;
10094            loceol= st->loceol;
10095            script_run_begin = st->sr0;
10096        }
10097        state_num = st->resume_state + no_final;
10098        goto reenter_switch;
10099    }
10100
10101    DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch successful!%s\n",
10102                          PL_colors[4], PL_colors[5]));
10103
10104    if (reginfo->info_aux_eval) {
10105        /* each successfully executed (?{...}) block does the equivalent of
10106         *   local $^R = do {...}
10107         * When popping the save stack, all these locals would be undone;
10108         * bypass this by setting the outermost saved $^R to the latest
10109         * value */
10110        /* I don't know if this is needed or works properly now.
10111         * see code related to PL_replgv elsewhere in this file.
10112         * Yves
10113         */
10114        if (oreplsv != GvSV(PL_replgv)) {
10115            sv_setsv(oreplsv, GvSV(PL_replgv));
10116            SvSETMAGIC(oreplsv);
10117        }
10118    }
10119    result = 1;
10120    goto final_exit;
10121
10122  no:
10123    DEBUG_EXECUTE_r(
10124        Perl_re_exec_indentf( aTHX_  "%sfailed...%s\n",
10125            depth,
10126            PL_colors[4], PL_colors[5])
10127        );
10128
10129  no_silent:
10130    if (no_final) {
10131        if (yes_state) {
10132            goto yes;
10133        } else {
10134            goto final_exit;
10135        }
10136    }
10137    if (depth) {
10138        /* there's a previous state to backtrack to */
10139        st--;
10140        if (st < SLAB_FIRST(PL_regmatch_slab)) {
10141            PL_regmatch_slab = PL_regmatch_slab->prev;
10142            st = SLAB_LAST(PL_regmatch_slab);
10143        }
10144        PL_regmatch_state = st;
10145        locinput= st->locinput;
10146        loceol= st->loceol;
10147        script_run_begin = st->sr0;
10148
10149        DEBUG_STATE_pp("pop");
10150        depth--;
10151        if (yes_state == st)
10152            yes_state = st->u.yes.prev_yes_state;
10153
10154        state_num = st->resume_state + 1; /* failure = success + 1 */
10155        PERL_ASYNC_CHECK();
10156        goto reenter_switch;
10157    }
10158    result = 0;
10159
10160  final_exit:
10161    if (rex->intflags & PREGf_VERBARG_SEEN) {
10162        SV *sv_err = get_sv("REGERROR", 1);
10163        SV *sv_mrk = get_sv("REGMARK", 1);
10164        if (result) {
10165            sv_commit = &PL_sv_no;
10166            if (!sv_yes_mark)
10167                sv_yes_mark = &PL_sv_yes;
10168        } else {
10169            if (!sv_commit)
10170                sv_commit = &PL_sv_yes;
10171            sv_yes_mark = &PL_sv_no;
10172        }
10173        assert(sv_err);
10174        assert(sv_mrk);
10175        sv_setsv(sv_err, sv_commit);
10176        sv_setsv(sv_mrk, sv_yes_mark);
10177    }
10178
10179
10180    if (last_pushed_cv) {
10181        dSP;
10182        /* see "Some notes about MULTICALL" above */
10183        POP_MULTICALL;
10184        PERL_UNUSED_VAR(SP);
10185    }
10186    else
10187        LEAVE_SCOPE(orig_savestack_ix);
10188
10189    assert(!result ||  locinput - reginfo->strbeg >= 0);
10190    return result ?  locinput - reginfo->strbeg : -1;
10191}
10192
10193/*
10194 - regrepeat - repeatedly match something simple, report how many
10195 *
10196 * What 'simple' means is a node which can be the operand of a quantifier like
10197 * '+', or {1,3}
10198 *
10199 * startposp - pointer to a pointer to the start position.  This is updated
10200 *             to point to the byte following the highest successful
10201 *             match.
10202 * p         - the regnode to be repeatedly matched against.
10203 * loceol    - pointer to the end position beyond which we aren't supposed to
10204 *             look.
10205 * reginfo   - struct holding match state, such as utf8_target
10206 * max       - maximum number of things to match.
10207 * depth     - (for debugging) backtracking depth.
10208 */
10209STATIC I32
10210S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
10211            char * loceol, regmatch_info *const reginfo, I32 max comma_pDEPTH)
10212{
10213    char *scan;     /* Pointer to current position in target string */
10214    I32 c;
10215    char *this_eol = loceol;   /* potentially adjusted version. */
10216    I32 hardcount = 0;  /* How many matches so far */
10217    bool utf8_target = reginfo->is_utf8_target;
10218    unsigned int to_complement = 0;  /* Invert the result? */
10219    char_class_number_ classnum;
10220
10221    PERL_ARGS_ASSERT_REGREPEAT;
10222
10223    /* This routine is structured so that we switch on the input OP.  Each OP
10224     * case: statement contains a loop to repeatedly apply the OP, advancing
10225     * the input until it fails, or reaches the end of the input, or until it
10226     * reaches the upper limit of matches. */
10227
10228    scan = *startposp;
10229    if (max == REG_INFTY)   /* This is a special marker to go to the platform's
10230                               max */
10231        max = I32_MAX;
10232    else if (! utf8_target && this_eol - scan > max)
10233        this_eol = scan + max;
10234
10235    /* Here, for the case of a non-UTF-8 target we have adjusted <this_eol>
10236     * down to the maximum of how far we should go in it (but leaving it set to
10237     * the real end if the maximum permissible would take us beyond that).
10238     * This allows us to make the loop exit condition that we haven't gone past
10239     * <this_eol> to also mean that we haven't exceeded the max permissible
10240     * count, saving a test each time through the loop.  But it assumes that
10241     * the OP matches a single byte, which is true for most of the OPs below
10242     * when applied to a non-UTF-8 target.  Those relatively few OPs that don't
10243     * have this characteristic have to compensate.
10244     *
10245     * There is no such adjustment for UTF-8 targets, since the number of bytes
10246     * per character can vary.  OPs will have to test both that the count is
10247     * less than the max permissible (using <hardcount> to keep track), and
10248     * that we are still within the bounds of the string (using <this_eol>.  A
10249     * few OPs match a single byte no matter what the encoding.  They can omit
10250     * the max test if, for the UTF-8 case, they do the adjustment that was
10251     * skipped above.
10252     *
10253     * Thus, the code above sets things up for the common case; and exceptional
10254     * cases need extra work; the common case is to make sure <scan> doesn't go
10255     * past <this_eol>, and for UTF-8 to also use <hardcount> to make sure the
10256     * count doesn't exceed the maximum permissible */
10257
10258    switch (with_t_UTF8ness(OP(p), utf8_target)) {
10259        SV * anyofh_list;
10260
10261      case REG_ANY_t8:
10262        while (scan < this_eol && hardcount < max && *scan != '\n') {
10263            scan += UTF8SKIP(scan);
10264            hardcount++;
10265        }
10266        break;
10267
10268      case REG_ANY_tb:
10269        scan = (char *) memchr(scan, '\n', this_eol - scan);
10270        if (! scan) {
10271            scan = this_eol;
10272        }
10273        break;
10274
10275      case SANY_t8:
10276        while (scan < this_eol && hardcount < max) {
10277            scan += UTF8SKIP(scan);
10278            hardcount++;
10279        }
10280        break;
10281
10282      case SANY_tb:
10283        scan = this_eol;
10284        break;
10285
10286      case EXACT_REQ8_tb:
10287      case LEXACT_REQ8_tb:
10288      case EXACTFU_REQ8_tb:
10289        break;
10290
10291      case EXACTL_t8:
10292        if (UTF8_IS_ABOVE_LATIN1(*scan)) {
10293            _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol);
10294        }
10295        /* FALLTHROUGH */
10296
10297      case EXACTL_tb:
10298      case EXACTFL_t8:
10299      case EXACTFL_tb:
10300      case EXACTFLU8_t8:
10301      case EXACTFLU8_tb:
10302        CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
10303        /* FALLTHROUGH */
10304
10305      case EXACT_REQ8_t8:
10306      case LEXACT_REQ8_t8:
10307      case EXACTFU_REQ8_t8:
10308      case LEXACT_t8:
10309      case LEXACT_tb:
10310      case EXACT_t8:
10311      case EXACT_tb:
10312      case EXACTF_t8:
10313      case EXACTF_tb:
10314      case EXACTFAA_NO_TRIE_t8:
10315      case EXACTFAA_NO_TRIE_tb:
10316      case EXACTFAA_t8:
10317      case EXACTFAA_tb:
10318      case EXACTFU_t8:
10319      case EXACTFU_tb:
10320      case EXACTFUP_t8:
10321      case EXACTFUP_tb:
10322
10323      {
10324        struct next_matchable_info Binfo;
10325        PERL_UINT_FAST8_T definitive_len;
10326
10327        assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
10328
10329        /* Set up termination info, and quit if we can rule out that we've
10330         * gotten a match of the termination criteria */
10331        if (   ! S_setup_EXACTISH_ST(aTHX_ p, &Binfo, reginfo)
10332            ||   scan + Binfo.min_length > this_eol
10333            || ! S_test_EXACTISH_ST(scan, Binfo))
10334        {
10335            break;
10336        }
10337
10338        definitive_len = Binfo.initial_definitive;
10339
10340        /* Here there are potential matches, and the first byte(s) matched our
10341         * filter
10342         *
10343         * If we got a definitive match of some initial bytes, there is no
10344         * possibility of false positives as far as it got */
10345        if (definitive_len > 0) {
10346
10347            /* If as far as it got is the maximum possible, there were no false
10348             * positives at all.  Since we have everything set up, see how many
10349             * repeats there are. */
10350            if (definitive_len >= Binfo.max_length) {
10351
10352                /* We've already found one match */
10353                scan += definitive_len;
10354                hardcount++;
10355
10356                /* If want more than the one match, and there is room for more,
10357                 * see if there are any */
10358                if (hardcount < max && scan + definitive_len <= this_eol) {
10359
10360                    /* If the character is only a single byte long, just span
10361                     * all such bytes. */
10362                    if (definitive_len == 1) {
10363                        const char * orig_scan = scan;
10364
10365                        if (this_eol - (scan - hardcount) > max) {
10366                            this_eol = scan - hardcount + max;
10367                        }
10368
10369                        /* Use different routines depending on whether it's an
10370                         * exact match or matches with a mask */
10371                        if (Binfo.initial_exact == 1) {
10372                            scan = (char *) find_span_end((U8 *) scan,
10373                                                          (U8 *) this_eol,
10374                                                          Binfo.matches[0]);
10375                        }
10376                        else {
10377                            scan = (char *) find_span_end_mask(
10378                                                       (U8 *) scan,
10379                                                       (U8 *) this_eol,
10380                                                       Binfo.first_byte_anded,
10381                                                       Binfo.first_byte_mask);
10382                        }
10383
10384                        hardcount += scan - orig_scan;
10385                    }
10386                    else { /* Here, the full character definitive match is more
10387                              than one byte */
10388                        while (   hardcount < max
10389                               && scan + definitive_len <= this_eol
10390                               && S_test_EXACTISH_ST(scan, Binfo))
10391                        {
10392                                scan += definitive_len;
10393                                hardcount++;
10394                        }
10395                    }
10396                }
10397
10398                break;
10399            }   /* End of a full character is definitively matched */
10400
10401            /* Here, an initial portion of the character matched definitively,
10402             * and the rest matched as well, but could have false positives */
10403
10404            do {
10405                int i;
10406                U8 * matches = Binfo.matches;
10407
10408                /* The first bytes were definitive.  Look at the remaining */
10409                for (i = 0; i < Binfo.count; i++) {
10410                    if (memEQ(scan + definitive_len,
10411                              matches + definitive_len,
10412                              Binfo.lengths[i] - definitive_len))
10413                    {
10414                        goto found_a_completion;
10415                    }
10416
10417                    matches += Binfo.lengths[i];
10418                }
10419
10420                /* Didn't find anything to complete our initial match.  Stop
10421                 * here */
10422                break;
10423
10424              found_a_completion:
10425
10426                /* Here, matched a full character, Include it in the result,
10427                 * and then look to see if the next char matches */
10428                hardcount++;
10429                scan += Binfo.lengths[i];
10430
10431            } while (   hardcount < max
10432                     && scan + definitive_len < this_eol
10433                     && S_test_EXACTISH_ST(scan, Binfo));
10434
10435            /* Here, have advanced as far as possible */
10436            break;
10437        } /* End of found some initial bytes that definitively matched */
10438
10439        /* Here, we can't rule out that we have found the beginning of 'B', but
10440         * there were no initial bytes that could rule out anything
10441         * definitively. Use brute force to examine all the possibilities */
10442        while (scan < this_eol && hardcount < max) {
10443            int i;
10444            U8 * matches = Binfo.matches;
10445
10446            for (i = 0; i < Binfo.count; i++) {
10447                if (memEQ(scan, matches, Binfo.lengths[i])) {
10448                    goto found1;
10449                }
10450
10451                matches += Binfo.lengths[i];
10452            }
10453
10454            break;
10455
10456          found1:
10457            hardcount++;
10458            scan += Binfo.lengths[i];
10459        }
10460
10461        break;
10462      }
10463
10464      case ANYOFPOSIXL_t8:
10465      case ANYOFL_t8:
10466        CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
10467        CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(p);
10468
10469        /* FALLTHROUGH */
10470      case ANYOFD_t8:
10471      case ANYOF_t8:
10472        while (   hardcount < max
10473               && scan < this_eol
10474               && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
10475        {
10476            scan += UTF8SKIP(scan);
10477            hardcount++;
10478        }
10479        break;
10480
10481      case ANYOFPOSIXL_tb:
10482      case ANYOFL_tb:
10483        CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
10484        CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(p);
10485        /* FALLTHROUGH */
10486
10487      case ANYOFD_tb:
10488      case ANYOF_tb:
10489        if (ANYOF_FLAGS(p) || ANYOF_HAS_AUX(p)) {
10490            while (   scan < this_eol
10491                   && reginclass(prog, p, (U8*)scan, (U8*)scan+1, 0))
10492                scan++;
10493        }
10494        else {
10495            while (scan < this_eol && ANYOF_BITMAP_TEST(p, *((U8*)scan)))
10496                scan++;
10497        }
10498        break;
10499
10500      case ANYOFM_t8:
10501        if (this_eol - scan > max) {
10502
10503            /* We didn't adjust <this_eol> at the beginning of this routine
10504             * because is UTF-8, but it is actually ok to do so, since here, to
10505             * match, 1 char == 1 byte. */
10506            this_eol = scan + max;
10507        }
10508        /* FALLTHROUGH */
10509
10510      case ANYOFM_tb:
10511        scan = (char *) find_span_end_mask((U8 *) scan, (U8 *) this_eol,
10512                                           (U8) ARG1u(p), FLAGS(p));
10513        break;
10514
10515      case NANYOFM_t8:
10516        while (     hardcount < max
10517               &&   scan < this_eol
10518               &&  (*scan & FLAGS(p)) != ARG1u(p))
10519        {
10520            scan += UTF8SKIP(scan);
10521            hardcount++;
10522        }
10523        break;
10524
10525      case NANYOFM_tb:
10526        scan = (char *) find_next_masked((U8 *) scan, (U8 *) this_eol,
10527                                         (U8) ARG1u(p), FLAGS(p));
10528        break;
10529
10530      case ANYOFH_tb: /* ANYOFH only can match UTF-8 targets */
10531      case ANYOFHb_tb:
10532      case ANYOFHbbm_tb:
10533      case ANYOFHr_tb:
10534      case ANYOFHs_tb:
10535        break;
10536
10537      case ANYOFH_t8:
10538        anyofh_list = GET_ANYOFH_INVLIST(prog, p);
10539        while (  hardcount < max
10540               && scan < this_eol
10541               && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
10542               && _invlist_contains_cp(anyofh_list,
10543                                             utf8_to_uvchr_buf((U8 *) scan,
10544                                                               (U8 *) this_eol,
10545                                                               NULL)))
10546        {
10547            scan += UTF8SKIP(scan);
10548            hardcount++;
10549        }
10550        break;
10551
10552      case ANYOFHb_t8:
10553        /* we know the first byte must be the FLAGS field */
10554        anyofh_list = GET_ANYOFH_INVLIST(prog, p);
10555        while (   hardcount < max
10556               && scan < this_eol
10557               && (U8) *scan == ANYOF_FLAGS(p)
10558               && _invlist_contains_cp(anyofh_list,
10559                                             utf8_to_uvchr_buf((U8 *) scan,
10560                                                               (U8 *) this_eol,
10561                                                               NULL)))
10562        {
10563            scan += UTF8SKIP(scan);
10564            hardcount++;
10565        }
10566        break;
10567
10568      case ANYOFHbbm_t8:
10569        while (   hardcount < max
10570               && scan + 1 < this_eol
10571               && (U8) *scan == ANYOF_FLAGS(p)
10572               && BITMAP_TEST(( (struct regnode_bbm *) p)->bitmap,
10573                                (U8) scan[1] & UTF_CONTINUATION_MASK))
10574        {
10575            scan += 2;  /* This node only matces 2-byte UTF-8 */
10576            hardcount++;
10577        }
10578        break;
10579
10580      case ANYOFHr_t8:
10581        anyofh_list = GET_ANYOFH_INVLIST(prog, p);
10582        while (  hardcount < max
10583               && scan < this_eol
10584               && inRANGE(NATIVE_UTF8_TO_I8(*scan),
10585                          LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p)),
10586                          HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p)))
10587               && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
10588               && _invlist_contains_cp(anyofh_list,
10589                                             utf8_to_uvchr_buf((U8 *) scan,
10590                                                               (U8 *) this_eol,
10591                                                               NULL)))
10592        {
10593            scan += UTF8SKIP(scan);
10594            hardcount++;
10595        }
10596        break;
10597
10598      case ANYOFHs_t8:
10599        anyofh_list = GET_ANYOFH_INVLIST(prog, p);
10600        while (   hardcount < max
10601               && scan + FLAGS(p) < this_eol
10602               && memEQ(scan, ((struct regnode_anyofhs *) p)->string, FLAGS(p))
10603               && _invlist_contains_cp(anyofh_list,
10604                                             utf8_to_uvchr_buf((U8 *) scan,
10605                                                               (U8 *) this_eol,
10606                                                               NULL)))
10607        {
10608            scan += UTF8SKIP(scan);
10609            hardcount++;
10610        }
10611        break;
10612
10613      case ANYOFR_t8:
10614        while (   hardcount < max
10615               && scan < this_eol
10616               && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
10617               && withinCOUNT(utf8_to_uvchr_buf((U8 *) scan,
10618                                            (U8 *) this_eol,
10619                                            NULL),
10620                              ANYOFRbase(p), ANYOFRdelta(p)))
10621        {
10622            scan += UTF8SKIP(scan);
10623            hardcount++;
10624        }
10625        break;
10626
10627      case ANYOFR_tb:
10628        while (   hardcount < max
10629               && scan < this_eol
10630               && withinCOUNT((U8) *scan, ANYOFRbase(p), ANYOFRdelta(p)))
10631        {
10632            scan++;
10633            hardcount++;
10634        }
10635        break;
10636
10637      case ANYOFRb_t8:
10638        while (   hardcount < max
10639               && scan < this_eol
10640               && (U8) *scan == ANYOF_FLAGS(p)
10641               && withinCOUNT(utf8_to_uvchr_buf((U8 *) scan,
10642                                            (U8 *) this_eol,
10643                                            NULL),
10644                              ANYOFRbase(p), ANYOFRdelta(p)))
10645        {
10646            scan += UTF8SKIP(scan);
10647            hardcount++;
10648        }
10649        break;
10650
10651      case ANYOFRb_tb:
10652        while (   hardcount < max
10653               && scan < this_eol
10654               && withinCOUNT((U8) *scan, ANYOFRbase(p), ANYOFRdelta(p)))
10655        {
10656            scan++;
10657            hardcount++;
10658        }
10659        break;
10660
10661    /* The argument (FLAGS) to all the POSIX node types is the class number */
10662
10663      case NPOSIXL_tb:
10664        to_complement = 1;
10665        /* FALLTHROUGH */
10666
10667      case POSIXL_tb:
10668        CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
10669        while (   scan < this_eol
10670               && to_complement ^ cBOOL(isFOO_lc(FLAGS(p), *scan)))
10671        {
10672            scan++;
10673        }
10674        break;
10675
10676      case NPOSIXL_t8:
10677        to_complement = 1;
10678        /* FALLTHROUGH */
10679
10680      case POSIXL_t8:
10681        while (   hardcount < max && scan < this_eol
10682               && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
10683                                                      (U8 *) scan,
10684                                                      (U8 *) this_eol)))
10685        {
10686            scan += UTF8SKIP(scan);
10687            hardcount++;
10688        }
10689        break;
10690
10691      case POSIXD_tb:
10692        /* FALLTHROUGH */
10693
10694      case POSIXA_t8:
10695        if (this_eol - scan > max) {
10696
10697            /* We didn't adjust <this_eol> at the beginning of this routine
10698             * because is UTF-8, but it is actually ok to do so, since here, to
10699             * match, 1 char == 1 byte. */
10700            this_eol = scan + max;
10701        }
10702        /* FALLTHROUGH */
10703
10704      case POSIXA_tb:
10705        while (scan < this_eol && generic_isCC_A_((U8) *scan, FLAGS(p))) {
10706            scan++;
10707        }
10708        break;
10709
10710      case NPOSIXD_tb:
10711        /* FALLTHROUGH */
10712
10713      case NPOSIXA_tb:
10714        while (scan < this_eol && ! generic_isCC_A_((U8) *scan, FLAGS(p))) {
10715            scan++;
10716        }
10717        break;
10718
10719      case NPOSIXA_t8:
10720
10721        /* The complement of something that matches only ASCII matches all
10722         * non-ASCII, plus everything in ASCII that isn't in the class. */
10723        while (   hardcount < max && scan < this_eol
10724               && (   ! isASCII_utf8_safe(scan, loceol)
10725                   || ! generic_isCC_A_((U8) *scan, FLAGS(p))))
10726            {
10727                scan += UTF8SKIP(scan);
10728                hardcount++;
10729            }
10730        break;
10731
10732      case NPOSIXU_tb:
10733        to_complement = 1;
10734        /* FALLTHROUGH */
10735
10736      case POSIXU_tb:
10737        while (   scan < this_eol
10738               && to_complement ^ cBOOL(generic_isCC_((U8) *scan, FLAGS(p))))
10739        {
10740            scan++;
10741        }
10742        break;
10743
10744      case NPOSIXU_t8:
10745      case NPOSIXD_t8:
10746        to_complement = 1;
10747        /* FALLTHROUGH */
10748
10749      case POSIXD_t8:
10750      case POSIXU_t8:
10751        classnum = (char_class_number_) FLAGS(p);
10752        switch (classnum) {
10753          default:
10754            while (   hardcount < max && scan < this_eol
10755                   && to_complement
10756                    ^ cBOOL(_invlist_contains_cp(PL_XPosix_ptrs[classnum],
10757                       utf8_to_uvchr_buf((U8 *) scan, (U8 *) this_eol, NULL))))
10758            {
10759                scan += UTF8SKIP(scan);
10760                hardcount++;
10761            }
10762            break;
10763
10764            /* For the classes below, the knowledge of how to handle every code
10765             * point is compiled into Perl via a macro.  This code is written
10766             * for making the loops as tight as possible.  It could be
10767             * refactored to save space instead. */
10768
10769          case CC_ENUM_SPACE_:
10770            while (   hardcount < max
10771                   && scan < this_eol
10772                   && (to_complement
10773                                   ^ cBOOL(isSPACE_utf8_safe(scan, this_eol))))
10774            {
10775                scan += UTF8SKIP(scan);
10776                hardcount++;
10777            }
10778            break;
10779          case CC_ENUM_BLANK_:
10780            while (   hardcount < max
10781                   && scan < this_eol
10782                   && (to_complement
10783                                ^ cBOOL(isBLANK_utf8_safe(scan, this_eol))))
10784            {
10785                scan += UTF8SKIP(scan);
10786                hardcount++;
10787            }
10788            break;
10789          case CC_ENUM_XDIGIT_:
10790            while (   hardcount < max
10791                   && scan < this_eol
10792                   && (to_complement
10793                               ^ cBOOL(isXDIGIT_utf8_safe(scan, this_eol))))
10794            {
10795                scan += UTF8SKIP(scan);
10796                hardcount++;
10797            }
10798            break;
10799          case CC_ENUM_VERTSPACE_:
10800            while (   hardcount < max
10801                   && scan < this_eol
10802                   && (to_complement
10803                               ^ cBOOL(isVERTWS_utf8_safe(scan, this_eol))))
10804            {
10805                scan += UTF8SKIP(scan);
10806                hardcount++;
10807            }
10808            break;
10809          case CC_ENUM_CNTRL_:
10810            while (   hardcount < max
10811                   && scan < this_eol
10812                   && (to_complement
10813                               ^ cBOOL(isCNTRL_utf8_safe(scan, this_eol))))
10814            {
10815                scan += UTF8SKIP(scan);
10816                hardcount++;
10817            }
10818            break;
10819        }
10820        break;
10821
10822      case LNBREAK_t8:
10823        while (    hardcount < max && scan < this_eol
10824               && (c=is_LNBREAK_utf8_safe(scan, this_eol)))
10825        {
10826            scan += c;
10827            hardcount++;
10828        }
10829        break;
10830
10831      case LNBREAK_tb:
10832        /* LNBREAK can match one or two latin chars, which is ok, but we have
10833         * to use hardcount in this situation, and throw away the adjustment to
10834         * <this_eol> done before the switch statement */
10835        while (
10836            hardcount < max && scan < loceol
10837            && (c = is_LNBREAK_latin1_safe(scan, loceol))
10838        ) {
10839            scan += c;
10840            hardcount++;
10841        }
10842        break;
10843
10844    default:
10845        Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized"
10846                         " node type %d='%s'", OP(p), REGNODE_NAME(OP(p)));
10847        NOT_REACHED; /* NOTREACHED */
10848
10849    }
10850
10851    if (hardcount)
10852        c = hardcount;
10853    else
10854        c = scan - *startposp;
10855    *startposp = scan;
10856
10857    DEBUG_r({
10858        DECLARE_AND_GET_RE_DEBUG_FLAGS;
10859        DEBUG_EXECUTE_r({
10860            SV * const prop = sv_newmortal();
10861            regprop(prog, prop, p, reginfo, NULL);
10862            Perl_re_exec_indentf( aTHX_
10863                        "%s can match %" IVdf " times out of %" IVdf "...\n",
10864                        depth, SvPVX_const(prop),(IV)c,(IV)max);
10865        });
10866    });
10867
10868    return(c);
10869}
10870
10871/*
10872 - reginclass - determine if a character falls into a character class
10873
10874  n is the ANYOF-type regnode
10875  p is the target string
10876  p_end points to one byte beyond the end of the target string
10877  utf8_target tells whether p is in UTF-8.
10878
10879  Returns true if matched; false otherwise.
10880
10881  Note that this can be a synthetic start class, a combination of various
10882  nodes, so things you think might be mutually exclusive, such as locale,
10883  aren't.  It can match both locale and non-locale
10884
10885 */
10886
10887STATIC bool
10888S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
10889{
10890    const char flags = (inRANGE(OP(n), ANYOFH, ANYOFHs))
10891                        ? 0
10892                        : ANYOF_FLAGS(n);
10893    bool match = FALSE;
10894    UV c = *p;
10895
10896    PERL_ARGS_ASSERT_REGINCLASS;
10897
10898    /* If c is not already the code point, get it.  Note that
10899     * UTF8_IS_INVARIANT() works even if not in UTF-8 */
10900    if (! UTF8_IS_INVARIANT(c) && utf8_target) {
10901        STRLEN c_len = 0;
10902        const U32 utf8n_flags = UTF8_ALLOW_DEFAULT;
10903        c = utf8n_to_uvchr(p, p_end - p, &c_len, utf8n_flags | UTF8_CHECK_ONLY);
10904        if (c_len == (STRLEN)-1) {
10905            _force_out_malformed_utf8_message(p, p_end,
10906                                              utf8n_flags,
10907                                              1 /* 1 means die */ );
10908            NOT_REACHED; /* NOTREACHED */
10909        }
10910        if (     c > 255
10911            &&  (OP(n) == ANYOFL || OP(n) == ANYOFPOSIXL)
10912            && ! (flags & ANYOFL_UTF8_LOCALE_REQD))
10913        {
10914            _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
10915        }
10916    }
10917
10918    /* If this character is potentially in the bitmap, check it */
10919    if (c < NUM_ANYOF_CODE_POINTS && ! inRANGE(OP(n), ANYOFH, ANYOFHb)) {
10920        if (ANYOF_BITMAP_TEST(n, c))
10921            match = TRUE;
10922        else if (  (flags & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared)
10923                 && OP(n) == ANYOFD
10924                 && ! utf8_target
10925                 && ! isASCII(c))
10926        {
10927            match = TRUE;
10928        }
10929        else if (flags & ANYOF_LOCALE_FLAGS) {
10930            if (  (flags & ANYOFL_FOLD)
10931                && c < 256
10932                && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
10933            {
10934                match = TRUE;
10935            }
10936            else if (   ANYOF_POSIXL_TEST_ANY_SET(n)
10937                     && c <= U8_MAX  /* param to isFOO_lc() */
10938            ) {
10939                /* The data structure is arranged so bits 0, 2, 4, ... are set
10940                 * if the class includes the Posix character class given by
10941                 * bit/2; and 1, 3, 5, ... are set if the class includes the
10942                 * complemented Posix class given by int(bit/2), so the
10943                 * remainder modulo 2 tells us if to complement or not.
10944                 *
10945                 * Note that this code assumes that all the classes are closed
10946                 * under folding.  For example, if a character matches \w, then
10947                 * its fold does too; and vice versa.  This should be true for
10948                 * any well-behaved locale for all the currently defined Posix
10949                 * classes, except for :lower: and :upper:, which are handled
10950                 * by the pseudo-class :cased: which matches if either of the
10951                 * other two does.  To get rid of this assumption, an outer
10952                 * loop could be used below to iterate over both the source
10953                 * character, and its fold (if different) */
10954
10955                U32 posixl_bits = ANYOF_POSIXL_BITMAP(n);
10956
10957                do {
10958                    /* Find the next set bit indicating a class to try matching
10959                     * against */
10960                    U8 bit_pos = lsbit_pos32(posixl_bits);
10961
10962                    if (bit_pos % 2 ^ cBOOL(isFOO_lc(bit_pos/2, (U8) c))) {
10963                        match = TRUE;
10964                        break;
10965                    }
10966
10967                    /* Remove this class from consideration; repeat */
10968                    POSIXL_CLEAR(posixl_bits, bit_pos);
10969                } while(posixl_bits != 0);
10970            }
10971        }
10972    }
10973
10974    /* If the bitmap didn't (or couldn't) match, and something outside the
10975     * bitmap could match, try that. */
10976    if (!match) {
10977        if (      c >= NUM_ANYOF_CODE_POINTS
10978            &&    ANYOF_ONLY_HAS_BITMAP(n)
10979            && ! (flags & ANYOF_HAS_EXTRA_RUNTIME_MATCHES))
10980        {
10981            /* In this case, the ARG is set up so that the final bit indicates
10982             * whether it matches or not */
10983            match = ARG1u(n) & 1;
10984        }
10985        else
10986            /* Here, the main way it could match is if the code point is
10987             * outside the bitmap and an inversion list exists to handle such
10988             * things.  The other ways all occur when a flag is set to indicate
10989             * we need special handling.  That handling doesn't come in to
10990             * effect for ANYOFD nodes unless the target string is UTF-8 and
10991             * that matters to code point being matched. */
10992             if (    c >= NUM_ANYOF_CODE_POINTS
10993                 || (   (flags & ANYOF_HAS_EXTRA_RUNTIME_MATCHES)
10994                     && (   UNLIKELY(OP(n) != ANYOFD)
10995                         || (utf8_target && ! isASCII_uvchr(c)
10996#                               if NUM_ANYOF_CODE_POINTS > 256
10997                                                               && c < 256
10998#                               endif
10999                                                                         ))))
11000        {
11001            /* Here, we have an inversion list for outside-the-bitmap code
11002             * points and/or the flag is set indicating special handling is
11003             * needed.  The flag is set when there is auxiliary data beyond the
11004             * normal inversion list, or if there is the possibility of a
11005             * special Turkic locale match, even without auxiliary data.
11006             *
11007             * First check if there is an inversion list and/or auxiliary data.
11008             * */
11009            if (ANYOF_HAS_AUX(n)) {
11010                SV* only_utf8_locale = NULL;
11011
11012                /* This call will return in 'definition' the union of the base
11013                 * non-bitmap inversion list, if any, plus the deferred
11014                 * definitions of user-defined properties, if any.  It croaks
11015                 * if there is such a property but which still has no definition
11016                 * available */
11017                SV * const definition = GET_REGCLASS_AUX_DATA(prog, n, TRUE, 0,
11018                                                      &only_utf8_locale, NULL);
11019                if (definition) {
11020                    /* Most likely is the outside-the-bitmap inversion list. */
11021                    if (_invlist_contains_cp(definition, c)) {
11022                        match = TRUE;
11023                    }
11024                    else /* Failing that, hardcode the two tests for a Turkic
11025                            locale */
11026                         if (   UNLIKELY(IN_UTF8_TURKIC_LOCALE)
11027                             && isALPHA_FOLD_EQ(*p, 'i'))
11028                    {
11029                        /* Turkish locales have these hard-coded rules
11030                         * overriding normal ones */
11031                        if (*p == 'i') {
11032                            if (_invlist_contains_cp(definition,
11033                                         LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
11034                            {
11035                                match = TRUE;
11036                            }
11037                        }
11038                        else if (_invlist_contains_cp(definition,
11039                                                 LATIN_SMALL_LETTER_DOTLESS_I))
11040                        {
11041                            match = TRUE;
11042                        }
11043                    }
11044                }
11045
11046                if (   UNLIKELY(only_utf8_locale)
11047                    && UNLIKELY(IN_UTF8_CTYPE_LOCALE)
11048                    && ! match)
11049                {
11050                    match = _invlist_contains_cp(only_utf8_locale, c);
11051                }
11052            }
11053
11054            /* In a Turkic locale under folding, hard-code the I i case pair
11055             * matches; these wouldn't have the ANYOF_HAS_EXTRA_RUNTIME_MATCHES
11056             * flag set unless [Ii] were match possibilities */
11057            if (UNLIKELY(IN_UTF8_TURKIC_LOCALE) && ! match) {
11058                if (utf8_target) {
11059                    if (c == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
11060                        if (ANYOF_BITMAP_TEST(n, 'i')) {
11061                            match = TRUE;
11062                        }
11063                    }
11064                    else if (c == LATIN_SMALL_LETTER_DOTLESS_I) {
11065                        if (ANYOF_BITMAP_TEST(n, 'I')) {
11066                            match = TRUE;
11067                        }
11068                    }
11069                }
11070
11071#if NUM_ANYOF_CODE_POINTS > 256
11072                /* Larger bitmap means these special cases aren't handled
11073                 * outside the bitmap above. */
11074                if (*p == 'i') {
11075                    if (ANYOF_BITMAP_TEST(n,
11076                                        LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
11077                    {
11078                        match = TRUE;
11079                    }
11080                }
11081                else if (*p == 'I') {
11082                    if (ANYOF_BITMAP_TEST(n, LATIN_SMALL_LETTER_DOTLESS_I)) {
11083                        match = TRUE;
11084                    }
11085                }
11086#endif
11087            }
11088        }
11089
11090        if (   UNICODE_IS_SUPER(c)
11091            && (flags & ANYOF_WARN_SUPER__shared)
11092            && OP(n) != ANYOFD
11093            && ckWARN_d(WARN_NON_UNICODE))
11094        {
11095            Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
11096                "Matched non-Unicode code point 0x%04" UVXf " against Unicode property; may not be portable", c);
11097        }
11098    }
11099
11100#if ANYOF_INVERT != 1
11101    /* Depending on compiler optimization cBOOL takes time, so if don't have to
11102     * use it, don't */
11103#   error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
11104#endif
11105
11106    /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
11107    return (flags & ANYOF_INVERT) ^ match;
11108}
11109
11110STATIC U8 *
11111S_reghop3(U8 *s, SSize_t off, const U8* lim)
11112{
11113    /* return the position 'off' UTF-8 characters away from 's', forward if
11114     * 'off' >= 0, backwards if negative.  But don't go outside of position
11115     * 'lim', which better be < s  if off < 0 */
11116
11117    PERL_ARGS_ASSERT_REGHOP3;
11118
11119    if (off >= 0) {
11120        while (off-- && s < lim) {
11121            /* XXX could check well-formedness here */
11122            U8 *new_s = s + UTF8SKIP(s);
11123            if (new_s > lim) /* lim may be in the middle of a long character */
11124                return s;
11125            s = new_s;
11126        }
11127    }
11128    else {
11129        while (off++ && s > lim) {
11130            s--;
11131            if (UTF8_IS_CONTINUED(*s)) {
11132                while (s > lim && UTF8_IS_CONTINUATION(*s))
11133                    s--;
11134                if (! UTF8_IS_START(*s)) {
11135                    Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
11136                }
11137            }
11138            /* XXX could check well-formedness here */
11139        }
11140    }
11141    return s;
11142}
11143
11144STATIC U8 *
11145S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
11146{
11147    PERL_ARGS_ASSERT_REGHOP4;
11148
11149    if (off >= 0) {
11150        while (off-- && s < rlim) {
11151            /* XXX could check well-formedness here */
11152            s += UTF8SKIP(s);
11153        }
11154    }
11155    else {
11156        while (off++ && s > llim) {
11157            s--;
11158            if (UTF8_IS_CONTINUED(*s)) {
11159                while (s > llim && UTF8_IS_CONTINUATION(*s))
11160                    s--;
11161                if (! UTF8_IS_START(*s)) {
11162                    Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
11163                }
11164            }
11165            /* XXX could check well-formedness here */
11166        }
11167    }
11168    return s;
11169}
11170
11171/* like reghop3, but returns NULL on overrun, rather than returning last
11172 * char pos */
11173
11174STATIC U8 *
11175S_reghopmaybe3(U8* s, SSize_t off, const U8* const lim)
11176{
11177    PERL_ARGS_ASSERT_REGHOPMAYBE3;
11178
11179    if (off >= 0) {
11180        while (off-- && s < lim) {
11181            /* XXX could check well-formedness here */
11182            s += UTF8SKIP(s);
11183        }
11184        if (off >= 0)
11185            return NULL;
11186    }
11187    else {
11188        while (off++ && s > lim) {
11189            s--;
11190            if (UTF8_IS_CONTINUED(*s)) {
11191                while (s > lim && UTF8_IS_CONTINUATION(*s))
11192                    s--;
11193                if (! UTF8_IS_START(*s)) {
11194                    Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
11195                }
11196            }
11197            /* XXX could check well-formedness here */
11198        }
11199        if (off <= 0)
11200            return NULL;
11201    }
11202    return s;
11203}
11204
11205
11206/* when executing a regex that may have (?{}), extra stuff needs setting
11207   up that will be visible to the called code, even before the current
11208   match has finished. In particular:
11209
11210   * $_ is localised to the SV currently being matched;
11211   * pos($_) is created if necessary, ready to be updated on each call-out
11212     to code;
11213   * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
11214     isn't set until the current pattern is successfully finished), so that
11215     $1 etc of the match-so-far can be seen;
11216   * save the old values of subbeg etc of the current regex, and  set then
11217     to the current string (again, this is normally only done at the end
11218     of execution)
11219*/
11220
11221static void
11222S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
11223{
11224    MAGIC *mg;
11225    regexp *const rex = ReANY(reginfo->prog);
11226    regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
11227
11228    eval_state->rex = rex;
11229    eval_state->sv  = reginfo->sv;
11230
11231    if (reginfo->sv) {
11232        /* Make $_ available to executed code. */
11233        if (reginfo->sv != DEFSV) {
11234            SAVE_DEFSV;
11235            DEFSV_set(reginfo->sv);
11236        }
11237        /* will be dec'd by S_cleanup_regmatch_info_aux */
11238        SvREFCNT_inc_NN(reginfo->sv);
11239
11240        if (!(mg = mg_find_mglob(reginfo->sv))) {
11241            /* prepare for quick setting of pos */
11242            mg = sv_magicext_mglob(reginfo->sv);
11243            mg->mg_len = -1;
11244        }
11245        eval_state->pos_magic = mg;
11246        eval_state->pos       = mg->mg_len;
11247        eval_state->pos_flags = mg->mg_flags;
11248    }
11249    else
11250        eval_state->pos_magic = NULL;
11251
11252    if (!PL_reg_curpm) {
11253        /* PL_reg_curpm is a fake PMOP that we can attach the current
11254         * regex to and point PL_curpm at, so that $1 et al are visible
11255         * within a /(?{})/. It's just allocated once per interpreter the
11256         * first time its needed */
11257        Newxz(PL_reg_curpm, 1, PMOP);
11258#ifdef USE_ITHREADS
11259        {
11260            SV* const repointer = &PL_sv_undef;
11261            /* this regexp is also owned by the new PL_reg_curpm, which
11262               will try to free it.  */
11263            av_push(PL_regex_padav, repointer);
11264            PL_reg_curpm->op_pmoffset = av_top_index(PL_regex_padav);
11265            PL_regex_pad = AvARRAY(PL_regex_padav);
11266        }
11267#endif
11268    }
11269    SET_reg_curpm(reginfo->prog);
11270    eval_state->curpm = PL_curpm;
11271    PL_curpm_under = PL_curpm;
11272    PL_curpm = PL_reg_curpm;
11273    if (RXp_MATCH_COPIED(rex)) {
11274        /*  Here is a serious problem: we cannot rewrite subbeg,
11275            since it may be needed if this match fails.  Thus
11276            $` inside (?{}) could fail... */
11277        eval_state->subbeg     = RXp_SUBBEG(rex);
11278        eval_state->sublen     = RXp_SUBLEN(rex);
11279        eval_state->suboffset  = RXp_SUBOFFSET(rex);
11280        eval_state->subcoffset = RXp_SUBCOFFSET(rex);
11281#ifdef PERL_ANY_COW
11282        eval_state->saved_copy = RXp_SAVED_COPY(rex);
11283#endif
11284        RXp_MATCH_COPIED_off(rex);
11285    }
11286    else
11287        eval_state->subbeg = NULL;
11288    RXp_SUBBEG(rex) = (char *)reginfo->strbeg;
11289    RXp_SUBOFFSET(rex) = 0;
11290    RXp_SUBCOFFSET(rex) = 0;
11291    RXp_SUBLEN(rex) = reginfo->strend - reginfo->strbeg;
11292}
11293
11294
11295/* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
11296
11297static void
11298S_cleanup_regmatch_info_aux(pTHX_ void *arg)
11299{
11300    regmatch_info_aux *aux = (regmatch_info_aux *) arg;
11301    regmatch_info_aux_eval *eval_state =  aux->info_aux_eval;
11302    regmatch_slab *s;
11303
11304    Safefree(aux->poscache);
11305
11306    if (eval_state) {
11307
11308        /* undo the effects of S_setup_eval_state() */
11309
11310        if (eval_state->subbeg) {
11311            regexp * const rex = eval_state->rex;
11312            RXp_SUBBEG(rex) = eval_state->subbeg;
11313            RXp_SUBLEN(rex)     = eval_state->sublen;
11314            RXp_SUBOFFSET(rex)  = eval_state->suboffset;
11315            RXp_SUBCOFFSET(rex) = eval_state->subcoffset;
11316#ifdef PERL_ANY_COW
11317            RXp_SAVED_COPY(rex) = eval_state->saved_copy;
11318#endif
11319            RXp_MATCH_COPIED_on(rex);
11320        }
11321        if (eval_state->pos_magic)
11322        {
11323            eval_state->pos_magic->mg_len = eval_state->pos;
11324            eval_state->pos_magic->mg_flags =
11325                 (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
11326               | (eval_state->pos_flags & MGf_BYTES);
11327        }
11328
11329        PL_curpm = eval_state->curpm;
11330        SvREFCNT_dec(eval_state->sv);
11331    }
11332
11333    PL_regmatch_state = aux->old_regmatch_state;
11334    PL_regmatch_slab  = aux->old_regmatch_slab;
11335
11336    /* free all slabs above current one - this must be the last action
11337     * of this function, as aux and eval_state are allocated within
11338     * slabs and may be freed here */
11339
11340    s = PL_regmatch_slab->next;
11341    if (s) {
11342        PL_regmatch_slab->next = NULL;
11343        while (s) {
11344            regmatch_slab * const osl = s;
11345            s = s->next;
11346            Safefree(osl);
11347        }
11348    }
11349}
11350
11351
11352STATIC void
11353S_to_utf8_substr(pTHX_ regexp *prog)
11354{
11355    /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
11356     * on the converted value */
11357
11358    int i = 1;
11359
11360    PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
11361
11362    do {
11363        if (prog->substrs->data[i].substr
11364            && !prog->substrs->data[i].utf8_substr) {
11365            SV* const sv = newSVsv(prog->substrs->data[i].substr);
11366            prog->substrs->data[i].utf8_substr = sv;
11367            sv_utf8_upgrade(sv);
11368            if (SvVALID(prog->substrs->data[i].substr)) {
11369                if (SvTAIL(prog->substrs->data[i].substr)) {
11370                    /* Trim the trailing \n that fbm_compile added last
11371                       time.  */
11372                    SvCUR_set(sv, SvCUR(sv) - 1);
11373                    /* Whilst this makes the SV technically "invalid" (as its
11374                       buffer is no longer followed by "\0") when fbm_compile()
11375                       adds the "\n" back, a "\0" is restored.  */
11376                    fbm_compile(sv, FBMcf_TAIL);
11377                } else
11378                    fbm_compile(sv, 0);
11379            }
11380            if (prog->substrs->data[i].substr == prog->check_substr)
11381                prog->check_utf8 = sv;
11382        }
11383    } while (i--);
11384}
11385
11386STATIC bool
11387S_to_byte_substr(pTHX_ regexp *prog)
11388{
11389    /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
11390     * on the converted value; returns FALSE if can't be converted. */
11391
11392    int i = 1;
11393
11394    PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
11395
11396    do {
11397        if (prog->substrs->data[i].utf8_substr
11398            && !prog->substrs->data[i].substr) {
11399            SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
11400            if (! sv_utf8_downgrade(sv, TRUE)) {
11401                SvREFCNT_dec_NN(sv);
11402                return FALSE;
11403            }
11404            if (SvVALID(prog->substrs->data[i].utf8_substr)) {
11405                if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
11406                    /* Trim the trailing \n that fbm_compile added last
11407                        time.  */
11408                    SvCUR_set(sv, SvCUR(sv) - 1);
11409                    fbm_compile(sv, FBMcf_TAIL);
11410                } else
11411                    fbm_compile(sv, 0);
11412            }
11413            prog->substrs->data[i].substr = sv;
11414            if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
11415                prog->check_substr = sv;
11416        }
11417    } while (i--);
11418
11419    return TRUE;
11420}
11421
11422#ifndef PERL_IN_XSUB_RE
11423
11424bool
11425Perl_is_grapheme(pTHX_ const U8 * strbeg, const U8 * s, const U8 * strend, const UV cp)
11426{
11427    /* Temporary helper function for toke.c.  Verify that the code point 'cp'
11428     * is a stand-alone grapheme.  The UTF-8 for 'cp' begins at position 's' in
11429     * the larger string bounded by 'strbeg' and 'strend'.
11430     *
11431     * 'cp' needs to be assigned (if not, a future version of the Unicode
11432     * Standard could make it something that combines with adjacent characters,
11433     * so code using it would then break), and there has to be a GCB break
11434     * before and after the character. */
11435
11436
11437    GCB_enum cp_gcb_val, prev_cp_gcb_val, next_cp_gcb_val;
11438    const U8 * prev_cp_start;
11439
11440    PERL_ARGS_ASSERT_IS_GRAPHEME;
11441
11442    if (   UNLIKELY(UNICODE_IS_SUPER(cp))
11443        || UNLIKELY(UNICODE_IS_NONCHAR(cp)))
11444    {
11445        /* These are considered graphemes */
11446        return TRUE;
11447    }
11448
11449    /* Otherwise, unassigned code points are forbidden */
11450    if (UNLIKELY(! ELEMENT_RANGE_MATCHES_INVLIST(
11451                                    _invlist_search(PL_Assigned_invlist, cp))))
11452    {
11453        return FALSE;
11454    }
11455
11456    cp_gcb_val = getGCB_VAL_CP(cp);
11457
11458    /* Find the GCB value of the previous code point in the input */
11459    prev_cp_start = utf8_hop_back(s, -1, strbeg);
11460    if (UNLIKELY(prev_cp_start == s)) {
11461        prev_cp_gcb_val = GCB_EDGE;
11462    }
11463    else {
11464        prev_cp_gcb_val = getGCB_VAL_UTF8(prev_cp_start, strend);
11465    }
11466
11467    /* And check that is a grapheme boundary */
11468    if (! isGCB(prev_cp_gcb_val, cp_gcb_val, strbeg, s,
11469                TRUE /* is UTF-8 encoded */ ))
11470    {
11471        return FALSE;
11472    }
11473
11474    /* Similarly verify there is a break between the current character and the
11475     * following one */
11476    s += UTF8SKIP(s);
11477    if (s >= strend) {
11478        next_cp_gcb_val = GCB_EDGE;
11479    }
11480    else {
11481        next_cp_gcb_val = getGCB_VAL_UTF8(s, strend);
11482    }
11483
11484    return isGCB(cp_gcb_val, next_cp_gcb_val, strbeg, s, TRUE);
11485}
11486
11487/*
11488=for apidoc_section $unicode
11489
11490=for apidoc isSCRIPT_RUN
11491
11492Returns a bool as to whether or not the sequence of bytes from C<s> up to but
11493not including C<send> form a "script run".  C<utf8_target> is TRUE iff the
11494sequence starting at C<s> is to be treated as UTF-8.  To be precise, except for
11495two degenerate cases given below, this function returns TRUE iff all code
11496points in it come from any combination of three "scripts" given by the Unicode
11497"Script Extensions" property: Common, Inherited, and possibly one other.
11498Additionally all decimal digits must come from the same consecutive sequence of
1149910.
11500
11501For example, if all the characters in the sequence are Greek, or Common, or
11502Inherited, this function will return TRUE, provided any decimal digits in it
11503are from the same block of digits in Common.  (These are the ASCII digits
11504"0".."9" and additionally a block for full width forms of these, and several
11505others used in mathematical notation.)   For scripts (unlike Greek) that have
11506their own digits defined this will accept either digits from that set or from
11507one of the Common digit sets, but not a combination of the two.  Some scripts,
11508such as Arabic, have more than one set of digits.  All digits must come from
11509the same set for this function to return TRUE.
11510
11511C<*ret_script>, if C<ret_script> is not NULL, will on return of TRUE
11512contain the script found, using the C<SCX_enum> typedef.  Its value will be
11513C<SCX_INVALID> if the function returns FALSE.
11514
11515If the sequence is empty, TRUE is returned, but C<*ret_script> (if asked for)
11516will be C<SCX_INVALID>.
11517
11518If the sequence contains a single code point which is unassigned to a character
11519in the version of Unicode being used, the function will return TRUE, and the
11520script will be C<SCX_Unknown>.  Any other combination of unassigned code points
11521in the input sequence will result in the function treating the input as not
11522being a script run.
11523
11524The returned script will be C<SCX_Inherited> iff all the code points in it are
11525from the Inherited script.
11526
11527Otherwise, the returned script will be C<SCX_Common> iff all the code points in
11528it are from the Inherited or Common scripts.
11529
11530=cut
11531
11532*/
11533
11534bool
11535Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
11536{
11537    /* Basically, it looks at each character in the sequence to see if the
11538     * above conditions are met; if not it fails.  It uses an inversion map to
11539     * find the enum corresponding to the script of each character.  But this
11540     * is complicated by the fact that a few code points can be in any of
11541     * several scripts.  The data has been constructed so that there are
11542     * additional enum values (all negative) for these situations.  The
11543     * absolute value of those is an index into another table which contains
11544     * pointers to auxiliary tables for each such situation.  Each aux array
11545     * lists all the scripts for the given situation.  There is another,
11546     * parallel, table that gives the number of entries in each aux table.
11547     * These are all defined in charclass_invlists.h */
11548
11549    /* XXX Here are the additional things UTS 39 says could be done:
11550     *
11551     * Forbid sequences of the same nonspacing mark
11552     *
11553     * Check to see that all the characters are in the sets of exemplar
11554     * characters for at least one language in the Unicode Common Locale Data
11555     * Repository [CLDR]. */
11556
11557
11558    /* Things that match /\d/u */
11559    SV * decimals_invlist = PL_XPosix_ptrs[CC_DIGIT_];
11560    UV * decimals_array = invlist_array(decimals_invlist);
11561
11562    /* What code point is the digit '0' of the script run? (0 meaning FALSE if
11563     * not currently known) */
11564    UV zero_of_run = 0;
11565
11566    SCX_enum script_of_run  = SCX_INVALID;   /* Illegal value */
11567    SCX_enum script_of_char = SCX_INVALID;
11568
11569    /* If the script remains not fully determined from iteration to iteration,
11570     * this is the current intersection of the possiblities.  */
11571    SCX_enum * intersection = NULL;
11572    PERL_UINT_FAST8_T intersection_len = 0;
11573
11574    bool retval = TRUE;
11575    SCX_enum * ret_script = NULL;
11576
11577    assert(send >= s);
11578
11579    PERL_ARGS_ASSERT_ISSCRIPT_RUN;
11580
11581    /* All code points in 0..255 are either Common or Latin, so must be a
11582     * script run.  We can return immediately unless we need to know which
11583     * script it is. */
11584    if (! utf8_target && LIKELY(send > s)) {
11585        if (ret_script == NULL) {
11586            return TRUE;
11587        }
11588
11589        /* If any character is Latin, the run is Latin */
11590        while (s < send) {
11591            if (isALPHA_L1(*s) && LIKELY(*s != MICRO_SIGN_NATIVE)) {
11592                *ret_script = SCX_Latin;
11593                return TRUE;
11594            }
11595        }
11596
11597        /* Here, all are Common */
11598        *ret_script = SCX_Common;
11599        return TRUE;
11600    }
11601
11602    /* Look at each character in the sequence */
11603    while (s < send) {
11604        /* If the current character being examined is a digit, this is the code
11605         * point of the zero for its sequence of 10 */
11606        UV zero_of_char;
11607
11608        UV cp;
11609
11610        /* The code allows all scripts to use the ASCII digits.  This is
11611         * because they are in the Common script.  Hence any ASCII ones found
11612         * are ok, unless and until a digit from another set has already been
11613         * encountered.  digit ranges in Common are not similarly blessed) */
11614        if (UNLIKELY(isDIGIT(*s))) {
11615            if (UNLIKELY(script_of_run == SCX_Unknown)) {
11616                retval = FALSE;
11617                break;
11618            }
11619            if (zero_of_run) {
11620                if (zero_of_run != '0') {
11621                    retval = FALSE;
11622                    break;
11623                }
11624            }
11625            else {
11626                zero_of_run = '0';
11627            }
11628            s++;
11629            continue;
11630        }
11631
11632        /* Here, isn't an ASCII digit.  Find the code point of the character */
11633        if (! UTF8_IS_INVARIANT(*s)) {
11634            Size_t len;
11635            cp = valid_utf8_to_uvchr((U8 *) s, &len);
11636            s += len;
11637        }
11638        else {
11639            cp = *(s++);
11640        }
11641
11642        /* If is within the range [+0 .. +9] of the script's zero, it also is a
11643         * digit in that script.  We can skip the rest of this code for this
11644         * character. */
11645        if (UNLIKELY(zero_of_run && withinCOUNT(cp, zero_of_run, 9))) {
11646            continue;
11647        }
11648
11649        /* Find the character's script.  The correct values are hard-coded here
11650         * for small-enough code points. */
11651        if (cp < 0x2B9) {   /* From inspection of Unicode db; extremely
11652                               unlikely to change */
11653            if (       cp > 255
11654                || (   isALPHA_L1(cp)
11655                    && LIKELY(cp != MICRO_SIGN_NATIVE)))
11656            {
11657                script_of_char = SCX_Latin;
11658            }
11659            else {
11660                script_of_char = SCX_Common;
11661            }
11662        }
11663        else {
11664            script_of_char = _Perl_SCX_invmap[
11665                                       _invlist_search(PL_SCX_invlist, cp)];
11666        }
11667
11668        /* We arbitrarily accept a single unassigned character, but not in
11669         * combination with anything else, and not a run of them. */
11670        if (   UNLIKELY(script_of_run == SCX_Unknown)
11671            || UNLIKELY(   script_of_run != SCX_INVALID
11672                        && script_of_char == SCX_Unknown))
11673        {
11674            retval = FALSE;
11675            break;
11676        }
11677
11678        /* For the first character, or the run is inherited, the run's script
11679         * is set to the char's */
11680        if (   UNLIKELY(script_of_run == SCX_INVALID)
11681            || UNLIKELY(script_of_run == SCX_Inherited))
11682        {
11683            script_of_run = script_of_char;
11684        }
11685
11686        /* For the character's script to be Unknown, it must be the first
11687         * character in the sequence (for otherwise a test above would have
11688         * prevented us from reaching here), and we have set the run's script
11689         * to it.  Nothing further to be done for this character */
11690        if (UNLIKELY(script_of_char == SCX_Unknown)) {
11691            continue;
11692        }
11693
11694        /* We accept 'inherited' script characters currently even at the
11695         * beginning.  (We know that no characters in Inherited are digits, or
11696         * we'd have to check for that) */
11697        if (UNLIKELY(script_of_char == SCX_Inherited)) {
11698            continue;
11699        }
11700
11701        /* If the run so far is Common, and the new character isn't, change the
11702         * run's script to that of this character */
11703        if (script_of_run == SCX_Common && script_of_char != SCX_Common) {
11704            script_of_run = script_of_char;
11705        }
11706
11707        /* Now we can see if the script of the new character is the same as
11708         * that of the run */
11709        if (LIKELY(script_of_char == script_of_run)) {
11710            /* By far the most common case */
11711            goto scripts_match;
11712        }
11713
11714        /* Here, the script of the run isn't Common.  But characters in Common
11715         * match any script */
11716        if (script_of_char == SCX_Common) {
11717            goto scripts_match;
11718        }
11719
11720#ifndef HAS_SCX_AUX_TABLES
11721
11722        /* Too early a Unicode version to have a code point belonging to more
11723         * than one script, so, if the scripts don't exactly match, fail */
11724        PERL_UNUSED_VAR(intersection_len);
11725        retval = FALSE;
11726        break;
11727
11728#else
11729
11730        /* Here there is no exact match between the character's script and the
11731         * run's.  And we've handled the special cases of scripts Unknown,
11732         * Inherited, and Common.
11733         *
11734         * Negative script numbers signify that the value may be any of several
11735         * scripts, and we need to look at auxiliary information to make our
11736         * determination.  But if both are non-negative, we can fail now */
11737        if (LIKELY(script_of_char >= 0)) {
11738            const SCX_enum * search_in;
11739            PERL_UINT_FAST8_T search_in_len;
11740            PERL_UINT_FAST8_T i;
11741
11742            if (LIKELY(script_of_run >= 0)) {
11743                retval = FALSE;
11744                break;
11745            }
11746
11747            /* Use the previously constructed set of possible scripts, if any.
11748             * */
11749            if (intersection) {
11750                search_in = intersection;
11751                search_in_len = intersection_len;
11752            }
11753            else {
11754                search_in = SCX_AUX_TABLE_ptrs[-script_of_run];
11755                search_in_len = SCX_AUX_TABLE_lengths[-script_of_run];
11756            }
11757
11758            for (i = 0; i < search_in_len; i++) {
11759                if (search_in[i] == script_of_char) {
11760                    script_of_run = script_of_char;
11761                    goto scripts_match;
11762                }
11763            }
11764
11765            retval = FALSE;
11766            break;
11767        }
11768        else if (LIKELY(script_of_run >= 0)) {
11769            /* script of character could be one of several, but run is a single
11770             * script */
11771            const SCX_enum * search_in = SCX_AUX_TABLE_ptrs[-script_of_char];
11772            const PERL_UINT_FAST8_T search_in_len
11773                                     = SCX_AUX_TABLE_lengths[-script_of_char];
11774            PERL_UINT_FAST8_T i;
11775
11776            for (i = 0; i < search_in_len; i++) {
11777                if (search_in[i] == script_of_run) {
11778                    script_of_char = script_of_run;
11779                    goto scripts_match;
11780                }
11781            }
11782
11783            retval = FALSE;
11784            break;
11785        }
11786        else {
11787            /* Both run and char could be in one of several scripts.  If the
11788             * intersection is empty, then this character isn't in this script
11789             * run.  Otherwise, we need to calculate the intersection to use
11790             * for future iterations of the loop, unless we are already at the
11791             * final character */
11792            const SCX_enum * search_char = SCX_AUX_TABLE_ptrs[-script_of_char];
11793            const PERL_UINT_FAST8_T char_len
11794                                      = SCX_AUX_TABLE_lengths[-script_of_char];
11795            const SCX_enum * search_run;
11796            PERL_UINT_FAST8_T run_len;
11797
11798            SCX_enum * new_overlap = NULL;
11799            PERL_UINT_FAST8_T i, j;
11800
11801            if (intersection) {
11802                search_run = intersection;
11803                run_len = intersection_len;
11804            }
11805            else {
11806                search_run = SCX_AUX_TABLE_ptrs[-script_of_run];
11807                run_len = SCX_AUX_TABLE_lengths[-script_of_run];
11808            }
11809
11810            intersection_len = 0;
11811
11812            for (i = 0; i < run_len; i++) {
11813                for (j = 0; j < char_len; j++) {
11814                    if (search_run[i] == search_char[j]) {
11815
11816                        /* Here, the script at i,j matches.  That means this
11817                         * character is in the run.  But continue on to find
11818                         * the complete intersection, for the next loop
11819                         * iteration, and for the digit check after it.
11820                         *
11821                         * On the first found common script, we malloc space
11822                         * for the intersection list for the worst case of the
11823                         * intersection, which is the minimum of the number of
11824                         * scripts remaining in each set. */
11825                        if (intersection_len == 0) {
11826                            Newx(new_overlap,
11827                                 MIN(run_len - i, char_len - j),
11828                                 SCX_enum);
11829                        }
11830                        new_overlap[intersection_len++] = search_run[i];
11831                    }
11832                }
11833            }
11834
11835            /* Here we've looked through everything.  If they have no scripts
11836             * in common, not a run */
11837            if (intersection_len == 0) {
11838                retval = FALSE;
11839                break;
11840            }
11841
11842            /* If there is only a single script in common, set to that.
11843             * Otherwise, use the intersection going forward */
11844            Safefree(intersection);
11845            intersection = NULL;
11846            if (intersection_len == 1) {
11847                script_of_run = script_of_char = new_overlap[0];
11848                Safefree(new_overlap);
11849                new_overlap = NULL;
11850            }
11851            else {
11852                intersection = new_overlap;
11853            }
11854        }
11855
11856#endif
11857
11858  scripts_match:
11859
11860        /* Here, the script of the character is compatible with that of the
11861         * run.  That means that in most cases, it continues the script run.
11862         * Either it and the run match exactly, or one or both can be in any of
11863         * several scripts, and the intersection is not empty.  However, if the
11864         * character is a decimal digit, it could still mean failure if it is
11865         * from the wrong sequence of 10.  So, we need to look at if it's a
11866         * digit.  We've already handled the 10 digits [0-9], and the next
11867         * lowest one is this one: */
11868        if (cp < FIRST_NON_ASCII_DECIMAL_DIGIT) {
11869            continue;   /* Not a digit; this character is part of the run */
11870        }
11871
11872        /* If we have a definitive '0' for the script of this character, we
11873         * know that for this to be a digit, it must be in the range of +0..+9
11874         * of that zero. */
11875        if (   script_of_char >= 0
11876            && (zero_of_char = script_zeros[script_of_char]))
11877        {
11878            if (! withinCOUNT(cp, zero_of_char, 9)) {
11879                continue;   /* Not a digit; this character is part of the run
11880                             */
11881            }
11882
11883        }
11884        else {  /* Need to look up if this character is a digit or not */
11885            SSize_t index_of_zero_of_char;
11886            index_of_zero_of_char = _invlist_search(decimals_invlist, cp);
11887            if (     UNLIKELY(index_of_zero_of_char < 0)
11888                || ! ELEMENT_RANGE_MATCHES_INVLIST(index_of_zero_of_char))
11889            {
11890                continue;   /* Not a digit; this character is part of the run.
11891                             */
11892            }
11893
11894            zero_of_char = decimals_array[index_of_zero_of_char];
11895        }
11896
11897        /* Here, the character is a decimal digit, and the zero of its sequence
11898         * of 10 is in 'zero_of_char'.  If we already have a zero for this run,
11899         * they better be the same. */
11900        if (zero_of_run) {
11901            if (zero_of_run != zero_of_char) {
11902                retval = FALSE;
11903                break;
11904            }
11905        }
11906        else {  /* Otherwise we now have a zero for this run */
11907            zero_of_run = zero_of_char;
11908        }
11909    } /* end of looping through CLOSESR text */
11910
11911    Safefree(intersection);
11912
11913    if (ret_script != NULL) {
11914        if (retval) {
11915            *ret_script = script_of_run;
11916        }
11917        else {
11918            *ret_script = SCX_INVALID;
11919        }
11920    }
11921
11922    return retval;
11923}
11924#endif /* ifndef PERL_IN_XSUB_RE */
11925
11926/* Buffer logic. */
11927SV*
11928Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
11929                    const U32 flags)
11930{
11931    PERL_ARGS_ASSERT_REG_NAMED_BUFF;
11932
11933    PERL_UNUSED_ARG(value);
11934
11935    if (flags & RXapif_FETCH) {
11936        return reg_named_buff_fetch(rx, key, flags);
11937    } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
11938        Perl_croak_no_modify();
11939        return NULL;
11940    } else if (flags & RXapif_EXISTS) {
11941        return reg_named_buff_exists(rx, key, flags)
11942            ? &PL_sv_yes
11943            : &PL_sv_no;
11944    } else if (flags & RXapif_REGNAMES) {
11945        return reg_named_buff_all(rx, flags);
11946    } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
11947        return reg_named_buff_scalar(rx, flags);
11948    } else {
11949        Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
11950        return NULL;
11951    }
11952}
11953
11954SV*
11955Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
11956                         const U32 flags)
11957{
11958    PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
11959    PERL_UNUSED_ARG(lastkey);
11960
11961    if (flags & RXapif_FIRSTKEY)
11962        return reg_named_buff_firstkey(rx, flags);
11963    else if (flags & RXapif_NEXTKEY)
11964        return reg_named_buff_nextkey(rx, flags);
11965    else {
11966        Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
11967                                            (int)flags);
11968        return NULL;
11969    }
11970}
11971
11972SV*
11973Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
11974                          const U32 flags)
11975{
11976    SV *ret;
11977    struct regexp *const rx = ReANY(r);
11978
11979    PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
11980
11981    if (rx && RXp_PAREN_NAMES(rx)) {
11982        HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
11983        if (he_str) {
11984            IV i;
11985            SV* sv_dat=HeVAL(he_str);
11986            I32 *nums=(I32*)SvPVX(sv_dat);
11987            AV * const retarray = (flags & RXapif_ALL) ? newAV_alloc_x(SvIVX(sv_dat)) : NULL;
11988            for ( i=0; i<SvIVX(sv_dat); i++ ) {
11989                if ((I32)(rx->nparens) >= nums[i]
11990                    && RXp_OFFS_VALID(rx,nums[i]))
11991                {
11992                    ret = newSVpvs("");
11993                    Perl_reg_numbered_buff_fetch_flags(aTHX_ r, nums[i], ret, REG_FETCH_ABSOLUTE);
11994                    if (!retarray)
11995                        return ret;
11996                } else {
11997                    if (retarray)
11998                        ret = newSV_type(SVt_NULL);
11999                }
12000                if (retarray)
12001                    av_push_simple(retarray, ret);
12002            }
12003            if (retarray)
12004                return newRV_noinc(MUTABLE_SV(retarray));
12005        }
12006    }
12007    return NULL;
12008}
12009
12010bool
12011Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
12012                           const U32 flags)
12013{
12014    struct regexp *const rx = ReANY(r);
12015
12016    PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
12017
12018    if (rx && RXp_PAREN_NAMES(rx)) {
12019        if (flags & RXapif_ALL) {
12020            return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
12021        } else {
12022            SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
12023            if (sv) {
12024                SvREFCNT_dec_NN(sv);
12025                return TRUE;
12026            } else {
12027                return FALSE;
12028            }
12029        }
12030    } else {
12031        return FALSE;
12032    }
12033}
12034
12035SV*
12036Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
12037{
12038    struct regexp *const rx = ReANY(r);
12039
12040    PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
12041
12042    if ( rx && RXp_PAREN_NAMES(rx) ) {
12043        (void)hv_iterinit(RXp_PAREN_NAMES(rx));
12044
12045        return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
12046    } else {
12047        return FALSE;
12048    }
12049}
12050
12051SV*
12052Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
12053{
12054    struct regexp *const rx = ReANY(r);
12055    DECLARE_AND_GET_RE_DEBUG_FLAGS;
12056
12057    PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
12058
12059    if (rx && RXp_PAREN_NAMES(rx)) {
12060        HV *hv = RXp_PAREN_NAMES(rx);
12061        HE *temphe;
12062        while ( (temphe = hv_iternext_flags(hv, 0)) ) {
12063            IV i;
12064            IV parno = 0;
12065            SV* sv_dat = HeVAL(temphe);
12066            I32 *nums = (I32*)SvPVX(sv_dat);
12067            for ( i = 0; i < SvIVX(sv_dat); i++ ) {
12068                if ((I32)(RXp_LASTPAREN(rx)) >= nums[i] &&
12069                    RXp_OFFS_VALID(rx,nums[i]))
12070                {
12071                    parno = nums[i];
12072                    break;
12073                }
12074            }
12075            if (parno || flags & RXapif_ALL) {
12076                return newSVhek(HeKEY_hek(temphe));
12077            }
12078        }
12079    }
12080    return NULL;
12081}
12082
12083SV*
12084Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
12085{
12086    SV *ret;
12087    AV *av;
12088    SSize_t length;
12089    struct regexp *const rx = ReANY(r);
12090
12091    PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
12092
12093    if (rx && RXp_PAREN_NAMES(rx)) {
12094        if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
12095            return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
12096        } else if (flags & RXapif_ONE) {
12097            ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
12098            av = MUTABLE_AV(SvRV(ret));
12099            length = av_count(av);
12100            SvREFCNT_dec_NN(ret);
12101            return newSViv(length);
12102        } else {
12103            Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
12104                                                (int)flags);
12105            return NULL;
12106        }
12107    }
12108    return &PL_sv_undef;
12109}
12110
12111SV*
12112Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
12113{
12114    struct regexp *const rx = ReANY(r);
12115    AV *av = newAV();
12116
12117    PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
12118
12119    if (rx && RXp_PAREN_NAMES(rx)) {
12120        HV *hv= RXp_PAREN_NAMES(rx);
12121        HE *temphe;
12122        (void)hv_iterinit(hv);
12123        while ( (temphe = hv_iternext_flags(hv, 0)) ) {
12124            IV i;
12125            IV parno = 0;
12126            SV* sv_dat = HeVAL(temphe);
12127            I32 *nums = (I32*)SvPVX(sv_dat);
12128            for ( i = 0; i < SvIVX(sv_dat); i++ ) {
12129                if ((I32)(RXp_LASTPAREN(rx)) >= nums[i] &&
12130                    RXp_OFFS_VALID(rx,nums[i]))
12131                {
12132                    parno = nums[i];
12133                    break;
12134                }
12135            }
12136            if (parno || flags & RXapif_ALL) {
12137                av_push_simple(av, newSVhek(HeKEY_hek(temphe)));
12138            }
12139        }
12140    }
12141
12142    return newRV_noinc(MUTABLE_SV(av));
12143}
12144
12145void
12146Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const re, const I32 paren,
12147                             SV * const sv)
12148{
12149    PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
12150    Perl_reg_numbered_buff_fetch_flags(aTHX_ re, paren, sv, 0);
12151}
12152
12153#ifndef PERL_IN_XSUB_RE
12154
12155void
12156Perl_reg_numbered_buff_fetch_flags(pTHX_ REGEXP * const re, const I32 paren,
12157                                   SV * const sv, U32 flags)
12158{
12159    struct regexp *const rx = ReANY(re);
12160    char *s = NULL;
12161    SSize_t i,t = 0;
12162    SSize_t s1, t1;
12163    I32 n = paren;
12164    I32 logical_nparens = rx->logical_nparens ? rx->logical_nparens : rx->nparens;
12165
12166    PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH_FLAGS;
12167
12168    if (      n == RX_BUFF_IDX_CARET_PREMATCH
12169           || n == RX_BUFF_IDX_CARET_FULLMATCH
12170           || n == RX_BUFF_IDX_CARET_POSTMATCH
12171       )
12172    {
12173        bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
12174        if (!keepcopy) {
12175            /* on something like
12176             *    $r = qr/.../;
12177             *    /$qr/p;
12178             * the KEEPCOPY is set on the PMOP rather than the regex */
12179            if (PL_curpm && re == PM_GETRE(PL_curpm))
12180                 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
12181        }
12182        if (!keepcopy)
12183            goto ret_undef;
12184    }
12185
12186    if (!RXp_SUBBEG(rx))
12187        goto ret_undef;
12188
12189    if (n == RX_BUFF_IDX_CARET_FULLMATCH)
12190        /* no need to distinguish between them any more */
12191        n = RX_BUFF_IDX_FULLMATCH;
12192
12193    if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
12194        && (i = RXp_OFFS_START(rx,0)) != -1)
12195    {
12196        /* $`, ${^PREMATCH} */
12197        s = RXp_SUBBEG(rx);
12198    }
12199    else
12200    if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
12201        && (t = RXp_OFFS_END(rx,0)) != -1)
12202    {
12203        /* $', ${^POSTMATCH} */
12204        s = RXp_SUBBEG(rx) - RXp_SUBOFFSET(rx) + t;
12205        i = RXp_SUBLEN(rx) + RXp_SUBOFFSET(rx) - t;
12206    }
12207    else /* when flags is true we do an absolute lookup, and compare against rx->nparens */
12208    if (inRANGE(n, 0, flags ? (I32)rx->nparens : logical_nparens)) {
12209        I32 *map = (!flags && n) ? rx->logical_to_parno : NULL;
12210        I32 true_parno = map ? map[n] : n;
12211        do {
12212            if (((s1 = RXp_OFFS_START(rx,true_parno)) != -1)  &&
12213                ((t1 = RXp_OFFS_END(rx,true_parno)) != -1))
12214            {
12215                /* $&, ${^MATCH}, $1 ... */
12216                i = t1 - s1;
12217                s = RXp_SUBBEG(rx) + s1 - RXp_SUBOFFSET(rx);
12218                goto found_it;
12219            }
12220            else if (map) {
12221                true_parno = rx->parno_to_logical_next[true_parno];
12222            }
12223            else {
12224                break;
12225            }
12226        } while (true_parno);
12227        goto ret_undef;
12228    } else {
12229        goto ret_undef;
12230    }
12231
12232  found_it:
12233    assert(s >= RXp_SUBBEG(rx));
12234    assert((STRLEN)RXp_SUBLEN(rx) >= (STRLEN)((s - RXp_SUBBEG(rx)) + i) );
12235    if (i >= 0) {
12236#ifdef NO_TAINT_SUPPORT
12237        sv_setpvn(sv, s, i);
12238#else
12239        const int oldtainted = TAINT_get;
12240        TAINT_NOT;
12241        sv_setpvn(sv, s, i);
12242        TAINT_set(oldtainted);
12243#endif
12244        if (RXp_MATCH_UTF8(rx))
12245            SvUTF8_on(sv);
12246        else
12247            SvUTF8_off(sv);
12248        if (TAINTING_get) {
12249            if (RXp_MATCH_TAINTED(rx)) {
12250                if (SvTYPE(sv) >= SVt_PVMG) {
12251                    MAGIC* const mg = SvMAGIC(sv);
12252                    MAGIC* mgt;
12253                    TAINT;
12254                    SvMAGIC_set(sv, mg->mg_moremagic);
12255                    SvTAINT(sv);
12256                    if ((mgt = SvMAGIC(sv))) {
12257                        mg->mg_moremagic = mgt;
12258                        SvMAGIC_set(sv, mg);
12259                    }
12260                } else {
12261                    TAINT;
12262                    SvTAINT(sv);
12263                }
12264            } else
12265                SvTAINTED_off(sv);
12266        }
12267    } else {
12268      ret_undef:
12269        sv_set_undef(sv);
12270        return;
12271    }
12272}
12273
12274#endif
12275
12276void
12277Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
12278                                                         SV const * const value)
12279{
12280    PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
12281
12282    PERL_UNUSED_ARG(rx);
12283    PERL_UNUSED_ARG(paren);
12284    PERL_UNUSED_ARG(value);
12285
12286    if (!PL_localizing)
12287        Perl_croak_no_modify();
12288}
12289
12290I32
12291Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
12292                              const I32 paren)
12293{
12294    struct regexp *const rx = ReANY(r);
12295    I32 i,j;
12296    I32 s1, t1;
12297    I32 logical_nparens = rx->logical_nparens ? rx->logical_nparens : rx->nparens;
12298
12299    PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
12300
12301    if (   paren == RX_BUFF_IDX_CARET_PREMATCH
12302        || paren == RX_BUFF_IDX_CARET_FULLMATCH
12303        || paren == RX_BUFF_IDX_CARET_POSTMATCH
12304    )
12305    {
12306        bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
12307        if (!keepcopy) {
12308            /* on something like
12309             *    $r = qr/.../;
12310             *    /$qr/p;
12311             * the KEEPCOPY is set on the PMOP rather than the regex */
12312            if (PL_curpm && r == PM_GETRE(PL_curpm))
12313                 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
12314        }
12315        if (!keepcopy)
12316            goto warn_undef;
12317    }
12318
12319    /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
12320    switch (paren) {
12321      case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
12322      case RX_BUFF_IDX_PREMATCH:       /* $` */
12323        if ( (i = RXp_OFFS_START(rx,0)) != -1) {
12324            if (i > 0) {
12325                s1 = 0;
12326                t1 = i;
12327                goto getlen;
12328            }
12329        }
12330        return 0;
12331
12332      case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
12333      case RX_BUFF_IDX_POSTMATCH:       /* $' */
12334        if ( (j = RXp_OFFS_END(rx,0)) != -1 ) {
12335            i = RXp_SUBLEN(rx) - j;
12336            if (i > 0) {
12337                s1 = j;
12338                t1 = RXp_SUBLEN(rx);
12339                goto getlen;
12340            }
12341        }
12342        return 0;
12343
12344      default: /* $& / ${^MATCH}, $1, $2, ... */
12345        if (paren <= logical_nparens) {
12346            I32 true_paren = rx->logical_to_parno
12347                             ? rx->logical_to_parno[paren]
12348                             : paren;
12349            do {
12350                if (((s1 = RXp_OFFS_START(rx,true_paren)) != -1) &&
12351                    ((t1 = RXp_OFFS_END(rx,true_paren)) != -1))
12352                {
12353                    i = t1 - s1;
12354                    goto getlen;
12355                } else if (rx->parno_to_logical_next) {
12356                    true_paren = rx->parno_to_logical_next[true_paren];
12357                } else {
12358                    break;
12359                }
12360            } while(true_paren);
12361        }
12362      warn_undef:
12363        if (ckWARN(WARN_UNINITIALIZED))
12364            report_uninit((const SV *)sv);
12365        return 0;
12366    }
12367  getlen:
12368    if (i > 0 && RXp_MATCH_UTF8(rx)) {
12369        const char * const s = RXp_SUBBEG(rx) - RXp_SUBOFFSET(rx) + s1;
12370        const U8 *ep;
12371        STRLEN el;
12372
12373        i = t1 - s1;
12374        if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
12375            i = el;
12376    }
12377    return i;
12378}
12379
12380/*
12381 * ex: set ts=8 sts=4 sw=4 et:
12382 */
12383