regexec.c - make REF into a backtracking state

This way we do the required paren restoration only when it is actually needed.
When we match a REF type node which is potentially a reference to an unclosed
paren, we push the match context information. Currently we save "everything",
but a future patch can make this more efficient by adding a new parameter to
the REF regop to track which parens it should save.

This converts the backtracking changes from the previous commit so that they
are run only when specifically enabled via the define RE_PESSIMISTIC_PARENS,
which is 0 by default. We don't make the new fields in the struct conditional,
as the stack frames are large and our changes don't make any real difference,
and it keeps things simpler to not have conditional members, especially since
some of the structures have to line up with each other.

If enabling RE_PESSIMISTIC_PARENS fixes a backtracking bug, then it means
something is sensitive to us not necessarily restoring the parens properly on
failure. We make some assumptions: that the paren state after a failing state
will be corrected by a future successful state, or that the state of the
parens is irrelevant as we will fail anyway. These assumptions can be violated
by EVAL, backrefs, and potentially some other scenarios. Thus I have left this
inefficient logic in place, but guarded by the flag.
This commit is contained in:
Yves Orton 2023-01-14 14:38:38 +01:00
parent 38508ce8fc
commit 59db194299
7 changed files with 108 additions and 23 deletions

View File

@ -6039,6 +6039,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
? REFFL
: REFF),
num);
if (RExC_nestroot && num >= RExC_nestroot)
REGNODE_p(ret)->flags = VOLATILE_REF;
if (OP(REGNODE_p(ret)) == REFF) {
RExC_seen_d_op = TRUE;
}

View File

@ -13,6 +13,12 @@
#define PERL_REGCOMP_H_
/* Define this to 1 if you want to enable a really aggressive and inefficient
 * paren cleanup during backtracking. We should pass tests with this as 0.
 *
 * The #ifndef guard means a definition made before this point takes
 * precedence; otherwise the value defaults to 0 (disabled). */
#ifndef RE_PESSIMISTIC_PARENS
#define RE_PESSIMISTIC_PARENS 0
#endif
#include "regcharclass.h"
/* Convert branch sequences to more efficient trie ops? */
@ -1483,6 +1489,8 @@ typedef enum {
#define EVAL_OPTIMISTIC_FLAG 128
#define EVAL_FLAGS_MASK (EVAL_OPTIMISTIC_FLAG-1)
#endif /* PERL_REGCOMP_H_ */
/*

View File

@ -344,3 +344,4 @@ MARKPOINT next:FAIL
SKIP next:FAIL
CUTGROUP next:FAIL
KEEPS next:FAIL
REF next:FAIL

View File

@ -1258,4 +1258,6 @@ static const scan_data_t zero_scan_data = {
/* Assign to NEXT_OFF() of the regnode obtained via REGNODE_p(ret) the value
 * (sizeof(t1) + sizeof(t2)) / sizeof(regnode), i.e. the combined size of the
 * two types t1 and t2 expressed in units of regnode. */
#define REGNODE_STEP_OVER(ret,t1,t2) \
NEXT_OFF(REGNODE_p(ret)) = ((sizeof(t1)+sizeof(t2))/sizeof(regnode))
/* Value stored in the flags field of a REF-type regnode by S_regatom() when
 * RExC_nestroot is set and the referenced group number is >= RExC_nestroot.
 * S_regmatch() checks the flags field after such a backreference has matched
 * and, when it is non-zero, saves the paren state with regcppush() and pushes
 * a REF_next state so that the parens are restored via regcppop() if the
 * remainder of the match fails. */
#define VOLATILE_REF 1
#endif /* REGCOMP_INTERNAL_H */

View File

@ -6841,8 +6841,10 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
case TRIE_next_fail: /* we failed - try next alternative */
{
U8 *uc;
REGCP_UNWIND(ST.lastcp);
regcppop(rex,&maxopenparen);
if (RE_PESSIMISTIC_PARENS) {
REGCP_UNWIND(ST.lastcp);
regcppop(rex,&maxopenparen);
}
if ( ST.jump ) {
/* undo any captures done in the tail part of a branch,
* e.g.
@ -6965,8 +6967,10 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
});
if ( ST.accepted > 1 || has_cutgroup || ST.jump ) {
(void)regcppush(rex, 0, maxopenparen);
REGCP_SET(ST.lastcp);
if (RE_PESSIMISTIC_PARENS) {
(void)regcppush(rex, 0, maxopenparen);
REGCP_SET(ST.lastcp);
}
PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc, loceol,
script_run_begin);
NOT_REACHED; /* NOTREACHED */
@ -7983,6 +7987,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
utf8_fold_flags = 0;
goto do_ref;
#undef ST
#define ST st->u.backref
case REF: /* /\1/ */
folder = NULL;
fold_array = NULL;
@ -8018,8 +8024,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
endref = rex->offs[n].end;
if (ln == -1 || endref == -1)
sayNO; /* Do not match unless seen CLOSEn. */
if (ln == endref)
break;
goto ref_yes;
s = reginfo->strbeg + ln;
if (type != REF /* REF can do byte comparison */
@ -8038,7 +8045,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
sayNO;
}
locinput = limit;
break;
goto ref_yes;
}
/* Not utf8: Inline the first character, for speed. */
@ -8058,8 +8065,26 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
: ! folder(aTHX_ locinput, s, ln)))
sayNO;
locinput += ln;
break;
}
ref_yes:
if (scan->flags) { /* == VOLATILE_REF but only other value is 0 */
ST.cp = regcppush(rex, 0, maxopenparen);
REGCP_SET(ST.lastcp);
PUSH_STATE_GOTO(REF_next, next, locinput, loceol,
script_run_begin);
}
break;
NOT_REACHED; /* NOTREACHED */
case REF_next:
sayYES;
break;
case REF_next_fail:
REGCP_UNWIND(ST.lastcp);
regcppop(rex, &maxopenparen);
sayNO;
break;
case NOTHING: /* null op; e.g. the 'nothing' following
* the '*' in m{(a+|b)*}' */
@ -9005,8 +9030,7 @@ NULL
);
/* Try grabbing another A and see if it helps. */
cur_curlyx->u.curlyx.lastloc = locinput;
ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
maxopenparen);
ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, maxopenparen);
REGCP_SET(ST.lastcp);
PUSH_STATE_GOTO(WHILEM_A_min,
/*A*/ REGNODE_AFTER(ST.save_curlyx->u.curlyx.me),
@ -9035,8 +9059,10 @@ NULL
ST.lastcloseparen = rex->lastcloseparen;
ST.next_branch = next;
REGCP_SET(ST.cp);
regcppush(rex, 0, maxopenparen);
REGCP_SET(ST.lastcp);
if (RE_PESSIMISTIC_PARENS) {
regcppush(rex, 0, maxopenparen);
REGCP_SET(ST.lastcp);
}
/* Now go into the branch */
if (has_cutgroup) {
@ -9073,8 +9099,10 @@ NULL
do_cutgroup = 0;
no_final = 0;
}
REGCP_UNWIND(ST.lastcp);
regcppop(rex,&maxopenparen);
if (RE_PESSIMISTIC_PARENS) {
REGCP_UNWIND(ST.lastcp);
regcppop(rex,&maxopenparen);
}
REGCP_UNWIND(ST.cp);
UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
CAPTURE_CLEAR(ST.before_paren+1,ST.after_paren,"BRANCH_next_fail");
@ -9439,8 +9467,10 @@ NULL
case CURLY_B_min_fail:
/* failed to find B in a non-greedy match. */
REGCP_UNWIND(ST.lastcp);
regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
if (RE_PESSIMISTIC_PARENS) {
REGCP_UNWIND(ST.lastcp);
regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
}
REGCP_UNWIND(ST.cp);
if (ST.paren) {
UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
@ -9553,8 +9583,10 @@ NULL
}
curly_try_B_min:
(void)regcppush(rex, 0, maxopenparen);
REGCP_SET(ST.lastcp);
if (RE_PESSIMISTIC_PARENS) {
(void)regcppush(rex, 0, maxopenparen);
REGCP_SET(ST.lastcp);
}
CURLY_SETPAREN(ST.paren, ST.count);
PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput, loceol,
script_run_begin);
@ -9568,8 +9600,10 @@ NULL
&& locinput + ST.Binfo.min_length <= loceol
&& S_test_EXACTISH_ST(locinput, ST.Binfo)))
{
(void)regcppush(rex, 0, maxopenparen);
REGCP_SET(ST.lastcp);
if (RE_PESSIMISTIC_PARENS) {
(void)regcppush(rex, 0, maxopenparen);
REGCP_SET(ST.lastcp);
}
CURLY_SETPAREN(ST.paren, ST.count);
PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput, loceol,
script_run_begin);
@ -9581,8 +9615,10 @@ NULL
case CURLY_B_max_fail:
/* failed to find B in a greedy match */
REGCP_UNWIND(ST.lastcp);
regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
if (RE_PESSIMISTIC_PARENS) {
REGCP_UNWIND(ST.lastcp);
regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
}
CURLY_B_all_failed:
REGCP_UNWIND(ST.cp);
if (ST.paren) {

View File

@ -958,7 +958,7 @@ typedef struct regmatch_state {
struct {
U32 paren;
CHECKPOINT cp;
CHECKPOINT lastcp; /* remember current savestack index */
CHECKPOINT lastcp; /* remember current savestack index */
U32 lastparen;
U32 lastcloseparen;
char *maxpos; /* highest possible point in string to match */
@ -969,6 +969,10 @@ typedef struct regmatch_state {
struct next_matchable_info Binfo;
} curly; /* and CURLYN/PLUS/STAR */
/* Backtracking state used by the REF_next / REF_next_fail states, which are
 * pushed when a REF regnode with non-zero flags has matched. */
struct {
CHECKPOINT cp; /* value returned by regcppush() when the state is pushed */
CHECKPOINT lastcp; /* set with REGCP_SET(); handed to REGCP_UNWIND() in REF_next_fail */
} backref; /* REF and friends */
} u;
} regmatch_state;

View File

@ -128,7 +128,7 @@ typedef struct regnode tregnode_WHILEM;
/* Regops and State definitions */
#define REGNODE_MAX 111
#define REGMATCH_STATE_MAX 151
#define REGMATCH_STATE_MAX 153
/* -- For regexec.c to switch on target being utf8 (t8) or not (tb, b='byte'); */
#define with_t_UTF8ness(op, t_utf8) (((op) << 1) + (cBOOL(t_utf8)))
@ -1573,6 +1573,22 @@ typedef struct regnode tregnode_WHILEM;
#define KEEPS_next_fail_t8_pb 606 /* 0x25e */
#define KEEPS_next_fail_t8_p8 607 /* 0x25f */
#define REF_next 152 /* 0x98 state for REF */
#define REF_next_tb 304 /* 0x130 */
#define REF_next_t8 305 /* 0x131 */
#define REF_next_tb_pb 608 /* 0x260 */
#define REF_next_tb_p8 609 /* 0x261 */
#define REF_next_t8_pb 610 /* 0x262 */
#define REF_next_t8_p8 611 /* 0x263 */
#define REF_next_fail 153 /* 0x99 state for REF */
#define REF_next_fail_tb 306 /* 0x132 */
#define REF_next_fail_t8 307 /* 0x133 */
#define REF_next_fail_tb_pb 612 /* 0x264 */
#define REF_next_fail_tb_p8 613 /* 0x265 */
#define REF_next_fail_t8_pb 614 /* 0x266 */
#define REF_next_fail_t8_p8 615 /* 0x267 */
/* PL_regnode_name[] - Opcode/state names in string form, for debugging */
@ -1733,6 +1749,8 @@ EXTCONST char * const PL_regnode_name[] = {
"CUTGROUP_next_fail", /* REGNODE_MAX +0x26 */
"KEEPS_next", /* REGNODE_MAX +0x27 */
"KEEPS_next_fail", /* REGNODE_MAX +0x28 */
"REF_next", /* REGNODE_MAX +0x29 */
"REF_next_fail", /* REGNODE_MAX +0x2a */
};
#endif /* DOINIT */
@ -2806,6 +2824,20 @@ EXTCONST struct regnode_meta PL_regnode_info[] = {
.arg_len = 0,
.arg_len_varies = 0,
.off_by_arg = 0
},
{
/* #152 state REF_next */
.type = REF,
.arg_len = 0,
.arg_len_varies = 0,
.off_by_arg = 0
},
{
/* #153 state REF_next_fail */
.type = REF,
.arg_len = 0,
.arg_len_varies = 0,
.off_by_arg = 0
}
};
#endif /* DOINIT */