diff --git a/CHANGES b/CHANGES index 410fe15..c2c29ba 100644 --- a/CHANGES +++ b/CHANGES @@ -1,4 +1,10 @@ --- $MawkId: CHANGES,v 1.412 2024/12/14 17:00:30 tom Exp $ +-- $MawkId: CHANGES,v 1.414 2024/12/31 15:21:17 tom Exp $ + +20241231 + + in-progress changes to improve regex brace expressions, using new + machine codes M_ENTER and M_LOOP. + + add a T_CAT in compiled regex to fix a panic (report by Dimitar + Dimitrov). 20241214 + fix stricter gcc15 warnings. diff --git a/MANIFEST b/MANIFEST index 96447d6..c4846ab 100644 --- a/MANIFEST +++ b/MANIFEST @@ -1,4 +1,4 @@ -MANIFEST for mawk, version t20241214 +MANIFEST for mawk, version t20241231 -------------------------------------------------------------------------------- MANIFEST this file ACKNOWLEDGMENT acknowledgements diff --git a/mawk.h b/mawk.h index 8186403..d3315a5 100644 --- a/mawk.h +++ b/mawk.h @@ -11,7 +11,7 @@ the GNU General Public License, version 2, 1991. ********************************************/ /* - * $MawkId: mawk.h,v 1.76 2024/12/14 21:21:20 tom Exp $ + * $MawkId: mawk.h,v 1.77 2024/12/24 16:51:37 tom Exp $ */ /* mawk.h */ @@ -153,6 +153,8 @@ extern Int d_to_I(double); extern Long d_to_L(double); extern ULong d_to_UL(double d); +#define NonNull(s) ((s) == NULL ? "" : (s)) + #define d_to_i(d) ((int)d_to_I(d)) #define d_to_l(d) ((long)d_to_L(d)) diff --git a/package/debian/changelog b/package/debian/changelog index 9780c2e..4e4db7b 100644 --- a/package/debian/changelog +++ b/package/debian/changelog @@ -1,3 +1,9 @@ +mawk-cur (1.3.4-20241231) unstable; urgency=low + + * maintenance updates + + -- Thomas E. 
Dickey Tue, 31 Dec 2024 06:23:05 -0500 + mawk-cur (1.3.4-20241214) unstable; urgency=low * maintenance updates diff --git a/package/freebsd/Makefile b/package/freebsd/Makefile index aa92976..b84151c 100644 --- a/package/freebsd/Makefile +++ b/package/freebsd/Makefile @@ -2,7 +2,7 @@ # $FreeBSD: head/lang/mawk/Makefile 516890 2019-11-06 14:17:48Z wen $ PORTNAME= mawk -DISTVERSION= 1.3.4.20241214 +DISTVERSION= 1.3.4.20241231 CATEGORIES= lang MASTER_SITES= https://invisible-island.net/archives/${PORTNAME}/ \ https://invisible-mirror.net/archives/${PORTNAME}/ diff --git a/package/mawk.spec b/package/mawk.spec index 3e1b15f..887535f 100644 --- a/package/mawk.spec +++ b/package/mawk.spec @@ -1,9 +1,9 @@ Summary: mawk - pattern scanning and text processing language %global AppProgram mawk %global AppVersion 1.3.4 -%global AppPatched 20241214 +%global AppPatched 20241231 %global MySite https://invisible-island.net -# $MawkId: mawk.spec,v 1.136 2024/12/14 17:00:30 tom Exp $ +# $MawkId: mawk.spec,v 1.137 2024/12/31 11:23:05 tom Exp $ Name: %{AppProgram} Version: %{AppVersion} Release: %{AppPatched} diff --git a/patchlev.h b/patchlev.h index 51890b8..d8bee74 100644 --- a/patchlev.h +++ b/patchlev.h @@ -11,9 +11,9 @@ the GNU General Public License, version 2, 1991. */ /* - * $MawkId: patchlev.h,v 1.163 2024/12/14 17:00:30 tom Exp $ + * $MawkId: patchlev.h,v 1.164 2024/12/31 11:23:05 tom Exp $ */ #define PATCH_BASE 1 #define PATCH_LEVEL 3 #define PATCH_STRING ".4" -#define DATE_STRING "20241214" +#define DATE_STRING "20241231" diff --git a/regexp.c b/regexp.c index 5d4cc80..7c8405b 100644 --- a/regexp.c +++ b/regexp.c @@ -10,7 +10,7 @@ Mawk is distributed without warranty under the terms of the GNU General Public License, version 2, 1991. */ -/* $MawkId: regexp.c,v 1.16 2024/08/25 17:34:05 tom Exp $ */ +/* $MawkId: regexp.c,v 1.17 2024/12/31 15:13:35 tom Exp $ */ #include @@ -24,8 +24,18 @@ the GNU General Public License, version 2, 1991. 
#define Visible_RT_STATE #define Visible_STATE # include -#define RE_FILL() { goto refill; } -#define RE_CASE() { goto reswitch; } + +#define RE_FILL() do { TRACE2((rt_form "refill...\n", rt_args)); goto refill; } while (0) +#define RE_CASE() do { goto reswitch; } while (0) + +#define rt_form "[%s@%d] %d:%03d " +#define rt_args __FILE__, __LINE__, \ + (int)(run_entry - RE_run_stack_base), \ + (int)(m - machine) + +#define TR_AT(what) \ + TRACE2((rt_form "%s\n", rt_args, what)) + # include # include # include diff --git a/rexp.c b/rexp.c index 4998ed3..a651d2a 100644 --- a/rexp.c +++ b/rexp.c @@ -11,7 +11,7 @@ the GNU General Public License, version 2, 1991. ********************************************/ /* - * $MawkId: rexp.c,v 1.53 2024/12/14 12:55:59 tom Exp $ + * $MawkId: rexp.c,v 1.56 2024/12/31 12:56:54 tom Exp $ */ /* op precedence parser for regular expressions */ @@ -19,6 +19,10 @@ the GNU General Public License, version 2, 1991. #include #include +#ifndef FIXME_INTERVAL_LIMITS +#define FIXME_INTERVAL_LIMITS 0 /* =1 for pre-bugfix */ +#endif + /* DATA */ int REerrno; const char *const REerrlist[] = @@ -118,6 +122,91 @@ typedef struct { int prec; } OPS; +#ifndef NO_INTERVAL_EXPR +/* duplicate a machine, oldmp into newmp */ +static void +duplicate_m(MACHINE * newmp, MACHINE * oldmp) +{ + register STATE *p; + TRACE(("duplicate_m %p -> %p\n", (void *) oldmp, (void *) newmp)); + TRACE(("...start %p\n", (void *) oldmp->start)); + TRACE(("...stop %p\n", (void *) oldmp->stop)); + p = (STATE *) RE_malloc(2 * STATESZ); + RE_copy_states(p, oldmp->start, 2); + newmp->start = (STATE *) p; + newmp->stop = (STATE *) (p + 1); +} + +static void +RE_set_limit(MACHINE * mp, Int minlimit, Int maxlimit) +{ + STATE *p = mp->start; + STATE *q = NULL; + + if (p->s_type == M_2JA) + ++p; + + if (p->s_type == M_SAVE_POS) { + int depth = 0; + STATE *r = p; + do { + switch (r->s_type) { + case M_SAVE_POS: + depth++; + break; + case M_2JC: + case M_LOOP: + if (--depth == 0) { + q = r; + 
} + break; + case M_ACCEPT: + depth = -1; + break; + } + ++r; + } while (depth > 0); + } + if (q != NULL) { + size_t len = (size_t) (mp->stop - mp->start + 2); + int offset = (int) (q - mp->start); + + q->s_type = M_LOOP; + q->it_min = minlimit; + q->it_max = maxlimit; + + /* reallocate the states, to insert an item at the beginning */ + mp->start = (STATE *) RE_realloc(mp->start, len * STATESZ); + mp->stop = mp->start + len - 1; + q = mp->start; + while (--len != 0) { + q[len] = q[len - 1]; + } + q->s_type = M_ENTER; + q->s_data.jump = offset + 1; + } +} + +/* replace m with m* limited to the max iterations + (variation of m* closure) */ +static void +RE_close_limit(MACHINE * mp, Int min_limit, Int max_limit) +{ + RE_close(mp); + RE_set_limit(mp, min_limit, max_limit); +} + +/* replace m with m+ limited to the max iterations + which is one or more, limited + (variation of m+ positive closure) */ +static void +RE_poscl_limit(MACHINE * mp, Int min_limit, Int max_limit) +{ + RE_poscl(mp); + RE_set_limit(mp, min_limit, max_limit); +} +#endif /* ! 
NO_INTERVAL_EXPR */ + /* duplicate_m() relies upon copying machines whose size is 1, i.e., atoms */ #define BigMachine(mp) (((mp)->stop - (mp)->start) > 1) @@ -156,8 +245,15 @@ REcompile(char *re, size_t len) t = RE_lex(m_stack(0)); memset(m_ptr, 0, sizeof(*m_ptr)); + /* provide for making the trace a little easier to read by indenting */ +#if OPT_TRACE > 1 +#define M_FMT(format) "@%d: %*s " format, __LINE__, 4 * ((int) (m_ptr - m_array)), " " +#else +#define M_FMT(format) format +#endif + while (1) { - TRACE(("RE_lex token %s\n", token_name(t))); + TRACE((M_FMT("RE_lex token %s\n"), token_name(t))); switch (t) { case T_STR: case T_ANY: @@ -179,7 +275,7 @@ REcompile(char *re, size_t len) * convert m{3,} to mmm* (with a limit of MAX_INT) * convert m{3,10} to mmm* with a limit of 10 */ - TRACE(("interval {%ld,%ld}\n", (long) intrvalmin, (long) intrvalmax)); + TRACE((M_FMT("interval {%ld,%ld}\n"), (long) intrvalmin, (long) intrvalmax)); if ((m_ptr - m_array) < STACKSZ) memset(m_ptr + 1, 0, sizeof(*m_ptr)); if (intrvalmin == 0) { /* zero or more */ @@ -227,49 +323,49 @@ REcompile(char *re, size_t len) RE_free(m_ptr->start); m_ptr--; } - TRACE(("RE_lex token %s\n", + TRACE((M_FMT("RE_lex token %s\n"), "of zero interval is ignored!")); break; case 1: RE_01(m_ptr); /* m{0,1} which is m? 
*/ - TRACE(("RE_lex token %s\n", token_name(T_Q))); + TRACE((M_FMT("RE_lex token %s\n"), token_name(T_Q))); break; default: RE_close_limit(m_ptr, intrvalmin, intrvalmax); - TRACE(("RE_lex token %s\n", token_name(T_Q))); + TRACE((M_FMT("RE_lex token %s\n"), token_name(T_Q))); } } else if (BigMachine(m_ptr)) { RE_poscl_limit(m_ptr, intrvalmin, intrvalmax); -#ifdef NO_RI_LOOP_UNROLL - } else if (intrvalmin >= 1) { /* one or more */ - RE_poscl_limit(m_ptr, intrvalmin, intrvalmax); - TRACE(("RE_lex token %s\n", token_name(T_PLUS))); -#else } else if (intrvalmin == 1) { /* one or more */ RE_poscl_limit(m_ptr, intrvalmin, intrvalmax); - TRACE(("RE_lex token %s\n", token_name(T_PLUS))); -#endif } else if (m_ptr->start != NULL) { /* n or more */ - register Int i; - /* copy 2 copies of m_ptr, use 2nd copy to replace - the first copy that gets swallowed by concat */ - MACHINE *result_mp = m_ptr; - MACHINE *concat_mp = (m_ptr + 1); - MACHINE *new_mp = (m_ptr + 2); - TRACE(("calling duplicate_m result_mp %ld -> concat_mp %ld\n", - result_mp - m_array, - concat_mp - m_array)); - duplicate_m(concat_mp, result_mp); - TRACE(("calling duplicate_m result_mp %ld -> new_mp %ld\n", - result_mp - m_array, - new_mp - m_array)); - duplicate_m(new_mp, result_mp); - for (i = 2; i <= intrvalmin; i++) { - RE_cat(result_mp, concat_mp); - duplicate_m(concat_mp, new_mp); + /* loop-unrolling only works if min==max, so that the loops in + * test/match functions can process the whole loop in each + * iteration */ + if (FIXME_INTERVAL_LIMITS || intrvalmin == intrvalmax) { + register Int i; + /* copy 2 copies of m_ptr, use 2nd copy to replace + the first copy that gets swallowed by concat */ + MACHINE *result_mp = m_ptr; + MACHINE *concat_mp = (m_ptr + 1); + MACHINE *new_mp = (m_ptr + 2); + TRACE((M_FMT("calling duplicate_m result_mp %ld -> concat_mp %ld\n"), + result_mp - m_array, + concat_mp - m_array)); + duplicate_m(concat_mp, result_mp); + TRACE((M_FMT("calling duplicate_m result_mp %ld -> new_mp 
%ld\n"), + result_mp - m_array, + new_mp - m_array)); + duplicate_m(new_mp, result_mp); + for (i = 2; i <= intrvalmin; i++) { + RE_cat(result_mp, concat_mp); + duplicate_m(concat_mp, new_mp); + } + /* don't need 2nd copy in new_mp */ + RE_free(new_mp->start); + } else { + RE_poscl_limit(m_ptr, intrvalmin, intrvalmax); } - /* don't need 2nd copy in new_mp */ - RE_free(new_mp->start); } break; #endif /* ! NO_INTERVAL_EXPR */ @@ -358,7 +454,7 @@ REcompile(char *re, size_t len) op_ptr->token = t; } /* end of switch */ - if (m_ptr == m_stack(STACKSZ - 1)) { + if (m_ptr >= m_stack(STACKSZ - 1)) { /*overflow */ RE_error_trap(-ERR_5); } @@ -431,19 +527,3 @@ REerror(void) { return REerrlist[REerrno]; } - -#ifndef NO_INTERVAL_EXPR -/* duplicate a machine, oldmp into newmp */ -void -duplicate_m(MACHINE * newmp, MACHINE * oldmp) -{ - register STATE *p; - TRACE(("duplicate_m %p -> %p\n", (void *) oldmp, (void *) newmp)); - TRACE(("...start %p\n", (void *) oldmp->start)); - TRACE(("...stop %p\n", (void *) oldmp->stop)); - p = (STATE *) RE_malloc(2 * STATESZ); - RE_copy_states(p, oldmp->start, 2); - newmp->start = (STATE *) p; - newmp->stop = (STATE *) (p + 1); -} -#endif /* NO_INTERVAL_EXPR */ diff --git a/rexp.h b/rexp.h index be29e29..9949854 100644 --- a/rexp.h +++ b/rexp.h @@ -12,7 +12,7 @@ the GNU General Public License, version 2, 1991. 
********************************************/ /* - * $MawkId: rexp.h,v 1.44 2024/11/11 20:59:21 tom Exp $ + * $MawkId: rexp.h,v 1.45 2024/12/31 11:42:44 tom Exp $ */ #ifndef REXP_H @@ -46,7 +46,11 @@ typedef enum { ,M_2JA /* optional (undesirable) jump */ ,M_2JB /* optional (desirable) jump */ ,M_SAVE_POS /* push position onto stack */ - ,M_2JC /* pop pos'n, optional jump if advanced */ + ,M_2JC /* pop position, optional jump if advanced */ +#ifndef NO_INTERVAL_EXPR + ,M_ENTER /* begin counted loop (reset counter) */ + ,M_LOOP /* end counted loop (update/test counter) */ +#endif ,M_ACCEPT /* end of match */ ,U_ON /* ...distinct from the preceding */ } MAWK_REGEX; @@ -71,8 +75,8 @@ typedef struct _state int jump; } s_data; #ifndef NO_INTERVAL_EXPR - Int it_min; /* used for s_type == M_2JC */ - Int it_max; /* used for s_type == M_2JC */ + Int it_min; /* used for s_type == M_LOOP */ + Int it_max; /* used for s_type == M_LOOP */ Int it_cnt; #endif } @@ -139,8 +143,8 @@ typedef struct _rt_state STATE *m; /* save the machine ptr */ int u; /* save the u_flag */ char *s; /* save the active string ptr */ - int sp; /* size of position stack */ - int tp; /* offset to top entry of position stack */ + int pos_index; /* index into position stack */ + int top_index; /* offset to top entry of position stack */ char *ss; /* save the match start -- only used by REmatch */ } #endif @@ -157,7 +161,7 @@ typedef struct _rt_pos_entry /* run time stack frame responsible for removing this node */ int owner; - /* previous node is this - this->prev_offset. See RE_pos_pop() */ + /* previous node is this - this->prev_offset. See pos_pop() */ int prev_offset; } #endif @@ -184,12 +188,6 @@ extern STATE *RE_poscl(MACHINE *); extern void RE_01(MACHINE *); extern GCC_NORETURN void RE_panic(const char *, ...) 
GCC_PRINTFLIKE(1,2); -#ifndef NO_INTERVAL_EXPR -extern void RE_close_limit(MACHINE *, Int, Int); -extern void RE_poscl_limit(MACHINE *, Int, Int); -extern void duplicate_m(MACHINE *, MACHINE *); -#endif - #ifndef MAWK_H extern char *str_str(char *, size_t, char *, size_t); #endif @@ -213,73 +211,57 @@ extern Int intrvalmin; extern Int intrvalmax; extern char *re_exp; -#if defined(LOCAL_REGEXP) && defined(REGEXP_INTERNALS) -static /* inline */ RT_POS_ENTRY * -RE_pos_push(RT_POS_ENTRY * head, const RT_STATE * owner, const char *s) -{ - head->pos = s; - head->owner = (int) (owner - RE_run_stack_base); - - if (++head == RE_pos_stack_limit) { - head = RE_new_pos_stack(); - } - head->prev_offset = 1; - return head; -} - -static /* inline */ const char * -RE_pos_pop(RT_POS_ENTRY ** head, const RT_STATE * current) -{ - RT_POS_ENTRY *prev2 = *head - (*head)->prev_offset; - - if (prev2->owner == current - RE_run_stack_base) { /* likely */ - /* no need to preserve intervening nodes */ - *head = prev2; - } else if (*head == prev2) { - RE_panic("unbalanced M_SAVE_POS and M_2JC"); - } else { - (*head)->prev_offset += prev2->prev_offset; - } - - return prev2->pos; -} - -#ifndef NO_INTERVAL_EXPR -/* reset it_cnt to zero for the M_2JC state - * which is where loop count is checked - */ -static void -RE_init_it_cnt(STATE * s) -{ - STATE *p = s; - while (p->s_type < M_ACCEPT) { - if (p->s_type == M_2JC) - p->it_cnt = 0; - p++; - } -} +#if OPT_TRACE +#define if_TRACE(stmt) stmt #else -#define RE_init_it_cnt(s) /* nothing */ +#define if_TRACE(stmt) /*nothing*/ #endif -#ifndef NO_INTERVAL_EXPR -#undef NO_RI_LOOP_UNROLL /* experimental 2020/10/22 -TD */ -#ifdef NO_RI_LOOP_UNROLL -#else -static void -RE_set_limit(STATE * s, Int minlimit, Int maxlimit) -{ - STATE *p = s; - while (p->s_type < M_ACCEPT) { - if (p->s_type == M_2JC) { - p->it_min = minlimit; - p->it_max = maxlimit; - } - p++; - } -} -#endif /* ! NO_RI_LOOP_UNROLL */ -#endif /* ! 
NO_INTERVAL_EXPR */ +#define pos_push(pos_param, run_param, position) do { \ + pos_param->pos = (position); /* store the argument, not the caller's "s" */ \ + pos_param->owner = (int) (run_param - RE_run_stack_base); \ + \ + TRACE2(("[%s@%d] pos_push #%ld: \"%s\" owner %d\n", \ + __FILE__, __LINE__, \ + (pos_param - RE_pos_stack_base), \ + NonNull(position), \ + pos_param->owner)); \ + \ + if (++pos_param == RE_pos_stack_limit) { \ + pos_param = RE_new_pos_stack(); \ + } \ + if_TRACE(pos_param->pos = NULL); \ + if_TRACE(pos_param->owner = 0); \ + pos_param->prev_offset = 1; \ +} while (0) + +#define pos_pop(pos_param, run_param, popped_position) do { \ + RT_POS_ENTRY *prev2 = pos_param - pos_param->prev_offset; \ + \ + if (prev2->owner == run_param - RE_run_stack_base) { /* likely */ \ + /* no need to preserve intervening nodes */ \ + TRACE2(("[%s@%d] pos_pop #%ld -> #%ld \"%s\" owner %d\n", \ + __FILE__, __LINE__, \ + (pos_param - RE_pos_stack_base), \ + (prev2 - RE_pos_stack_base), \ + NonNull(prev2->pos), \ + prev2->owner)); \ + pos_param = prev2; \ + } else if (pos_param == prev2) { \ + RE_panic("unbalanced M_SAVE_POS and M_2JC"); \ + } else { \ + TRACE2(("[%s@%d] pos_pop #%ld: \"%s\" offset %d -> %d\n", \ + __FILE__, __LINE__, \ + (pos_param - RE_pos_stack_base), \ + NonNull(pos_param->pos), \ + pos_param->prev_offset, \ + pos_param->prev_offset + prev2->prev_offset)); \ + pos_param->prev_offset += prev2->prev_offset; \ + } \ + popped_position = prev2->pos; \ +} while (0) + +#if defined(LOCAL_REGEXP) && defined(REGEXP_INTERNALS) #ifdef NO_LEAKS extern void RE_copy_states(STATE *, const STATE *, size_t); diff --git a/rexp0.c b/rexp0.c index bf4d7f6..2ce168a 100644 --- a/rexp0.c +++ b/rexp0.c @@ -12,7 +12,7 @@ the GNU General Public License, version 2, 1991. 
********************************************/ /* - * $MawkId: rexp0.c,v 1.50 2024/12/14 21:21:20 tom Exp $ + * $MawkId: rexp0.c,v 1.52 2024/12/30 19:17:41 tom Exp $ */ /* lexical scanner */ @@ -276,6 +276,9 @@ RE_lex(MACHINE * mp) case T_LP: switch (prev) { +#ifndef NO_INTERVAL_EXPR + case T_RB: +#endif case T_CHAR: case T_STR: case T_ANY: @@ -288,13 +291,6 @@ RE_lex(MACHINE * mp) case T_U: return prev = T_CAT; -#ifndef NO_INTERVAL_EXPR - case T_RB: - if (!repetitions_flag) { - return prev = T_CAT; - } -#endif - /* FALLTHRU */ default: nest++; @@ -626,6 +622,9 @@ lookup_cclass(char **start) int first = -2; int last = -2; + if (data == NULL) + RE_error_trap(-ERR_3); + for (ch = 0; ch < 256; ++ch) { switch (code) { case CCLASS_NONE: @@ -678,6 +677,8 @@ lookup_cclass(char **start) if (used + 2 >= have) { have *= 2; data = realloc(data, sizeof(CCLASS) * have); + if (data == NULL) + RE_error_trap(-ERR_3); } data[used].first = first; data[used].last = last; @@ -689,6 +690,8 @@ lookup_cclass(char **start) if (used + 2 >= have) { have *= 2; data = realloc(data, sizeof(CCLASS) * have); + if (data == NULL) + RE_error_trap(-ERR_3); } data[used].first = first; data[used].last = last; diff --git a/rexp1.c b/rexp1.c index bc14b74..8f6fe5e 100644 --- a/rexp1.c +++ b/rexp1.c @@ -11,7 +11,7 @@ the GNU General Public License, version 2, 1991. 
********************************************/ /* - * $MawkId: rexp1.c,v 1.28 2024/12/14 12:57:40 tom Exp $ + * $MawkId: rexp1.c,v 1.31 2024/12/30 15:46:23 tom Exp $ */ /* re machine operations */ @@ -113,9 +113,6 @@ RE_cat(MACHINE * mp, MACHINE * np) #endif mp->stop = mp->start + (sz - 1); RE_copy_states(mp->start + sz1, np->start, sz2); -#ifndef NO_INTERVAL_EXPR - mp->start[sz].s_type = M_ACCEPT; /* this is needed in RE_init_it_cnt */ -#endif RE_free(np->start); } @@ -143,6 +140,12 @@ RE_or(MACHINE * mp, MACHINE * np) p->s_data.jump = (int) szn; } +#ifndef NO_INTERVAL_EXPR +#define is_LOOP_TYPE(type) ((type) == M_2JC || (type) == M_LOOP) +#else +#define is_LOOP_TYPE(type) ((type) == M_2JC) +#endif + /* * Ignore attempts to wrap an atom using zero-or-more repetitions in another * loop with the same condition. @@ -162,59 +165,15 @@ RE_or(MACHINE * mp, MACHINE * np) ((ps + 2)->s_type % U_ON) != M_STR && \ ((ps + 2)->s_type % U_ON) != M_U) { \ TRACE((".. expected atom %s\n", REs_type(ps + 2))); \ - } else if (((ps + 3)->s_type % U_ON) != M_2JC) { \ - TRACE((".. expected loop %s\n", REs_type(ps + 3))); \ - } else { \ + } else if (is_LOOP_TYPE((ps + 3)->s_type)) { \ TRACE(("ignore repeated loop\n")); \ + } else { \ + TRACE((".. expected loop %s\n", REs_type(ps + 3))); \ return NULL; \ } \ } \ } -#ifndef NO_INTERVAL_EXPR -/* replace m with m* limited to the max iterations - (variation of m* closure) */ -void -RE_close_limit(MACHINE * mp, Int min_limit, Int max_limit) -{ -#ifdef NO_RI_LOOP_UNROLL - STATE *s; - - TRACE(("RE_close_limit " INT_FMT ".." 
INT_FMT "\n", min_limit, max_limit)); - if ((s = RE_close(mp)) != 0) { - if (s->s_type == M_2JC) { - s->it_min = min_limit; - s->it_max = max_limit; - } - } -#else - RE_close(mp); - RE_set_limit(mp->start, min_limit, max_limit); -#endif -} - -/* replace m with m+ limited to the max iterations - which is one or more, limited - (variation of m+ positive closure) */ -void -RE_poscl_limit(MACHINE * mp, Int min_limit, Int max_limit) -{ -#ifdef NO_RI_LOOP_UNROLL - STATE *s; - TRACE(("RE_poscl_limit " INT_FMT ".." INT_FMT "\n", min_limit, max_limit)); - if ((s = RE_poscl(mp)) != NULL) { - if (s->s_type == M_2JC) { - s->it_min = min_limit; - s->it_max = max_limit; - } - } -#else - RE_poscl(mp); - RE_set_limit(mp->start, min_limit, max_limit); -#endif -} -#endif /* ! NO_INTERVAL_EXPR */ - /* UNARY OPERATIONS */ /* replace m by m* (zero or more) */ @@ -226,7 +185,7 @@ RE_close(MACHINE * mp) size_t sz; /* - * 2JA end + * 2JA end * loop: * SAVE_POS * m @@ -329,6 +288,9 @@ RE_malloc(size_t sz) TRACE(("RE_malloc(%lu) ->%p\n", (unsigned long) sz, p)); if (p == NULL) RE_error_trap(MEMORY_FAILURE); +#if OPT_TRACE /* clear new blocks only when tracing is enabled */ + memset(p, 0, sz); +#endif return p; } diff --git a/rexp2.c b/rexp2.c index 9ab0845..805f236 100644 --- a/rexp2.c +++ b/rexp2.c @@ -12,7 +12,7 @@ the GNU General Public License, version 2, 1991. 
********************************************/ /* - * $MawkId: rexp2.c,v 1.49 2024/12/14 12:57:40 tom Exp $ + * $MawkId: rexp2.c,v 1.50 2024/12/31 15:21:17 tom Exp $ */ /* test a string against a machine */ @@ -91,7 +91,6 @@ RE_new_run_stack(void) RE_run_stack_limit = RE_run_stack_base + newsize; RE_run_stack_empty = RE_run_stack_base - 1; - /* return the new stackp */ return RE_run_stack_base + oldsize; } @@ -111,49 +110,35 @@ RE_new_pos_stack(void) fprintf(stderr, "out of memory for RE string position stack\n"); mawk_exit(100); } +#if OPT_TRACE + memset(RE_pos_stack_base + oldsize, 0, + (newsize - oldsize) * sizeof(RT_POS_ENTRY)); +#endif RE_pos_stack_limit = RE_pos_stack_base + newsize; RE_pos_stack_empty = RE_pos_stack_base; - /* return the new stackp */ return RE_pos_stack_base + oldsize; } -#ifdef DEBUG -static RT_STATE * -slow_push( - RT_STATE * sp, - STATE * m, - char *s, - RT_POS_ENTRY * pos_top, - int u) -{ - if (sp == RE_run_stack_limit) - sp = RE_new_run_stack(); - sp->m = m; - sp->s = s; - sp->u = u; - sp->sp = pos_top - RE_pos_stack_base; - sp->tp = pos_top->prev_offset; - return sp; -} -#endif +#define rt_push(mx,sx,px,ux) do { \ + if (++run_entry == RE_run_stack_limit) \ + run_entry = RE_new_run_stack(); \ + run_entry->m = (mx); \ + run_entry->s = (sx); \ + run_entry->pos_index = (int) ((px) - RE_pos_stack_base); \ + run_entry->top_index = (px)->prev_offset; \ + run_entry->u = (ux); \ + TRACE2((rt_form "rt_push %s pos@%d top@%d\n", rt_args, \ + REs_type(mx), \ + run_entry->pos_index, \ + run_entry->top_index)); \ +} while(0) -#ifdef DEBUG -#define push(mx,sx,px,ux) do { \ - stackp = slow_push(++stackp, mx, sx, px, ux); \ - } while(0) -#else -#define push(mx,sx,px,ux) do { \ - if (++stackp == RE_run_stack_limit) \ - stackp = RE_new_run_stack(); \ - stackp->m = (mx); \ - stackp->s = (sx); \ - stackp->u = (ux); \ - stackp->sp = (int) ((px) - RE_pos_stack_base); \ - stackp->tp = (px)->prev_offset; \ - } while(0) -#endif +#define rt_pop() do { \ + 
TRACE2((rt_form "rt_pop\n", rt_args)); \ + run_entry--; \ +} while (0) #define CASE_UANY(x) case (x)+U_OFF: /* FALLTHRU */ case (x)+U_ON @@ -190,10 +175,11 @@ REtest(char *str, /* string to test */ { register STATE *m = machine; char *s = str; - register RT_STATE *stackp; + const char *old_s; + register RT_STATE *run_entry; int u_flag; char *str_end = str + len; - RT_POS_ENTRY *sp; + RT_POS_ENTRY *pos_entry; int ti; /*convenient temps */ STATE *tm; @@ -201,42 +187,101 @@ REtest(char *str, /* string to test */ /* handle the easy case quickly */ if (m->s_type == M_STR && (m + 1)->s_type == M_ACCEPT) { - return str_str(s, len, m->s_data.str, (size_t) m->s_len) != (char *) 0; + TRACE(("returning str_str\n")); + return str_str(s, len, m->s_data.str, m->s_len) != (char *) 0; } else { u_flag = U_ON; - stackp = RE_run_stack_empty; - sp = RE_pos_stack_empty; - RE_init_it_cnt(m); + run_entry = RE_run_stack_empty; + pos_entry = RE_pos_stack_empty; + if_TRACE(memset(pos_entry, 0, 2 * sizeof(*pos_entry))); RE_CASE(); } refill: - if (stackp == RE_run_stack_empty) { +#ifndef NO_INTERVAL_EXPR + if (run_entry != RE_run_stack_empty) { + STATE *m2; + int found; +#if OPT_TRACE > 1 + RT_STATE *statep; + RT_POS_ENTRY *posp; + + for (statep = RE_run_stack_base; statep <= run_entry; ++statep) { + TRACE(("check - STATE %d: m %03d s \"%s\" pos@%d top@%d u %d\n", + (int) (statep - RE_run_stack_base), + (int) (statep->m - machine), + NonNull(statep->s), + statep->pos_index, + statep->top_index, + statep->u)); + } + for (posp = RE_pos_stack_base; posp <= pos_entry; ++posp) { + TRACE(("check - POS %d: pos \"%s\" owner@%d prev@%d\n", + (int) (posp - RE_pos_stack_base), + NonNull(posp->pos), + posp->owner, + posp->prev_offset)); + } +#endif + /* + * We're here because we had a mismatch in a loop. Find the end of the + * loop, and reset it if the mismatch was due to too-few matches. 
+ * FIXME - provide this info in compile-stage + */ + found = 0; + for (m2 = run_entry->m; m2->s_type < M_ACCEPT; ++m2) { + TRACE(("CHECK %03d %s\n", (int) (m2 - machine), REs_type(m2))); + switch (m2->s_type) { + case M_SAVE_POS: + case M_2JA: + case M_2JB: + case M_2JC: + found = 1; + break; + case M_LOOP: + found = 1; + TRACE2(("Found M_LOOP: %03d\n", (int) (m2 - machine))); + TRACE2(("currently " INT_FMT " [" INT_FMT ".." INT_FMT "]\n", + m2->it_cnt, m2->it_min, m2->it_max)); + if (m2->it_cnt < m2->it_min) { + TRACE2(("too few - invoke M_ENTER\n")); + run_entry->m = m2 + m2->s_data.jump - 1; + } + break; + } + if (found) + break; + } + } +#endif + if (run_entry == RE_run_stack_empty) { + TR_AT("accept failure"); return 0; } - m = stackp->m; - s = stackp->s; - sp = RE_pos_stack_base + stackp->sp; - sp->prev_offset = stackp->tp; - u_flag = (stackp--)->u; + m = run_entry->m; + s = run_entry->s; + pos_entry = RE_pos_stack_base + run_entry->pos_index; + pos_entry->prev_offset = run_entry->top_index; + u_flag = run_entry->u; + rt_pop(); reswitch: - TRACE2(("[%s@%d] %d:%03d %-8s %-15s: %s\n", __FILE__, __LINE__, - (int) (stackp - RE_run_stack_base), - (int) (m - machine), - REs_type(m), - RE_u_end(u_flag), - s)); + TRACE((rt_form "%-8s %-15s: \"%s\"\n", rt_args, + REs_type(m), + RE_u_end(u_flag), + s)); switch (m->s_type + u_flag) { case M_STR + U_OFF + END_OFF: if (s > str_end || (size_t) (str_end - s) < m->s_len || memcmp(s, m->s_data.str, m->s_len)) { + TR_AT("no match"); RE_FILL(); } s += m->s_len; m++; + TR_AT("match"); RE_CASE(); case M_STR + U_OFF + END_ON: @@ -249,10 +294,11 @@ REtest(char *str, /* string to test */ RE_CASE(); case M_STR + U_ON + END_OFF: - if (!(s = str_str(s, (size_t) (str_end - s), m->s_data.str, (size_t) m->s_len))) { + s = str_str(s, (size_t) (str_end - s), m->s_data.str, m->s_len); + if (s == NULL) { RE_FILL(); } - push(m, s + 1, sp, U_ON); + rt_push(m, s + 1, pos_entry, U_ON); s += m->s_len; m++; u_flag = U_OFF; @@ -297,7 +343,7 @@ 
REtest(char *str, /* string to test */ s++; } s++; - push(m, s, sp, U_ON); + rt_push(m, s, pos_entry, U_ON); m++; u_flag = U_OFF; RE_CASE(); @@ -336,7 +382,7 @@ REtest(char *str, /* string to test */ RE_FILL(); } s++; - push(m, s, sp, U_ON); + rt_push(m, s, pos_entry, U_ON); m++; u_flag = U_OFF; RE_CASE(); @@ -392,63 +438,88 @@ REtest(char *str, /* string to test */ RE_CASE(); CASE_UANY(M_SAVE_POS): /* save position for a later M_2JC */ - sp = RE_pos_push(sp, stackp, s); + pos_push(pos_entry, run_entry, s); m++; RE_CASE(); CASE_UANY(M_2JA): /* take the non jump branch */ /* don't stack an ACCEPT */ if ((tm = m + m->s_data.jump)->s_type == M_ACCEPT) { + TR_AT("accept success"); return 1; } - push(tm, s, sp, u_flag); + rt_push(tm, s, pos_entry, u_flag); m++; RE_CASE(); - CASE_UANY(M_2JC): /* take the jump branch if position changed */ #ifndef NO_INTERVAL_EXPR - if (m->it_max < MAX__INT && ++(m->it_cnt) >= m->it_max) { - RE_pos_pop(&sp, stackp); + CASE_UANY(M_ENTER): /* take the jump branch if position changed */ + TRACE(("reset loop " INT_FMT " [" INT_FMT ".." INT_FMT "]\n", + m->it_cnt, m->it_min, m->it_max)); + (m + m->s_data.jump)->it_cnt = 0; + m++; + RE_CASE(); + + CASE_UANY(M_LOOP): /* take the jump branch if position changed */ + m->it_cnt++; + TRACE(("checking loop " INT_FMT " [" INT_FMT ".." INT_FMT "]\n", + m->it_cnt, m->it_min, m->it_max)); + if (m->it_max < MAX__INT && m->it_cnt >= m->it_max) { m++; + TR_AT("past maximum for M_LOOP"); RE_CASE(); /* test the next thing */ - } else + } else if (m->it_cnt < m->it_min) { + m += m->s_data.jump; + TR_AT("under minimum for M_LOOP"); + RE_CASE(); + } + goto fall_through; /* workaround for gcc bug */ + fall_through: + /* FALLTHRU */ #endif /* ! 
NO_INTERVAL_EXPR */ - if (RE_pos_pop(&sp, stackp) == s) { + + CASE_UANY(M_2JC): /* take the jump branch if position changed */ + pos_pop(pos_entry, run_entry, old_s); + if (old_s == s) { /* did not advance: do not jump back */ m++; RE_CASE(); } /* don't stack an ACCEPT */ if ((tm = m + 1)->s_type == M_ACCEPT) { + TR_AT("accept success"); return 1; } - push(tm, s, sp, u_flag); + rt_push(tm, s, pos_entry, u_flag); m += m->s_data.jump; RE_CASE(); CASE_UANY(M_2JB): /* don't stack an ACCEPT */ if ((tm = m + 1)->s_type == M_ACCEPT) { + TR_AT("accept success"); return 1; } - push(tm, s, sp, u_flag); + rt_push(tm, s, pos_entry, u_flag); m += m->s_data.jump; RE_CASE(); CASE_UANY(M_ACCEPT): + TR_AT("accept success"); return 1; default: RE_bad_state("REtest", m, u_flag); } + return 0; } -#undef push +#undef rt_push #include char * -is_string_split(PTR q, size_t * lenp) +is_string_split(PTR q, size_t *lenp) { STATE *p = cast_to_re(q); diff --git a/rexp3.c b/rexp3.c index 0f70cbe..084eef3 100644 --- a/rexp3.c +++ b/rexp3.c @@ -12,54 +12,36 @@ the GNU General Public License, version 2, 1991. 
********************************************/ /* - * $MawkId: rexp3.c,v 1.69 2024/12/11 21:45:11 tom Exp $ + * $MawkId: rexp3.c,v 1.70 2024/12/31 10:20:48 tom Exp $ */ /* match a string against a machine */ #include -#define push(mx,sx,px,ssx,ux) do { \ - if (++stackp == RE_run_stack_limit) \ - stackp = RE_new_run_stack() ;\ - TRACE2(("[%s@%d] pushing %d:%03d\n", __FILE__, __LINE__, \ - (int)(stackp - RE_run_stack_base), \ - (int)(m - machine))); \ - stackp->m = (mx); \ - stackp->s = (sx); \ - stackp->sp = (int) ((px) - RE_pos_stack_base); \ - stackp->tp = (px)->prev_offset; \ - stackp->ss = (ssx); \ - stackp->u = (ux); \ +#define rt_push(mx,sx,px,ssx,ux) do { \ + if (++run_entry == RE_run_stack_limit) \ + run_entry = RE_new_run_stack() ;\ + run_entry->m = (mx); \ + run_entry->s = (sx); \ + run_entry->pos_index = (int) ((px) - RE_pos_stack_base); \ + run_entry->top_index = (px)->prev_offset; \ + run_entry->ss = (ssx); \ + run_entry->u = (ux); \ + TRACE2((rt_form "rt_push %s\n", rt_args, REs_type(mx))); \ } while(0) -#ifdef NO_RI_LOOP_UNROLL -#define restart_count(old,new) \ - if (old != new) { \ - TRACE2(("RESET %p ->%p\n", old, new)); \ - m->it_cnt = 1; \ - } -#else -#define restart_count(old,new) /* nothing */ -#endif - #define CASE_UANY(x) case (x)+U_OFF: /* FALLTHRU */ case (x)+U_ON -#define TR_AT(what) \ - TRACE2(("[%s@%d] %d.%03d %s\n", __FILE__, __LINE__, \ - (int) (stackp - RE_run_stack_base), \ - (int) (m - machine), \ - what)) - #define TR_BEST() \ - TRACE2(("[%s@%d] new best [%d..%d]'%.*s'\n", __FILE__, __LINE__, \ + TRACE2((rt_form "new best [%d..%d] \"%.*s\"\n", rt_args, \ (int) (cb_ss - str), \ (int) (cb_e - str), \ (int) (cb_e - cb_ss), \ cb_ss)) #define TR_STR(s) \ - TRACE(("[%s@%d] str:%i len:%lu\n", __FILE__, __LINE__, \ + TRACE((rt_form "str:%i len:%lu\n", rt_args, \ ((s) ? (int) ((s) - str) : -99), \ (unsigned long) *lenp)) @@ -68,7 +50,7 @@ the GNU General Public License, version 2, 1991. 
*lenp = (size_t) (cb_e - cb_ss); \ } \ TR_STR(s); \ - TRACE2(("[%s@%d] returning %d\n", __FILE__, __LINE__, \ + TRACE2((rt_form "returning %d\n", rt_args, \ cb_ss ? (int)(cb_ss - str) : -1)); \ return cb_ss @@ -85,10 +67,11 @@ REmatch(char *str, /* string to test */ register STATE *m = machine; char *s; char *ss; - register RT_STATE *stackp; + const char *old_s; + RT_STATE *run_entry = NULL; int u_flag; char *str_end; - RT_POS_ENTRY *sp; + RT_POS_ENTRY *pos_entry; char *ts; /* state of current best match stored here */ @@ -102,7 +85,7 @@ REmatch(char *str, /* string to test */ /* check for the easy case */ if (m->s_type == M_STR && (m + 1)->s_type == M_ACCEPT) { - if ((ts = str_str(str, str_len, m->s_data.str, (size_t) m->s_len))) { + if ((ts = str_str(str, str_len, m->s_data.str, m->s_len))) { *lenp = m->s_len; } TR_STR(ts); @@ -113,18 +96,49 @@ REmatch(char *str, /* string to test */ s = str; u_flag = U_ON; cb_e = cb_ss = ss = (char *) 0; - stackp = RE_run_stack_empty; - sp = RE_pos_stack_empty; - RE_init_it_cnt(m); + run_entry = RE_run_stack_empty; + pos_entry = RE_pos_stack_empty; RE_CASE(); refill: - TR_AT(("refill...")); - if (stackp == RE_run_stack_empty) { + TRACE((rt_form "refill... pos@%d\n", rt_args, + (int) (pos_entry - RE_pos_stack_base))); +#ifndef NO_INTERVAL_EXPR + if (0) { +#if OPT_TRACE > 1 + RT_STATE *statep; + RT_POS_ENTRY *posp; + + for (statep = RE_run_stack_base; statep <= run_entry; ++statep) { + TRACE(("%s - STATE %ld: m %03ld s \"%s\" pos@%d top@%d u %d\n", + statep == run_entry ? "CHECK" : "check", + (statep - RE_run_stack_base), + (statep->m - machine), + NonNull(statep->s), + statep->pos_index, + statep->top_index, + statep->u)); + } + for (posp = RE_pos_stack_base; posp <= pos_entry; ++posp) { + TRACE(("%s - POS %ld: pos \"%s\" owner@%d prev@%d\n", + posp == pos_entry ? 
"CHECK" : "check", + (posp - RE_pos_stack_base), + NonNull(posp->pos), + posp->owner, + posp->prev_offset)); + } +#endif + } +#endif + if (run_entry == RE_run_stack_empty) { RE_TURN(); } - ss = stackp->ss; - s = (stackp--)->s; + ss = run_entry->ss; + s = run_entry->s; + rt_pop(); + TRACE((rt_form "run-sp s=\"%s\", ss=\"%s\"\n", rt_args, + NonNull(s), + NonNull(ss))); if (cb_ss) { /* does new state start too late ? */ if (ss) { if (current_best(ss)) { @@ -135,32 +149,33 @@ REmatch(char *str, /* string to test */ } } - m = (stackp + 1)->m; - TR_AT("now"); - sp = RE_pos_stack_base + (stackp + 1)->sp; - sp->prev_offset = (stackp + 1)->tp; - u_flag = (stackp + 1)->u; + TRACE((rt_form "run-sp type %s -> %s\n", rt_args, + REs_type(m), + REs_type((run_entry + 1)->m))); + + m = (run_entry + 1)->m; + pos_entry = RE_pos_stack_base + (run_entry + 1)->pos_index; + pos_entry->prev_offset = (run_entry + 1)->top_index; + u_flag = (run_entry + 1)->u; reswitch: - TRACE(("[%s@%d] %d:%03d %-8s %-15s: %s\n", __FILE__, __LINE__, - (int) (stackp - RE_run_stack_base), - (int) (m - machine), + TRACE((rt_form "%-8s %-15s: \"%s\"\n", rt_args, REs_type(m), RE_u_end(u_flag), cb_ss ? 
cb_ss : s)); switch (m->s_type + u_flag) { case M_STR + U_OFF + END_OFF: - TR_AT("now"); if (s >= str_end || (str_end - s) < (ptrdiff_t) m->s_len) { - TR_AT("now"); + TR_AT("now too far to match"); RE_FILL(); } else if (memcmp(s, m->s_data.str, m->s_len) != 0) { - TR_AT("now"); + TR_AT("now mismatched"); RE_FILL(); } + TR_AT("now matched"); if (!ss) { if (cb_ss && current_best(s)) { - TR_AT("now"); + TR_AT("new match is not better"); RE_FILL(); } else { ss = s; @@ -175,11 +190,12 @@ REmatch(char *str, /* string to test */ TR_AT("now"); if ((str_end - s) != (ptrdiff_t) m->s_len) { RE_FILL(); - } else if (memcmp(s, m->s_data.str, (size_t) m->s_len) != 0) { + } else if (memcmp(s, m->s_data.str, m->s_len) != 0) { RE_FILL(); } if (!ss) { if (cb_ss && current_best(s)) { + TR_AT("new match is not better"); RE_FILL(); } else { ss = s; @@ -197,12 +213,14 @@ REmatch(char *str, /* string to test */ } else if (s < str) { s = str; } - if (!(s = str_str(s, (size_t) (str_end - s), m->s_data.str, (size_t) m->s_len))) { + s = str_str(s, (size_t) (str_end - s), m->s_data.str, m->s_len); + if (s == NULL) { RE_FILL(); } - push(m, s + 1, sp, ss, U_ON); + rt_push(m, s + 1, pos_entry, ss, U_ON); if (!ss) { if (cb_ss && current_best(s)) { + TR_AT("new match is not better"); RE_FILL(); } else { ss = s; @@ -220,8 +238,7 @@ REmatch(char *str, /* string to test */ RE_FILL(); } else if (s < str) { s = str; - } - { + } { ptrdiff_t ti = (str_end - s) - (ptrdiff_t) m->s_len; if (ti < 0 || memcmp(s = s + ti, m->s_data.str, m->s_len) != 0) { RE_FILL(); @@ -261,6 +278,7 @@ REmatch(char *str, /* string to test */ RE_FILL(); } else if (!ss) { if (cb_ss && current_best(s)) { + TR_AT("new match is not better"); RE_FILL(); } else { ss = s; @@ -281,9 +299,10 @@ REmatch(char *str, /* string to test */ break; s++; } - push(m, s + 1, sp, ss, U_ON); + rt_push(m, s + 1, pos_entry, ss, U_ON); if (!ss) { if (cb_ss && current_best(s)) { + TR_AT("new match is not better"); RE_FILL(); } else { ss = s; @@ -304,6 
+323,7 @@ REmatch(char *str, /* string to test */ } else if (!ss) { char *xs = str_end - 1; if (cb_ss && current_best(xs)) { + TR_AT("new match is not better"); RE_FILL(); } else { ss = xs; @@ -320,6 +340,7 @@ REmatch(char *str, /* string to test */ RE_FILL(); } else if (!ss) { if (cb_ss && current_best(s)) { + TR_AT("new match is not better"); RE_FILL(); } else { ss = s; @@ -334,6 +355,7 @@ REmatch(char *str, /* string to test */ RE_FILL(); } else if (!ss) { if (cb_ss && current_best(s)) { + TR_AT("new match is not better"); RE_FILL(); } else { ss = s; @@ -349,9 +371,10 @@ REmatch(char *str, /* string to test */ if (s >= str_end) { RE_FILL(); } - push(m, s + 1, sp, ss, U_ON); + rt_push(m, s + 1, pos_entry, ss, U_ON); if (!ss) { if (cb_ss && current_best(s)) { + TR_AT("new match is not better"); RE_FILL(); } else { ss = s; @@ -371,6 +394,7 @@ REmatch(char *str, /* string to test */ s = str_end - 1; if (!ss) { if (cb_ss && current_best(s)) { + TR_AT("new match is not better"); RE_FILL(); } else { ss = s; @@ -408,6 +432,7 @@ REmatch(char *str, /* string to test */ RE_FILL(); } else if (!ss) { if (cb_ss) { + TR_AT("new match is not better"); RE_FILL(); } else { ss = str_end; @@ -424,6 +449,7 @@ REmatch(char *str, /* string to test */ } } else if (!ss) { if (cb_ss) { + TR_AT("new match is not better"); RE_FILL(); } else ss = str_end; @@ -438,6 +464,7 @@ REmatch(char *str, /* string to test */ s = str; if (!ss) { if (cb_ss && current_best(s)) { + TR_AT("new match is not better"); RE_FILL(); } else { ss = s; @@ -453,66 +480,56 @@ REmatch(char *str, /* string to test */ CASE_UANY(M_SAVE_POS): /* save position for a later M_2JC */ /* see also REtest */ - sp = RE_pos_push(sp, stackp, s); + pos_push(pos_entry, run_entry, s); m++; RE_CASE(); CASE_UANY(M_2JA): /* take the non jump branch */ - push(m + m->s_data.jump, s, sp, ss, u_flag); + rt_push(m + m->s_data.jump, s, pos_entry, ss, u_flag); m++; RE_CASE(); CASE_UANY(M_2JB): /* take the non jump branch */ - push(m + 
m->s_data.jump, s, sp, ss, u_flag); + rt_push(m + m->s_data.jump, s, pos_entry, ss, u_flag); m++; RE_CASE(); - CASE_UANY(M_2JC): /* take the jump branch if position changed */ - /* see REtest */ #ifndef NO_INTERVAL_EXPR -#ifdef NO_RI_LOOP_UNROLL - m->it_cnt++; - TRACE(("checking loop " INT_FMT " [" INT_FMT ".." INT_FMT "]\n", - m->it_cnt, m->it_min, m->it_max)); - TR_STR(s); - if (m->it_cnt < m->it_min) { - /* keep looping until minimum is met */ - RE_pos_pop(&sp, stackp); - push(m + 1, s, sp, ss, u_flag); - m += m->s_data.jump; - TR_AT("now"); - } else if ((m->it_cnt >= m->it_min) - && (m->it_max == MAX__INT - || (m->it_max < MAX__INT && m->it_cnt >= m->it_max))) { - /* quit looping once maximum is met */ - RE_pos_pop(&sp, stackp); - m++; - TR_AT("now"); - } else -#else /* !NO_RI_LOOP_UNROLL */ - if (m->it_max < MAX__INT && ++(m->it_cnt) >= m->it_max) { - ++m; - RE_CASE(); /* test the next thing */ - } else -#endif /* NO_RI_LOOP_UNROLL */ - if (RE_pos_pop(&sp, stackp) == s) { - /* fall out of loop, to next instruction */ - m++; - TR_AT("now"); - } else { - /* continue looping as long as matching */ - push(m + 1, s, sp, ss, u_flag); - m += m->s_data.jump; - TR_AT("now"); - } + CASE_UANY(M_ENTER): /* take the jump branch if position changed */ + (m + m->s_data.jump)->it_cnt = 0; + m++; RE_CASE(); -#else - if (RE_pos_pop(&sp, stackp) == s) { - m++; + + CASE_UANY(M_LOOP): /* take the jump branch if position changed */ + m->it_cnt++; + TRACE(("checking #%d: loop " INT_FMT " [" INT_FMT ".." INT_FMT "]\n", + (int) (pos_entry - RE_pos_stack_base), + m->it_cnt, m->it_min, m->it_max)); + if (m->it_max < MAX__INT && m->it_cnt >= m->it_max) { + ++m; + TR_AT("now test the next thing"); + RE_CASE(); /* test the next thing */ + } else if (m->it_cnt < m->it_min) { + TR_AT("now continue getting minimum"); + m += m->s_data.jump; RE_CASE(); } + goto fall_through; /* workaround for gcc bug */ + fall_through: /* FALLTHRU */ -#endif /* ! 
NO_INTERVAL_EXPR */ +#endif + + CASE_UANY(M_2JC): /* take the jump branch if position changed */ + pos_pop(pos_entry, run_entry, old_s); + if (old_s == s) { + m++; + TR_AT("now fall out of loop"); + } else { + rt_push(m + 1, s, pos_entry, ss, u_flag); + m += m->s_data.jump; + TR_AT("now continue loop to match"); + } + RE_CASE(); case M_ACCEPT + U_OFF: if (s >= str_end) { @@ -522,7 +539,6 @@ REmatch(char *str, /* string to test */ ss = s; if (!cb_ss || ss < cb_ss || (ss == cb_ss && s > cb_e)) { /* we have a new current best */ - restart_count(cb_ss, ss); cb_ss = ss; cb_e = s; TR_BEST(); @@ -544,7 +560,6 @@ REmatch(char *str, /* string to test */ } if (!cb_ss || ss < cb_ss || (ss == cb_ss && s > cb_e)) { /* we have a new current best */ - restart_count(cb_ss, ss); cb_ss = ss; cb_e = s; TR_BEST(); @@ -554,5 +569,6 @@ REmatch(char *str, /* string to test */ default: RE_bad_state("REmatch", m, u_flag); } + return NULL; } -#undef push +#undef rt_push diff --git a/rexpdb.c b/rexpdb.c index 6bcf6a6..368cd76 100644 --- a/rexpdb.c +++ b/rexpdb.c @@ -11,7 +11,7 @@ the GNU General Public License, version 2, 1991. 
********************************************/ /* - * $MawkId: rexpdb.c,v 1.30 2024/08/25 17:16:24 tom Exp $ + * $MawkId: rexpdb.c,v 1.31 2024/12/30 15:35:57 tom Exp $ */ #include @@ -33,6 +33,10 @@ static const char xlat[][12] = "M_2JB", "M_SAVE_POS", "M_2JC", +#ifndef NO_INTERVAL_EXPR + "M_ENTER", + "M_LOOP", +#endif "M_ACCEPT" }; @@ -78,20 +82,24 @@ REmprint(STATE * m, FILE *f) break; case M_2JC: fprintf(f, "\t%03d", line + p->s_data.jump); -#ifndef NO_INTERVAL_EXPR - if (p->it_min != 1 || p->it_max != MAX__INT) { - fprintf(f, " %c", L_CURL); - if (p->it_min != 0) - fprintf(f, INT_FMT, p->it_min); - if (p->it_max != p->it_min) { - fprintf(f, ","); - if (p->it_max != MAX__INT) - fprintf(f, INT_FMT, p->it_max); - } - fprintf(f, "%c", R_CURL); - } -#endif break; +#ifndef NO_INTERVAL_EXPR + case M_ENTER: + fprintf(f, "\t%03d", line + p->s_data.jump); + break; + case M_LOOP: + fprintf(f, "\t%03d", line + p->s_data.jump); + fprintf(f, " %c", L_CURL); + if (p->it_min != 0) + fprintf(f, INT_FMT, p->it_min); + if (p->it_max != p->it_min) { + fprintf(f, ","); + if (p->it_max != MAX__INT) + fprintf(f, INT_FMT, p->it_max); + } + fprintf(f, "%c", R_CURL); + break; +#endif case M_CLASS: { UChar *q = (UChar *) p->s_data.bvp; diff --git a/trace.c b/trace.c index 76c19e4..a36d84d 100644 --- a/trace.c +++ b/trace.c @@ -10,7 +10,7 @@ the GNU General Public License, version 2, 1991. ********************************************/ /* - * $MawkId: trace.c,v 1.26 2024/12/14 21:21:34 tom Exp $ + * $MawkId: trace.c,v 1.27 2024/12/25 01:43:52 tom Exp $ */ #define Visible_CELL @@ -110,7 +110,7 @@ void TraceInst(INST * p, INST * base) { INST *q = da_this(p, base, trace_fp); - TRACE((" ...%ld\n", (long) (q - p))); + TRACE(("\t...%ld\n", (long) (q - p))); if (p++ != q) { switch ((MAWK_OPCODES) (base->op)) { case AE_PUSHA: