snapshot of project "mawk", label t20201023

This commit is contained in:
Thomas E. Dickey 2020-10-23 23:32:39 +00:00
parent 7b6215cc93
commit ccf03f93ad
10 changed files with 168 additions and 34 deletions

View File

@ -1,4 +1,11 @@
-- $MawkId: CHANGES,v 1.318 2020/10/16 22:29:45 tom Exp $
-- $MawkId: CHANGES,v 1.322 2020/10/22 22:23:40 tom Exp $
20201023
+ start work on experimental approach to interval-expressions which
does not involve loop-unrolling.
+ improve type-checking for builtin regexes by using PTR only for the
external regexes.
+ improve dump format, showing intervals in curly-brace format.
20201016
+ improve dump format, showing the jump-targets rather than offsets.

View File

@ -1,4 +1,4 @@
MANIFEST for mawk, version t20201016
MANIFEST for mawk, version t20201023
--------------------------------------------------------------------------------
MANIFEST this file
ACKNOWLEDGMENT acknowledgements

View File

@ -1,3 +1,9 @@
mawk-cur (1.3.4-20201023) unstable; urgency=low
* maintenance updates
-- Thomas E. Dickey <dickey@invisible-island.net> Sat, 17 Oct 2020 04:19:38 -0400
mawk-cur (1.3.4-20201016) unstable; urgency=low
* maintenance updates

View File

@ -1,8 +1,8 @@
Summary: mawk - pattern scanning and text processing language
%define AppProgram mawk
%define AppVersion 1.3.4
%define AppRelease 20201016
# $MawkId: mawk.spec,v 1.89 2020/10/16 22:27:38 tom Exp $
%define AppRelease 20201023
# $MawkId: mawk.spec,v 1.90 2020/10/17 08:19:38 tom Exp $
Name: %{AppProgram}
Version: %{AppVersion}
Release: %{AppRelease}

View File

@ -11,9 +11,9 @@ the GNU General Public License, version 2, 1991.
*/
/*
* $MawkId: patchlev.h,v 1.115 2020/10/16 22:33:26 tom Exp $
* $MawkId: patchlev.h,v 1.116 2020/10/17 08:19:38 tom Exp $
*/
#define PATCH_BASE 1
#define PATCH_LEVEL 3
#define PATCH_STRING ".4"
#define DATE_STRING "20201016"
#define DATE_STRING "20201023"

14
rexp.c
View File

@ -11,7 +11,7 @@ the GNU General Public License, version 2, 1991.
********************************************/
/*
* $MawkId: rexp.c,v 1.32 2020/10/16 22:45:57 tom Exp $
* $MawkId: rexp.c,v 1.34 2020/10/23 08:07:50 tom Exp $
*/
/* op precedence parser for regular expressions */
@ -217,12 +217,18 @@ REcompile(char *re, size_t len)
TRACE(("RE_lex token %s\n", token_name(T_Q)));
break;
default:
RE_close_limit(m_ptr, intrvalmax);
RE_close_limit(m_ptr, intrvalmin, intrvalmax);
TRACE(("RE_lex token %s\n", token_name(T_Q)));
}
} else if (intrvalmin == 1) { /* one or more */
RE_poscl_limit(m_ptr, intrvalmax);
#ifdef NO_RI_LOOP_UNROLL
} else if (intrvalmin >= 1) { /* one or more */
RE_poscl_limit(m_ptr, intrvalmin, intrvalmax);
TRACE(("RE_lex token %s\n", token_name(T_PLUS)));
#else
} else if (intrvalmin == 1) { /* one or more */
RE_poscl_limit(m_ptr, intrvalmin, intrvalmax);
TRACE(("RE_lex token %s\n", token_name(T_PLUS)));
#endif
} else { /* n or more */
register int i;
/* copy 2 copies of m_ptr, use 2nd copy to replace

23
rexp.h
View File

@ -12,7 +12,7 @@ the GNU General Public License, version 2, 1991.
********************************************/
/*
* $MawkId: rexp.h,v 1.37 2020/10/16 22:27:03 tom Exp $
* $MawkId: rexp.h,v 1.38 2020/10/22 22:57:48 tom Exp $
*/
#ifndef REXP_H
@ -68,6 +68,7 @@ typedef struct {
int jump;
} s_data;
#ifndef NO_INTERVAL_EXPR
Int it_min; /* used for s_type == M_2JC */
Int it_max; /* used for s_type == M_2JC */
Int it_cnt;
#endif
@ -161,14 +162,14 @@ extern MACHINE RE_str(char *, size_t);
extern MACHINE RE_class(BV *);
extern void RE_cat(MACHINE *, MACHINE *);
extern void RE_or(MACHINE *, MACHINE *);
extern void RE_close(MACHINE *);
extern void RE_poscl(MACHINE *);
extern STATE *RE_close(MACHINE *);
extern STATE *RE_poscl(MACHINE *);
extern void RE_01(MACHINE *);
extern void RE_panic(const char *, ...) GCC_NORETURN GCC_PRINTFLIKE(1,2);
#ifndef NO_INTERVAL_EXPR
extern void RE_close_limit(MACHINE *, Int);
extern void RE_poscl_limit(MACHINE *, Int);
extern void RE_close_limit(MACHINE *, Int, Int);
extern void RE_poscl_limit(MACHINE *, Int, Int);
extern void duplicate_m(MACHINE *, MACHINE *);
#endif
@ -245,16 +246,22 @@ RE_init_it_cnt(STATE * s)
#endif
#ifndef NO_INTERVAL_EXPR
#undef NO_RI_LOOP_UNROLL /* experimental 2020/10/22 -TD */
#ifdef NO_RI_LOOP_UNROLL
#else
static void
RE_set_limit(STATE * s, Int limit)
RE_set_limit(STATE * s, Int minlimit, Int maxlimit)
{
STATE *p = s;
while (p->s_type < M_ACCEPT) {
if (p->s_type == M_2JC)
p->it_max = limit;
if (p->s_type == M_2JC) {
p->it_min = minlimit;
p->it_max = maxlimit;
}
p++;
}
}
#endif /* ! NO_RI_LOOP_UNROLL */
#endif /* ! NO_INTERVAL_EXPR */
#endif /* LOCAL_REGEXP */

48
rexp1.c
View File

@ -11,7 +11,7 @@ the GNU General Public License, version 2, 1991.
********************************************/
/*
* $MawkId: rexp1.c,v 1.20 2020/07/30 22:40:05 tom Exp $
* $MawkId: rexp1.c,v 1.21 2020/10/23 00:28:14 tom Exp $
*/
/* re machine operations */
@ -29,6 +29,7 @@ new_TWO(
mp->start->s_type = (SType) type;
mp->stop->s_type = M_ACCEPT;
#ifndef NO_INTERVAL_EXPR
mp->start->it_min = 1;
mp->start->it_max = MAX__INT;
mp->start->it_cnt = 0;
#endif
@ -179,20 +180,43 @@ ignore_star_star(MACHINE * mp)
/* replace m with m* limited to the max iterations
(variation of m* closure) */
void
RE_close_limit(MACHINE * mp, Int ilimit)
RE_close_limit(MACHINE * mp, Int min_limit, Int max_limit)
{
#ifdef NO_RI_LOOP_UNROLL
STATE *s;
TRACE(("RE_close_limit " INT_FMT ".." INT_FMT "\n", min_limit, max_limit));
if ((s = RE_close(mp)) != 0) {
if (s->s_type == M_2JC) {
s->it_min = min_limit;
s->it_max = max_limit;
}
}
#else
RE_close(mp);
RE_set_limit(mp->start, ilimit);
RE_set_limit(mp->start, min_limit, max_limit);
#endif
}
/* replace m with m+ (one or more), limited to the max iterations
(variation of m+ positive closure) */
void
RE_poscl_limit(MACHINE * mp, Int ilimit)
RE_poscl_limit(MACHINE * mp, Int min_limit, Int max_limit)
{
#ifdef NO_RI_LOOP_UNROLL
STATE *s;
TRACE(("RE_poscl_limit " INT_FMT ".." INT_FMT "\n", min_limit, max_limit));
if ((s = RE_poscl(mp)) != NULL) {
if (s->s_type == M_2JC) {
s->it_min = min_limit;
s->it_max = max_limit;
}
}
#else
RE_poscl(mp);
RE_set_limit(mp->start, ilimit);
RE_set_limit(mp->start, min_limit, max_limit);
#endif
}
#endif /* ! NO_INTERVAL_EXPR */
@ -200,14 +224,14 @@ RE_poscl_limit(MACHINE * mp, Int ilimit)
/* replace m by m* (zero or more) */
void
STATE *
RE_close(MACHINE * mp)
{
register STATE *p;
size_t sz;
if (ignore_star_star(mp))
return;
return NULL;
/*
* 2JA end
* loop:
@ -228,22 +252,25 @@ RE_close(MACHINE * mp)
(++p)->s_type = M_SAVE_POS;
(p += sz)->s_type = M_2JC;
#ifndef NO_INTERVAL_EXPR
p->it_min = 1;
p->it_max = MAX__INT;
#endif
p->s_data.jump = -(int) sz;
(p + 1)->s_type = M_ACCEPT;
return p;
}
/* replace m by m+ (positive closure - one or more) */
void
STATE *
RE_poscl(MACHINE * mp)
{
register STATE *p;
size_t sz;
if (ignore_star_star(mp))
return;
return NULL;
/*
* loop:
* SAVE_POS
@ -261,10 +288,13 @@ RE_poscl(MACHINE * mp)
p += sz - 1;
p->s_type = M_2JC;
#ifndef NO_INTERVAL_EXPR
p->it_min = 1;
p->it_max = MAX__INT;
#endif
p->s_data.jump = -((int) sz);
(p + 1)->s_type = M_ACCEPT;
return p;
}
/* replace m by m? (zero or one) */

77
rexp3.c
View File

@ -12,7 +12,7 @@ the GNU General Public License, version 2, 1991.
********************************************/
/*
* $MawkId: rexp3.c,v 1.47 2020/10/16 23:31:12 tom Exp $
* $MawkId: rexp3.c,v 1.48 2020/10/22 22:45:12 tom Exp $
*/
/* match a string against a machine */
@ -22,6 +22,7 @@ the GNU General Public License, version 2, 1991.
#define push(mx,sx,px,ssx,ux) do { \
if (++stackp == RE_run_stack_limit) \
stackp = RE_new_run_stack() ;\
TRACE2(("@%d, pushing %d:%03d\n", __LINE__, (int)(stackp - RE_run_stack_base), (int)(m - machine))); \
stackp->m = (mx); \
stackp->s = (sx); \
stackp->sp = (int) ((px) - RE_pos_stack_base); \
@ -30,6 +31,16 @@ the GNU General Public License, version 2, 1991.
stackp->u = (ux); \
} while(0)
/*
 * restart_count(old,new) - conditionally restart the interval counter.
 *
 * With the experimental NO_RI_LOOP_UNROLL configuration, when the two
 * pointers differ the iteration count of the current state "m" (taken from
 * the caller's scope) is set back to 1; it is invoked where a new
 * current-best match is recorded.  Without that configuration the macro
 * does nothing.
 *
 * Both variants expand to exactly one statement, and the arguments are
 * parenthesized, so the macro can be used safely as the body of an
 * unbraced if/else and with arbitrary expressions as arguments.
 */
#ifdef NO_RI_LOOP_UNROLL
#define restart_count(old,new) do { \
	if ((old) != (new)) { \
	    TRACE2(("RESET %p ->%p\n", (old), (new))); \
	    m->it_cnt = 1; \
	} \
    } while (0)
#else
#define restart_count(old,new) do { } while (0)
#endif
#define CASE_UANY(x) case (x)+U_OFF: /* FALLTHRU */ case (x)+U_ON
#define TR_STR(s) TRACE((" str:%i len:%lu\n", ((s) ? (int) ((s) - str) : -99), (unsigned long) *lenp))
@ -38,6 +49,7 @@ the GNU General Public License, version 2, 1991.
*lenp = (size_t) (cb_e - cb_ss); \
} \
TR_STR(s); \
TRACE2(("returning @%d: %d\n", __LINE__, cb_ss ? (int)(cb_ss - str) : -1)); \
return cb_ss
#if OPT_TRACE
@ -85,6 +97,7 @@ REmatch(char *str, /* string to test */
/* check for the easy case */
if (m->s_type == M_STR && (m + 1)->s_type == M_ACCEPT) {
TRACE2(("@%d, now %03d\n", __LINE__, (int) (m - machine)));
if ((ts = str_str(s, str_len, m->s_data.str, (size_t) m->s_len))) {
*lenp = m->s_len;
}
@ -100,6 +113,7 @@ REmatch(char *str, /* string to test */
RE_CASE();
refill:
TRACE2(("@%d, refill machine %03d\n", __LINE__, (int) (m - machine)));
if (stackp == RE_run_stack_empty) {
RE_TURN();
}
@ -116,23 +130,28 @@ REmatch(char *str, /* string to test */
}
m = (stackp + 1)->m;
TRACE2(("@%d, now %03d\n", __LINE__, (int) (m - machine)));
sp = RE_pos_stack_base + (stackp + 1)->sp;
sp->prev_offset = (stackp + 1)->tp;
u_flag = (stackp + 1)->u;
reswitch:
TRACE(("[%s@%d] %03d %-8s %-15s: %s\n", __FILE__, __LINE__,
TRACE(("[%s@%d] %d:%03d %-8s %-15s: %s\n", __FILE__, __LINE__,
(int) (stackp - RE_run_stack_base),
(int) (m - machine),
REs_type(m),
RE_u_end(u_flag),
cb_ss ? cb_ss : s));
switch (m->s_type + u_flag) {
case M_STR + U_OFF + END_OFF:
TRACE2(("@%d, now %03d\n", __LINE__, (int) (m - machine)));
if (strncmp(s, m->s_data.str, (size_t) m->s_len)) {
TRACE2(("@%d, now %03d\n", __LINE__, (int) (m - machine)));
RE_FILL();
}
if (!ss) {
if (cb_ss && s > cb_ss) {
TRACE2(("@%d, now %03d\n", __LINE__, (int) (m - machine)));
RE_FILL();
} else {
ss = s;
@ -140,9 +159,11 @@ REmatch(char *str, /* string to test */
}
s += m->s_len;
m++;
TRACE2(("@%d, next %03d\n", __LINE__, (int) (m - machine)));
RE_CASE();
case M_STR + U_OFF + END_ON:
TRACE2(("@%d, now %03d\n", __LINE__, (int) (m - machine)));
if (strcmp(s, m->s_data.str)) {
RE_FILL();
}
@ -155,9 +176,11 @@ REmatch(char *str, /* string to test */
}
s += m->s_len;
m++;
TRACE2(("@%d, next %03d\n", __LINE__, (int) (m - machine)));
RE_CASE();
case M_STR + U_ON + END_OFF:
TRACE2(("@%d, now %03d\n", __LINE__, (int) (m - machine)));
if (s >= str_end) {
RE_FILL();
}
@ -178,9 +201,11 @@ REmatch(char *str, /* string to test */
s += m->s_len;
m++;
u_flag = U_OFF;
TRACE2(("@%d, next %03d\n", __LINE__, (int) (m - machine)));
RE_CASE();
case M_STR + U_ON + END_ON:
TRACE2(("@%d, now %03d\n", __LINE__, (int) (m - machine)));
t = (int) ((SLen) (str_end - s) - m->s_len);
if (t < 0 || memcmp(ts = s + t, m->s_data.str, (size_t) m->s_len)) {
RE_FILL();
@ -195,6 +220,7 @@ REmatch(char *str, /* string to test */
s = str_end;
m++;
u_flag = U_OFF;
TRACE2(("@%d, next %03d\n", __LINE__, (int) (m - machine)));
RE_CASE();
case M_CLASS + U_OFF + END_OFF:
@ -417,17 +443,48 @@ REmatch(char *str, /* string to test */
case (M_2JC) + U_ON:
/* see REtest */
#ifndef NO_INTERVAL_EXPR
if (m->it_max < MAX__INT && ++(m->it_cnt) >= m->it_max) {
#ifdef NO_RI_LOOP_UNROLL
m->it_cnt++;
TRACE(("checking loop " INT_FMT " [" INT_FMT ".." INT_FMT "]\n",
m->it_cnt, m->it_min, m->it_max));
TR_STR(s);
if (m->it_cnt < m->it_min) {
/* keep looping until minimum is met */
RE_pos_pop(&sp, stackp);
push(m + 1, s, sp, ss, u_flag);
m += m->s_data.jump;
TRACE2(("TEST @%d: %03d\n", __LINE__, (int) (m - machine)));
} else if ((m->it_cnt >= m->it_min)
&& (m->it_max == MAX__INT
|| (m->it_max < MAX__INT && m->it_cnt >= m->it_max))) {
/* quit looping once maximum is met */
RE_pos_pop(&sp, stackp);
m++;
TRACE2(("TEST @%d: %03d\n", __LINE__, (int) (m - machine)));
} else
#else /* !NO_RI_LOOP_UNROLL */
if (m->it_max < MAX__INT && ++(m->it_cnt) >= m->it_max) {
RE_CASE(); /* test the next thing */
} else
#endif /* ! NO_INTERVAL_EXPR */
#endif /* NO_RI_LOOP_UNROLL */
if (RE_pos_pop(&sp, stackp) == s) {
/* fall out of loop, to next instruction */
m++;
TRACE2(("TEST @%d: %03d\n", __LINE__, (int) (m - machine)));
} else {
/* continue looping as long as matching */
push(m + 1, s, sp, ss, u_flag);
m += m->s_data.jump;
TRACE2(("TEST @%d: %03d\n", __LINE__, (int) (m - machine)));
}
RE_CASE();
#else
if (RE_pos_pop(&sp, stackp) == s) {
m++;
RE_CASE();
}
/* FALLTHRU */
#endif /* ! NO_INTERVAL_EXPR */
case (M_2JB) + U_OFF: /* take the jump branch */
/* FALLTHRU */
case (M_2JB) + U_ON:
@ -440,8 +497,14 @@ REmatch(char *str, /* string to test */
ss = s;
if (!cb_ss || ss < cb_ss || (ss == cb_ss && s > cb_e)) {
/* we have a new current best */
restart_count(cb_ss, ss);
cb_ss = ss;
cb_e = s;
TRACE2(("@%d, new best [%d..%d]'%.*s'\n", __LINE__,
(int) (cb_ss - str),
(int) (cb_e - str),
(int) (cb_e - cb_ss),
cb_ss));
} else if (ss == cb_ss && s == cb_e) {
RE_TURN();
}
@ -456,8 +519,14 @@ REmatch(char *str, /* string to test */
if (!cb_ss || ss < cb_ss || (ss == cb_ss && s > cb_e)) {
/* we have a new current best */
restart_count(cb_ss, ss);
cb_ss = ss;
cb_e = s;
TRACE2(("@%d, new best [%d..%d]'%.*s'\n", __LINE__,
(int) (cb_ss - str),
(int) (cb_e - str),
(int) (cb_e - cb_ss),
cb_ss));
} else if (ss == cb_ss && s == cb_e) {
RE_TURN();
}

View File

@ -11,7 +11,7 @@ the GNU General Public License, version 2, 1991.
********************************************/
/*
* $MawkId: rexpdb.c,v 1.21 2020/10/16 22:43:52 tom Exp $
* $MawkId: rexpdb.c,v 1.23 2020/10/23 23:32:39 tom Exp $
*/
#include "rexp.h"
@ -78,8 +78,17 @@ REmprint(STATE * m, FILE *f)
case M_2JC:
fprintf(f, "\t%03d", line + p->s_data.jump);
#ifndef NO_INTERVAL_EXPR
if (p->it_max != MAX__INT)
fprintf(f, "," INT_FMT, p->it_max);
if (p->it_min != 1 || p->it_max != MAX__INT) {
fprintf(f, " %c", L_CURL);
if (p->it_min != 0)
fprintf(f, INT_FMT, p->it_min);
if (p->it_max != p->it_min) {
fprintf(f, ",");
if (p->it_max != MAX__INT)
fprintf(f, INT_FMT, p->it_max);
}
fprintf(f, "%c", R_CURL);
}
#endif
break;
case M_CLASS: