mawk/rexp.c
2025-01-01 11:42:30 +00:00

530 lines
13 KiB
C

/********************************************
rexp.c
copyright 2008-2023,2024, Thomas E. Dickey
copyright 1991-1993,1996, Michael D. Brennan
This is a source file for mawk, an implementation of
the AWK programming language.
Mawk is distributed without warranty under the terms of
the GNU General Public License, version 2, 1991.
********************************************/
/*
* $MawkId: rexp.c,v 1.56 2024/12/31 12:56:54 tom Exp $
*/
/* op precedence parser for regular expressions */
#include <rexp.h>
#include <regexp.h>
#ifndef FIXME_INTERVAL_LIMITS
#define FIXME_INTERVAL_LIMITS 0 /* =1 for pre-bugfix */
#endif
/* DATA */
int REerrno;
const char *const REerrlist[] =
{(char *) 0,
/* ERR_1 */ "missing '('",
/* ERR_2 */ "missing ')'",
/* ERR_3 */ "bad class -- [], [^] or [",
/* ERR_4 */ "missing operand",
/* ERR_5 */ "resource exhaustion -- regular expression too large",
/* ERR_6 */ "syntax error ^* or ^+",
/* ERR_7 */ "bad interval expression",
/* ERR_8 */ ""
};
/* ERR_5 is very unlikely to occur */
/* This table drives the operator precedence parser */
/* *INDENT-OFF* */
#ifdef NO_INTERVAL_EXPR
static short table[8][8] = {
/* 0 | CAT * + ? ( ) */
/* 0 */ {0, OP_L, OP_L, OP_L, OP_L, OP_L, OP_L, ERR_1},
/* | */ {OP_G, OP_G, OP_L, OP_L, OP_L, OP_L, OP_L, OP_G},
/* CAT*/ {OP_G, OP_G, OP_G, OP_L, OP_L, OP_L, OP_L, OP_G},
/* * */ {OP_G, OP_G, OP_G, OP_G, OP_G, OP_G, ERR_7, OP_G},
/* + */ {OP_G, OP_G, OP_G, OP_G, OP_G, OP_G, ERR_7, OP_G},
/* ? */ {OP_G, OP_G, OP_G, OP_G, OP_G, OP_G, ERR_7, OP_G},
/* ( */ {ERR_2, OP_L, OP_L, OP_L, OP_L, OP_L, OP_L, OP_EQ},
/* ) */ {OP_G , OP_G, OP_G, OP_G, OP_G, OP_G, ERR_7, OP_G}};
#else
static short table[10][10] = {
/* 0 | CAT * + ? ( ) { } */
/* 0 */ {0, OP_L, OP_L, OP_L, OP_L, OP_L, OP_L, ERR_1, ERR_7, OP_L},
/* | */ {OP_G, OP_G, OP_L, OP_L, OP_L, OP_L, OP_L, OP_G, OP_G, OP_G},
/* CAT*/ {OP_G, OP_G, OP_G, OP_L, OP_L, OP_L, OP_L, OP_G, OP_L, OP_G},
/* * */ {OP_G, OP_G, OP_G, OP_G, OP_G, OP_G, ERR_7, OP_G, OP_G, OP_G},
/* + */ {OP_G, OP_G, OP_G, OP_G, OP_G, OP_G, ERR_7, OP_G, OP_G, OP_G},
/* ? */ {OP_G, OP_G, OP_G, OP_G, OP_G, OP_G, ERR_7, OP_G, OP_G, OP_G},
/* ( */ {ERR_2, OP_L, OP_L, OP_L, OP_L, OP_L, OP_L, OP_EQ, OP_G, OP_G},
/* ) */ {OP_G , OP_G, OP_G, OP_G, OP_G, OP_G, ERR_7, OP_G, ERR_7, OP_G},
/* { */ {OP_G, OP_G, OP_G, OP_G, OP_G, OP_G, ERR_7, OP_G, OP_G, OP_EQ},
/* } */ {OP_G , OP_G, OP_G, OP_G, OP_G, OP_G, ERR_7, OP_G, ERR_7, OP_G} };
#endif
/* *INDENT-ON* */
#define STACKSZ 64
static const char *REs_type(STATE * p);
static jmp_buf err_buf; /* used to trap on error */
#if OPT_TRACE > 0
static const char *
token_name(int token)
{
const char *result;
#define CASE(name) case name: result = #name; break
switch (token) {
CASE(T_NONE);
CASE(T_OR);
CASE(T_CAT);
CASE(T_STAR);
CASE(T_PLUS);
CASE(T_Q);
CASE(T_LP);
CASE(T_RP);
CASE(T_START);
CASE(T_END);
CASE(T_ANY);
CASE(T_CLASS);
CASE(T_SLASH);
CASE(T_CHAR);
CASE(T_STR);
#ifndef NO_INTERVAL_EXPR
CASE(T_LB);
CASE(T_RB);
#endif
CASE(T_U);
default:
result = "?";
break;
}
#undef CASE
return result;
}
#endif
void
RE_error_trap(int x)
{
TRACE(("RE_error_trap(%d)\n", x));
REerrno = x;
longjmp(err_buf, 1);
}
typedef struct {
int token;
int prec;
} OPS;
#ifndef NO_INTERVAL_EXPR
/* duplicate a machine, oldmp into newmp */
static void
duplicate_m(MACHINE * newmp, MACHINE * oldmp)
{
register STATE *p;
TRACE(("duplicate_m %p -> %p\n", (void *) oldmp, (void *) newmp));
TRACE(("...start %p\n", (void *) oldmp->start));
TRACE(("...stop %p\n", (void *) oldmp->stop));
p = (STATE *) RE_malloc(2 * STATESZ);
RE_copy_states(p, oldmp->start, 2);
newmp->start = (STATE *) p;
newmp->stop = (STATE *) (p + 1);
}
static void
RE_set_limit(MACHINE * mp, Int minlimit, Int maxlimit)
{
STATE *p = mp->start;
STATE *q = NULL;
if (p->s_type == M_2JA)
++p;
if (p->s_type == M_SAVE_POS) {
int depth = 0;
STATE *r = p;
do {
switch (r->s_type) {
case M_SAVE_POS:
depth++;
break;
case M_2JC:
case M_LOOP:
if (--depth == 0) {
q = r;
}
break;
case M_ACCEPT:
depth = -1;
break;
}
++r;
} while (depth > 0);
}
if (q != NULL) {
size_t len = (size_t) (mp->stop - mp->start + 2);
int offset = (int) (q - mp->start);
q->s_type = M_LOOP;
q->it_min = minlimit;
q->it_max = maxlimit;
/* reallocate the states, to insert an item at the beginning */
mp->start = (STATE *) RE_realloc(mp->start, len * STATESZ);
mp->stop = mp->start + len - 1;
q = mp->start;
while (--len != 0) {
q[len] = q[len - 1];
}
q->s_type = M_ENTER;
q->s_data.jump = offset + 1;
}
}
/* replace m with m* limited to the max iterations
(variation of m* closure) */
static void
RE_close_limit(MACHINE * mp, Int min_limit, Int max_limit)
{
RE_close(mp);
RE_set_limit(mp, min_limit, max_limit);
}
/* replace m with m+ limited to the max iterations
which is one or more, limited
(variation of m+ positive closure) */
static void
RE_poscl_limit(MACHINE * mp, Int min_limit, Int max_limit)
{
RE_poscl(mp);
RE_set_limit(mp, min_limit, max_limit);
}
#endif /* ! NO_INTERVAL_EXPR */
/* duplicate_m() relies upon copying machines whose size is 1, i.e., atoms */
#define BigMachine(mp) (((mp)->stop - (mp)->start) > 1)
STATE *
REcompile(char *re, size_t len)
{
#define m_stack(n) &m_array[(n) + 1]
MACHINE m_array[1 + STACKSZ];
OPS op_stack[STACKSZ];
register MACHINE *m_ptr;
register OPS *op_ptr;
register int t;
TRACE(("REcompile %.*s\n", (int) len, re));
/* do this first because it also checks if we have a
run time stack */
RE_lex_init(re, len);
if (len == 0) {
STATE *p = (STATE *) RE_malloc(sizeof(STATE));
p->s_type = M_ACCEPT;
return p;
}
if (setjmp(err_buf))
return NULL;
/* we used to try to recover memory left on machine stack ;
but now m_ptr is in a register so it won't be right unless
we force it out of a register which isn't worth the trouble */
/* initialize the stacks */
m_ptr = m_array;
op_ptr = op_stack;
op_ptr->token = 0;
t = RE_lex(m_stack(0));
memset(m_ptr, 0, sizeof(*m_ptr));
/* provide for making the trace a little easier to read by indenting */
#if OPT_TRACE > 1
#define M_FMT(format) "@%d: %*s " format, __LINE__, 4 * ((int) (m_ptr - m_array)), " "
#else
#define M_FMT(format) format
#endif
while (1) {
TRACE((M_FMT("RE_lex token %s\n"), token_name(t)));
switch (t) {
case T_STR:
case T_ANY:
case T_U:
case T_START:
case T_END:
case T_CLASS:
m_ptr++;
break;
#ifndef NO_INTERVAL_EXPR
case T_RB:
if (!repetitions_flag) {
goto default_case;
}
/* interval expression {n,m}
* eg,
* convert m{3} to mmm
* convert m{3,} to mmm* (with a limit of MAX_INT)
* convert m{3,10} to mmm* with a limit of 10
*/
TRACE((M_FMT("interval {%ld,%ld}\n"), (long) intrvalmin, (long) intrvalmax));
if ((m_ptr - m_array) < STACKSZ)
memset(m_ptr + 1, 0, sizeof(*m_ptr));
if (intrvalmin == 0) { /* zero or more */
switch (intrvalmax) {
case 0:
/* user stupidity: m{0} or m{0,0}
* don't add this re token
*/
if (m_ptr == m_array) {
t = RE_lex(++m_ptr);
if (t != T_NONE) {
continue;
} else {
m_array[1] = RE_any(); /* FIXME: RE_none? */
m_ptr = m_stack(0);
}
} else if (op_ptr != op_stack) {
/* no previous re */
RE_free(m_ptr->start);
m_ptr--;
switch (op_ptr->token) {
case T_RP:
while (op_ptr != op_stack) {
--op_ptr;
if (op_ptr->token == T_LP) {
if (op_ptr == op_stack) {
op_ptr->token = T_NONE;
}
break;
}
}
op_ptr = op_stack + 1;
break;
case T_LP:
break;
default:
op_ptr--;
break;
}
} else if (*re_exp == '\0') {
/* this was the only re expr
so leave one M_ACCEPT as the machine */
m_ptr->start->s_type = M_ACCEPT;
} else {
RE_free(m_ptr->start);
m_ptr--;
}
TRACE((M_FMT("RE_lex token %s\n"),
"of zero interval is ignored!"));
break;
case 1:
RE_01(m_ptr); /* m{0,1} which is m? */
TRACE((M_FMT("RE_lex token %s\n"), token_name(T_Q)));
break;
default:
RE_close_limit(m_ptr, intrvalmin, intrvalmax);
TRACE((M_FMT("RE_lex token %s\n"), token_name(T_Q)));
}
} else if (BigMachine(m_ptr)) {
RE_poscl_limit(m_ptr, intrvalmin, intrvalmax);
} else if (intrvalmin == 1) { /* one or more */
RE_poscl_limit(m_ptr, intrvalmin, intrvalmax);
} else if (m_ptr->start != NULL) { /* n or more */
/* loop-unrolling only works if min==max, so that the loops in
* test/match functions can process the whole loop in each
* iteration */
if (FIXME_INTERVAL_LIMITS || intrvalmin == intrvalmax) {
register Int i;
/* copy 2 copies of m_ptr, use 2nd copy to replace
the first copy that gets swallowed by concat */
MACHINE *result_mp = m_ptr;
MACHINE *concat_mp = (m_ptr + 1);
MACHINE *new_mp = (m_ptr + 2);
TRACE((M_FMT("calling duplicate_m result_mp %ld -> concat_mp %ld\n"),
result_mp - m_array,
concat_mp - m_array));
duplicate_m(concat_mp, result_mp);
TRACE((M_FMT("calling duplicate_m result_mp %ld -> new_mp %ld\n"),
result_mp - m_array,
new_mp - m_array));
duplicate_m(new_mp, result_mp);
for (i = 2; i <= intrvalmin; i++) {
RE_cat(result_mp, concat_mp);
duplicate_m(concat_mp, new_mp);
}
/* don't need 2nd copy in new_mp */
RE_free(new_mp->start);
} else {
RE_poscl_limit(m_ptr, intrvalmin, intrvalmax);
}
}
break;
#endif /* ! NO_INTERVAL_EXPR */
case T_NONE: /* end of reg expr */
if (op_ptr->token == 0) {
/* done */
if (m_ptr == m_stack(0)) {
return m_ptr->start;
} else {
/* machines still on the stack */
RE_panic("values still on machine stack for %s", re);
}
}
/* FALLTHRU */
/* otherwise, default is operator case */
default:
#ifndef NO_INTERVAL_EXPR
default_case:
#endif
if ((op_ptr->prec = table[op_ptr->token][t]) == OP_G) {
do { /* op_pop */
if (op_ptr->token <= T_CAT) { /*binary op */
if (m_ptr == m_stack(0)
&& op_ptr->token == T_CAT) {
TRACE(("...ignoring empty T_CAT\n"));
op_ptr--;
continue;
}
m_ptr--;
}
/* if not enough values on machine stack
then we have a missing operand */
if (m_ptr == m_array)
RE_error_trap(-ERR_4);
switch (op_ptr->token) {
case T_CAT:
RE_cat(m_ptr, m_ptr + 1);
break;
case T_OR:
RE_or(m_ptr, m_ptr + 1);
break;
case T_STAR:
RE_close(m_ptr);
break;
case T_PLUS:
RE_poscl(m_ptr);
break;
case T_Q:
RE_01(m_ptr);
break;
default:
/*nothing on ( or ) */
break;
}
op_ptr--;
}
while (op_ptr->prec != OP_L);
continue; /* back thru switch at top */
}
if (op_ptr->prec < 0) {
if (op_ptr->prec == ERR_7)
RE_panic("parser returns ERR_7");
else
RE_error_trap(-op_ptr->prec);
}
if (++op_ptr == op_stack + STACKSZ) {
/* stack overflow */
RE_error_trap(-ERR_5);
}
op_ptr->token = t;
} /* end of switch */
if (m_ptr >= m_stack(STACKSZ - 1)) {
/*overflow */
RE_error_trap(-ERR_5);
}
t = RE_lex(m_ptr + 1);
}
}
#ifdef NO_LEAKS
void
REdestroy(STATE * ptr)
{
int done = 0;
int n = 0;
STATE *q = ptr;
TRACE(("REdestroy %p\n", (void *) ptr));
while (!done) {
TRACE(("...destroy[%d] %p type %s\n", n, (void *) q, REs_type(q)));
switch (q->s_type) {
case M_ACCEPT:
done = 1;
break;
case M_STR:
RE_free(q->s_data.str);
break;
default:
if (q->s_type < 0 || q->s_type > END_ON)
done = -1;
break;
}
++q;
++n;
}
RE_free(ptr);
}
#endif /* NO_LEAKS */
/* getting here means a logic flaw or unforeseen case */
void
RE_panic(const char *format, ...)
{
const char *where = "REcompile() - panic: ";
va_list args;
fflush(stdout);
#if OPT_TRACE > 0
va_start(args, format);
Trace("?? %s", where);
TraceVA(format, args);
Trace("\n");
va_end(args);
#endif
fputs(where, stderr);
va_start(args, format);
vfprintf(stderr, format, args);
va_end(args);
fprintf(stderr, "\n");
mawk_exit(100);
}
/* getting regexp error message */
const char *
REerror(void)
{
return REerrlist[REerrno];
}