mirror of
https://github.com/ThomasDickey/mawk-snapshots.git
synced 2026-01-26 19:09:15 +00:00
1011 lines
20 KiB
C
1011 lines
20 KiB
C
/********************************************
|
|
rexp0.c
|
|
copyright 2008-2020,2024, Thomas E. Dickey
|
|
copyright 2010, Jonathan Nieder
|
|
copyright 1991-1994,1996, Michael D. Brennan
|
|
|
|
This is a source file for mawk, an implementation of
|
|
the AWK programming language.
|
|
|
|
Mawk is distributed without warranty under the terms of
|
|
the GNU General Public License, version 2, 1991.
|
|
********************************************/
|
|
|
|
/*
|
|
* $MawkId: rexp0.c,v 1.48 2024/08/25 17:16:24 tom Exp $
|
|
*/
|
|
|
|
/* lexical scanner */
|
|
|
|
#undef LOCAL_REGEXP /* no need for push/pop */
|
|
#include <rexp.h>
|
|
|
|
#include <ctype.h>
|
|
|
|
typedef struct {
|
|
int first;
|
|
int last;
|
|
} CCLASS;
|
|
|
|
/* static functions */
|
|
static int do_str(int, char **, MACHINE *);
|
|
static int do_class(char **, MACHINE *);
|
|
static int escape(char **);
|
|
static BV *store_bvp(BV *);
|
|
|
|
/* make next array visible */
|
|
/* *INDENT-OFF* */
|
|
static const
|
|
char char2token[] =
|
|
{
|
|
T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, /*07*/
|
|
T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, /*0f*/
|
|
T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, /*17*/
|
|
T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, /*1f*/
|
|
T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_END, T_CHAR, T_CHAR, T_CHAR, /*27*/
|
|
T_LP, T_RP, T_STAR, T_PLUS, T_CHAR, T_CHAR, T_ANY, T_CHAR, /*2f*/
|
|
T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, /*37*/
|
|
T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_Q, /*3f*/
|
|
T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, /*47*/
|
|
T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, /*4f*/
|
|
T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, /*57*/
|
|
T_CHAR, T_CHAR, T_CHAR, T_CLASS,T_SLASH,T_CHAR, T_START,T_CHAR, /*5f*/
|
|
T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, /*67*/
|
|
T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, /*6f*/
|
|
T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, /*77*/
|
|
T_CHAR, T_CHAR, T_CHAR, T_LB, T_OR, T_RB, T_CHAR, T_CHAR, /*7f*/
|
|
T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, /*87*/
|
|
T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, /*8f*/
|
|
T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, /*97*/
|
|
T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, /*9f*/
|
|
T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, /*a7*/
|
|
T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, /*af*/
|
|
T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, /*b7*/
|
|
T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, /*bf*/
|
|
T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, /*c7*/
|
|
T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, /*cf*/
|
|
T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, /*d7*/
|
|
T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, /*df*/
|
|
T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, /*e7*/
|
|
T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, /*ef*/
|
|
T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, /*f7*/
|
|
T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR /*ff*/
|
|
};
|
|
/* *INDENT-ON* */
|
|
|
|
#define NOT_STARTED (-1)
|
|
|
|
static int prev;
|
|
static size_t nest;
|
|
char *re_exp; /* ptr to reg exp string */
|
|
static char *re_str; /* base of 're_exp' */
|
|
static size_t re_len;
|
|
|
|
#ifndef NO_INTERVAL_EXPR
|
|
Int intrvalmin;
|
|
Int intrvalmax;
|
|
|
|
/*
|
|
* Given a string beginning with T_LB, check if that is an interval expression.
|
|
*/
|
|
static int
|
|
ok_intervals(const char *p)
|
|
{
|
|
int result = 0;
|
|
int comma = 0;
|
|
int ch;
|
|
while ((ch = (UChar) * ++p) != '\0') {
|
|
if (ch == R_CURL) {
|
|
result = 1;
|
|
break;
|
|
} else if (isdigit(ch)) {
|
|
; /* zero or more digits */
|
|
} else if (ch == ',') {
|
|
if (++comma > 1) {
|
|
break; /* zero or one commas */
|
|
}
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
return result;
|
|
}
|
|
|
|
/*
|
|
Collect two numbers between T_LB and T_RB, saving
|
|
the values in intrvalmin and intrvalmax.
|
|
|
|
There are three ways the interval expressions are formed:
|
|
{n} => previous regexp is repeated n times
|
|
{n,m} => previous regexp is repeated n to m times
|
|
{n,} => previous regexp is repeated n or more times
|
|
{,m} => {0,m}
|
|
Note: awk doesn't define {,m}
|
|
|
|
returns: T_RB, or on error T_CHAR
|
|
*/
|
|
|
|
static int
|
|
do_intervals(
|
|
char **pp) /* where to put the re_char pointer on exit */
|
|
{
|
|
register char *p; /* runs thru the input */
|
|
|
|
p = *pp;
|
|
|
|
intrvalmin = 0;
|
|
intrvalmax = 0;
|
|
if (!isdigit((UChar) * p) && *p != ',') /* error */
|
|
{
|
|
RE_error_trap(-ERR_7);
|
|
}
|
|
|
|
if (*p != ',') {
|
|
intrvalmin = intrvalmin * 10 + *p++ - '0';
|
|
|
|
while (*p != '\0') {
|
|
if (isdigit((UChar) * p)) {
|
|
intrvalmin = intrvalmin * 10 + *p++ - '0';
|
|
} else if ((UChar) * p == R_CURL) {
|
|
p++;
|
|
*pp = p;
|
|
intrvalmax = intrvalmin; /* {n} */
|
|
return T_RB;
|
|
} else if ((UChar) * p == ',') {
|
|
if ((UChar) * ++p == R_CURL) {
|
|
p++;
|
|
*pp = p;
|
|
intrvalmax = MAX__INT;
|
|
return T_RB; /* {n,} */
|
|
}
|
|
break;
|
|
} else {
|
|
p++;
|
|
*pp = p;
|
|
RE_error_trap(-ERR_7);
|
|
}
|
|
}
|
|
} else {
|
|
p++;
|
|
}
|
|
while (*p != '\0') {
|
|
if (isdigit((UChar) * p)) {
|
|
intrvalmax = intrvalmax * 10 + *p++ - '0';
|
|
} else if ((UChar) * p == R_CURL) {
|
|
if (intrvalmax < intrvalmin) {
|
|
RE_error_trap(-ERR_7);
|
|
}
|
|
p++;
|
|
break;
|
|
} else {
|
|
p++;
|
|
*pp = p;
|
|
RE_error_trap(-ERR_7);
|
|
}
|
|
}
|
|
|
|
*pp = p;
|
|
return T_RB;
|
|
}
|
|
#endif /* ! NO_INTERVAL_EXPR */
|
|
|
|
void
|
|
RE_lex_init(char *re, size_t len)
|
|
{
|
|
re_str = re_exp = re;
|
|
re_len = len + 1;
|
|
prev = NOT_STARTED;
|
|
nest = 0;
|
|
#ifndef NO_INTERVAL_EXPR
|
|
intrvalmin = 0;
|
|
intrvalmax = 0; /* {n,} sets max to -1 */
|
|
#endif
|
|
RE_run_stack_init();
|
|
RE_pos_stack_init();
|
|
}
|
|
|
|
/*
|
|
* Get the next token from re_str.
|
|
*
|
|
* For nullary operations (T_STR, T_ANY, T_U, T_CLASS, T_START, T_END),
|
|
* before returning the appropriate token, this will write the
|
|
* corresponding machine to *mp.
|
|
*
|
|
* For the rest (T_PLUS, T_STAR, T_OR, T_Q, T_RP, T_LP, T_LB,
|
|
* T_RB, T_CAT), *mp is left alone.
|
|
*
|
|
* Returns 0 for end of regexp.
|
|
*/
|
|
int
|
|
RE_lex(MACHINE * mp)
|
|
{
|
|
/*
|
|
* re_exp records the current position while parsing.
|
|
* nest records the parenthesis nesting level.
|
|
* prev records the last token returned.
|
|
*/
|
|
register int c;
|
|
|
|
if ((unsigned) (1 + re_exp - re_str) >= re_len) {
|
|
return 0;
|
|
}
|
|
|
|
c = char2token[(UChar) (*re_exp)];
|
|
#ifndef NO_INTERVAL_EXPR
|
|
if (repetitions_flag) {
|
|
if (c == T_LB && !ok_intervals(re_exp))
|
|
c = T_CHAR;
|
|
} else {
|
|
if (c == T_LB || c == T_RB) {
|
|
c = T_CHAR;
|
|
}
|
|
}
|
|
#endif
|
|
|
|
switch (c) {
|
|
case T_PLUS:
|
|
case T_STAR:
|
|
if (prev == T_START)
|
|
RE_error_trap(6);
|
|
/* fall thru */
|
|
|
|
case T_OR:
|
|
case T_Q:
|
|
re_exp++;
|
|
return prev = c;
|
|
|
|
#ifndef NO_INTERVAL_EXPR
|
|
case T_LB:
|
|
re_exp++;
|
|
prev = T_LB;
|
|
break;
|
|
#endif
|
|
|
|
case T_RP:
|
|
if (!nest) {
|
|
/* ) without matching ( is ordinary */
|
|
c = T_CHAR;
|
|
break;
|
|
}
|
|
nest--;
|
|
re_exp++;
|
|
return prev = c;
|
|
|
|
case 0:
|
|
return 0;
|
|
|
|
case T_LP:
|
|
switch (prev) {
|
|
case T_CHAR:
|
|
case T_STR:
|
|
case T_ANY:
|
|
case T_CLASS:
|
|
case T_START:
|
|
case T_RP:
|
|
case T_PLUS:
|
|
case T_STAR:
|
|
case T_Q:
|
|
case T_U:
|
|
return prev = T_CAT;
|
|
|
|
#ifndef NO_INTERVAL_EXPR
|
|
case T_RB:
|
|
if (!repetitions_flag) {
|
|
return prev = T_CAT;
|
|
}
|
|
#endif
|
|
|
|
/* FALLTHRU */
|
|
default:
|
|
nest++;
|
|
re_exp++;
|
|
return prev = T_LP;
|
|
} /* T_LP switch */
|
|
}
|
|
|
|
/* *re_exp is an operand, but implicit cat op is possible */
|
|
switch (prev) {
|
|
case NOT_STARTED:
|
|
case T_OR:
|
|
case T_LP:
|
|
case T_CAT:
|
|
|
|
switch (c) {
|
|
case T_ANY:
|
|
{
|
|
static int plus_is_star_flag = 0;
|
|
|
|
if (*++re_exp == '*') {
|
|
re_exp++;
|
|
*mp = RE_u();
|
|
return prev = T_U;
|
|
} else if (*re_exp == '+') {
|
|
if (plus_is_star_flag) {
|
|
re_exp++;
|
|
*mp = RE_u();
|
|
plus_is_star_flag = 0;
|
|
return prev = T_U;
|
|
} else {
|
|
plus_is_star_flag = 1;
|
|
re_exp--;
|
|
*mp = RE_any();
|
|
return prev = T_ANY;
|
|
}
|
|
} else {
|
|
*mp = RE_any();
|
|
prev = T_ANY;
|
|
}
|
|
}
|
|
break;
|
|
|
|
case T_SLASH:
|
|
re_exp++;
|
|
c = escape(&re_exp);
|
|
prev = do_str(c, &re_exp, mp);
|
|
break;
|
|
|
|
#ifndef NO_INTERVAL_EXPR
|
|
case T_LB:
|
|
case T_RB:
|
|
#endif
|
|
case T_CHAR:
|
|
c = *re_exp++;
|
|
prev = do_str(c, &re_exp, mp);
|
|
break;
|
|
|
|
case T_CLASS:
|
|
prev = do_class(&re_exp, mp);
|
|
break;
|
|
|
|
case T_START:
|
|
*mp = RE_start();
|
|
re_exp++;
|
|
prev = T_START;
|
|
break;
|
|
|
|
case T_END:
|
|
re_exp++;
|
|
*mp = RE_end();
|
|
return prev = T_END;
|
|
|
|
default:
|
|
RE_panic("bad switch in RE_lex: %d", c);
|
|
} /* T_CAT switch */
|
|
break;
|
|
|
|
#ifndef NO_INTERVAL_EXPR
|
|
case T_LB:
|
|
/* get interval expression numbers until closing T_RB */
|
|
prev = do_intervals(&re_exp);
|
|
return prev = T_RB;
|
|
|
|
case T_RB:
|
|
/* FALLTHRU */
|
|
#endif
|
|
|
|
default:
|
|
/* don't advance the pointer */
|
|
return prev = T_CAT;
|
|
}
|
|
|
|
/* check for end character */
|
|
if (*re_exp == '$') {
|
|
mp->start->s_type = (SType) (mp->start->s_type + END_ON);
|
|
re_exp++;
|
|
}
|
|
|
|
return prev;
|
|
}
|
|
|
|
/*
|
|
Collect a run of characters into a string machine.
|
|
If the run ends at *,+, or ?, then don't take the last
|
|
character unless the string has length one.
|
|
*/
|
|
|
|
static int
|
|
do_str(
|
|
int c, /* the first character */
|
|
char **pp, /* where to put the re_char pointer on exit */
|
|
MACHINE * mp) /* where to put the string machine */
|
|
{
|
|
register char *p; /* runs thru the input */
|
|
char *pt = 0; /* trails p by one */
|
|
char *str; /* collect it here */
|
|
register char *s; /* runs thru the output */
|
|
size_t len; /* length collected */
|
|
|
|
p = *pp;
|
|
s = str = RE_malloc(re_len);
|
|
*s++ = (char) c;
|
|
len = 1;
|
|
|
|
while ((1 + p - re_str) < (int) re_len) {
|
|
char *save;
|
|
|
|
c = char2token[(UChar) (*p)];
|
|
#ifndef NO_INTERVAL_EXPR
|
|
if (!repetitions_flag && (c == T_LB || c == T_RB)) {
|
|
c = T_CHAR;
|
|
}
|
|
#endif
|
|
|
|
switch (c) {
|
|
case T_CHAR:
|
|
pt = p;
|
|
*s++ = *p++;
|
|
break;
|
|
|
|
case T_SLASH:
|
|
pt = p;
|
|
save = p + 1; /* keep p in a register */
|
|
*s++ = (char) escape(&save);
|
|
p = save;
|
|
break;
|
|
|
|
default:
|
|
goto out;
|
|
}
|
|
len++;
|
|
}
|
|
|
|
out:
|
|
/* if len > 1 and we stopped on a ? + or * , need to back up */
|
|
if (len > 1 && (*p == '*' || *p == '+' || *p == '?'
|
|
#ifndef NO_INTERVAL_EXPR
|
|
|| (repetitions_flag == 1 && *p == L_CURL)
|
|
#endif
|
|
)) {
|
|
len--;
|
|
p = pt;
|
|
s--;
|
|
}
|
|
|
|
*s = 0;
|
|
*pp = p;
|
|
*mp = RE_str((char *) RE_realloc(str, len + 1), len);
|
|
return T_STR;
|
|
}
|
|
|
|
/*--------------------------------------------
|
|
BUILD A CHARACTER CLASS
|
|
*---------------------------*/
|
|
|
|
#define char_on(b,x) ((b)[(x)>>3] |= (UChar) ( 1 << ((x)&7) ))
|
|
|
|
static void
|
|
block_on(BV b, int x, int y)
|
|
/* caller makes sure x<=y and x>0 y>0 */
|
|
{
|
|
int lo = x >> 3;
|
|
int hi = y >> 3;
|
|
int r_lo = x & 7;
|
|
int r_hi = y & 7;
|
|
|
|
if (lo == hi) {
|
|
b[lo] |= (UChar) ((1 << (r_hi + 1)) - (1 << r_lo));
|
|
} else {
|
|
int i;
|
|
for (i = lo + 1; i < hi; i++)
|
|
b[i] = 0xff;
|
|
b[lo] |= (UChar) (0xff << r_lo);
|
|
b[hi] |= (UChar) (~(0xff << (r_hi + 1)));
|
|
}
|
|
}
|
|
|
|
#define CCLASS_DATA(name) { CCLASS_##name, #name, sizeof(#name) - 1 }
|
|
|
|
typedef enum {
|
|
CCLASS_NONE = 0,
|
|
CCLASS_alnum,
|
|
CCLASS_alpha,
|
|
CCLASS_blank,
|
|
CCLASS_cntrl,
|
|
CCLASS_digit,
|
|
CCLASS_graph,
|
|
CCLASS_lower,
|
|
CCLASS_print,
|
|
CCLASS_punct,
|
|
CCLASS_space,
|
|
CCLASS_upper,
|
|
CCLASS_xdigit
|
|
} CCLASS_ENUM;
|
|
|
|
#ifndef isblank
|
|
#define isblank(c) ((c) == ' ' || (c) == '\t')
|
|
#endif
|
|
|
|
static CCLASS *
|
|
lookup_cclass(char **start)
|
|
{
|
|
static CCLASS *cclass_data[CCLASS_xdigit];
|
|
static const struct {
|
|
CCLASS_ENUM code;
|
|
const char name[8];
|
|
unsigned size;
|
|
} cclass_table[] = {
|
|
CCLASS_DATA(alnum),
|
|
CCLASS_DATA(alpha),
|
|
CCLASS_DATA(blank),
|
|
CCLASS_DATA(cntrl),
|
|
CCLASS_DATA(digit),
|
|
CCLASS_DATA(graph),
|
|
CCLASS_DATA(lower),
|
|
CCLASS_DATA(print),
|
|
CCLASS_DATA(punct),
|
|
CCLASS_DATA(space),
|
|
CCLASS_DATA(upper),
|
|
CCLASS_DATA(xdigit),
|
|
};
|
|
CCLASS *result = 0;
|
|
CCLASS_ENUM code = CCLASS_NONE;
|
|
const char *name;
|
|
char *colon;
|
|
size_t size;
|
|
size_t item;
|
|
|
|
#ifdef NO_LEAKS
|
|
if (start == 0) {
|
|
for (item = 0; item < (sizeof(cclass_data) /
|
|
sizeof(cclass_data[0])); ++item) {
|
|
if (cclass_data[item]) {
|
|
free(cclass_data[item]);
|
|
cclass_data[item] = 0;
|
|
}
|
|
}
|
|
return 0;
|
|
}
|
|
#endif
|
|
name = (*start += 2); /* point past "[:" */
|
|
colon = strchr(name, ':');
|
|
if (colon == 0 || colon[1] != ']') {
|
|
RE_error_trap(-ERR_3);
|
|
}
|
|
|
|
size = (size_t) (colon - *start); /* length of name */
|
|
if (size < 5 || size > 6) {
|
|
RE_error_trap(-ERR_3);
|
|
}
|
|
|
|
*start = colon + 2;
|
|
|
|
switch (name[0]) {
|
|
case 'a':
|
|
item = ((name[2] == 'n')
|
|
? CCLASS_alnum
|
|
: CCLASS_alpha);
|
|
break;
|
|
case 'b':
|
|
item = CCLASS_blank;
|
|
break;
|
|
case 'c':
|
|
item = CCLASS_cntrl;
|
|
break;
|
|
case 'd':
|
|
item = CCLASS_digit;
|
|
break;
|
|
case 'g':
|
|
item = CCLASS_graph;
|
|
break;
|
|
case 'l':
|
|
item = CCLASS_lower;
|
|
break;
|
|
case 'p':
|
|
item = ((name[1] == 'r')
|
|
? CCLASS_print
|
|
: CCLASS_punct);
|
|
break;
|
|
case 's':
|
|
item = CCLASS_space;
|
|
break;
|
|
case 'u':
|
|
item = CCLASS_upper;
|
|
break;
|
|
case 'x':
|
|
item = CCLASS_xdigit;
|
|
break;
|
|
default:
|
|
item = CCLASS_NONE;
|
|
break;
|
|
}
|
|
|
|
if (item-- != CCLASS_NONE &&
|
|
(size == cclass_table[item].size) &&
|
|
!strncmp(name, cclass_table[item].name, size)) {
|
|
code = cclass_table[item].code;
|
|
} else {
|
|
RE_error_trap(-ERR_3);
|
|
}
|
|
|
|
if ((result = cclass_data[item]) == 0) {
|
|
int ch = 0;
|
|
size_t have = 4;
|
|
size_t used = 0;
|
|
CCLASS *data = malloc(sizeof(CCLASS) * have);
|
|
int in_class = 0;
|
|
int first = -2;
|
|
int last = -2;
|
|
|
|
for (ch = 0; ch < 256; ++ch) {
|
|
switch (code) {
|
|
case CCLASS_NONE:
|
|
in_class = 0;
|
|
break;
|
|
case CCLASS_alnum:
|
|
in_class = isalnum(ch);
|
|
break;
|
|
case CCLASS_alpha:
|
|
in_class = isalpha(ch);
|
|
break;
|
|
case CCLASS_blank:
|
|
in_class = isblank(ch);
|
|
break;
|
|
case CCLASS_cntrl:
|
|
in_class = iscntrl(ch);
|
|
break;
|
|
case CCLASS_digit:
|
|
in_class = isdigit(ch);
|
|
break;
|
|
case CCLASS_graph:
|
|
in_class = isgraph(ch);
|
|
break;
|
|
case CCLASS_lower:
|
|
in_class = islower(ch);
|
|
break;
|
|
case CCLASS_print:
|
|
in_class = isprint(ch);
|
|
break;
|
|
case CCLASS_punct:
|
|
in_class = ispunct(ch);
|
|
break;
|
|
case CCLASS_space:
|
|
in_class = isspace(ch);
|
|
break;
|
|
case CCLASS_upper:
|
|
in_class = isupper(ch);
|
|
break;
|
|
case CCLASS_xdigit:
|
|
in_class = isxdigit(ch);
|
|
break;
|
|
}
|
|
if (in_class) {
|
|
if (first >= 0) {
|
|
last = ch;
|
|
} else {
|
|
first = last = ch;
|
|
}
|
|
} else if (first >= 0) {
|
|
if (used + 2 >= have) {
|
|
have *= 2;
|
|
data = realloc(data, sizeof(CCLASS) * have);
|
|
}
|
|
data[used].first = first;
|
|
data[used].last = last;
|
|
++used;
|
|
first = last = -2;
|
|
}
|
|
}
|
|
if (first >= 0) {
|
|
if (used + 2 >= have) {
|
|
have *= 2;
|
|
data = realloc(data, sizeof(CCLASS) * have);
|
|
}
|
|
data[used].first = first;
|
|
data[used].last = last;
|
|
++used;
|
|
}
|
|
data[used].first = -1;
|
|
cclass_data[item] = data;
|
|
result = data;
|
|
}
|
|
return result;
|
|
}
|
|
|
|
static CCLASS *
|
|
get_cclass(char *start, char **next)
|
|
{
|
|
CCLASS *result = 0;
|
|
|
|
if (start[0] == '['
|
|
&& start[1] == ':') {
|
|
result = lookup_cclass(&start);
|
|
if (next != 0) {
|
|
*next = start;
|
|
}
|
|
}
|
|
return result;
|
|
}
|
|
|
|
/*
|
|
* Check if we're pointing to a left square-bracket. If so, return nonzero
|
|
* if that is a literal one, not part of character class, etc.
|
|
*
|
|
* http://www.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap09.html#tag_09_03_05
|
|
*/
|
|
static int
|
|
literal_leftsq(char *start)
|
|
{
|
|
int result = 0;
|
|
if (start[0] == '[') {
|
|
if (get_cclass(start, 0) == 0)
|
|
result = 1;
|
|
}
|
|
return result;
|
|
}
|
|
|
|
/* build a BV for a character class.
|
|
*start points at the '['
|
|
on exit: *start points at the character after ']'
|
|
mp points at a machine that recognizes the class
|
|
*/
|
|
|
|
static int
|
|
do_class(char **start, MACHINE * mp)
|
|
{
|
|
char *p, *q;
|
|
BV *bvp;
|
|
int prevc;
|
|
int comp_flag;
|
|
int level;
|
|
CCLASS *cclass;
|
|
|
|
p = (*start) + 1;
|
|
|
|
/* []...] puts ] in a class
|
|
[^]..] negates a class with ]
|
|
*/
|
|
if (literal_leftsq(p) || p[0] == ']')
|
|
p++;
|
|
else if (p[0] == '^' && (literal_leftsq(p + 1) || p[1] == ']'))
|
|
p += 2;
|
|
|
|
/* XXX. Does not handle collating symbols or equivalence
|
|
* class expressions. See also collect_RE(). */
|
|
for (level = 0, q = p;; ++q) {
|
|
if (*q == '[' && q[1] == ':') {
|
|
if (++level > 1)
|
|
RE_error_trap(-ERR_3);
|
|
} else if (*q == ']') {
|
|
if (level == 0)
|
|
break;
|
|
if (q[-1] != ':')
|
|
RE_error_trap(-ERR_3);
|
|
--level;
|
|
} else if (*q == '\\') {
|
|
++q;
|
|
}
|
|
if (*q == '\0' && q == (re_str + re_len - 1)) {
|
|
/* no closing bracket */
|
|
RE_error_trap(-ERR_3);
|
|
}
|
|
}
|
|
|
|
/* q now pts at the back of the class */
|
|
p = *start + 1;
|
|
*start = q + 1;
|
|
|
|
bvp = (BV *) RE_malloc(sizeof(BV));
|
|
memset(bvp, 0, sizeof(BV));
|
|
|
|
if (*p == '^') {
|
|
comp_flag = 1;
|
|
p++;
|
|
} else
|
|
comp_flag = 0;
|
|
|
|
prevc = -1; /* indicates - cannot be part of a range */
|
|
|
|
while (p < q) {
|
|
switch (*p) {
|
|
case '\\':
|
|
|
|
++p;
|
|
prevc = escape(&p);
|
|
char_on(*bvp, prevc);
|
|
break;
|
|
|
|
case '[':
|
|
if ((cclass = get_cclass(p, &p)) != 0) {
|
|
while (cclass->first >= 0) {
|
|
block_on(*bvp, cclass->first, cclass->last);
|
|
++cclass;
|
|
}
|
|
} else {
|
|
prevc = (UChar) * p++;
|
|
char_on(*bvp, prevc);
|
|
}
|
|
break;
|
|
|
|
case '-':
|
|
|
|
if (prevc == -1 || p + 1 == q) {
|
|
prevc = '-';
|
|
char_on(*bvp, '-');
|
|
p++;
|
|
} else {
|
|
int c;
|
|
char *mark = ++p;
|
|
|
|
if (*p != '\\')
|
|
c = (UChar) * p++;
|
|
else {
|
|
++p;
|
|
c = escape(&p);
|
|
}
|
|
|
|
if (prevc <= c) {
|
|
block_on(*bvp, prevc, c);
|
|
prevc = -1;
|
|
} else { /* back up */
|
|
p = mark;
|
|
prevc = '-';
|
|
char_on(*bvp, '-');
|
|
}
|
|
}
|
|
break;
|
|
|
|
default:
|
|
prevc = (UChar) * p++;
|
|
char_on(*bvp, prevc);
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (comp_flag) {
|
|
for (p = (char *) bvp; p < (char *) bvp + sizeof(BV); p++) {
|
|
*p = (char) (~*p);
|
|
}
|
|
}
|
|
|
|
*mp = RE_class(store_bvp(bvp));
|
|
return T_CLASS;
|
|
}
|
|
|
|
/* storage for bit vectors so they can be reused ,
|
|
stored in an unsorted linear array
|
|
the array grows as needed
|
|
*/
|
|
|
|
#define BV_GROWTH 6
|
|
|
|
static BV **bv_base, **bv_limit;
|
|
static BV **bv_next; /* next empty slot in the array */
|
|
|
|
static BV *
|
|
store_bvp(BV * bvp)
|
|
{
|
|
register BV **p;
|
|
unsigned t;
|
|
|
|
if (bv_next == bv_limit) {
|
|
/* need to grow */
|
|
if (!bv_base) {
|
|
/* first growth */
|
|
t = 0;
|
|
bv_base = (BV **) RE_malloc(BV_GROWTH * sizeof(BV *));
|
|
} else {
|
|
t = (unsigned) (bv_next - bv_base);
|
|
bv_base = (BV **) RE_realloc(bv_base,
|
|
(t + BV_GROWTH) * sizeof(BV *));
|
|
}
|
|
|
|
bv_next = bv_base + t;
|
|
bv_limit = bv_next + BV_GROWTH;
|
|
}
|
|
|
|
/* put bvp in bv_next as a sentinel */
|
|
*bv_next = bvp;
|
|
p = bv_base;
|
|
while (memcmp(*p, bvp, sizeof(BV)))
|
|
p++;
|
|
|
|
if (p == bv_next) {
|
|
/* it is new */
|
|
bv_next++;
|
|
} else {
|
|
/* we already have it */
|
|
RE_free(bvp);
|
|
}
|
|
|
|
return *p;
|
|
}
|
|
|
|
/* ---------- convert escape sequences -------------*/
|
|
|
|
#define isoctal(x) ((x)>='0'&&(x)<='7')
|
|
|
|
#define NOT_HEX 16
|
|
static const char hex_val['f' - 'A' + 1] =
|
|
{
|
|
10, 11, 12, 13, 14, 15, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
10, 11, 12, 13, 14, 15};
|
|
|
|
/* interpret 1 character as hex */
|
|
static int
|
|
ctohex(int c)
|
|
{
|
|
int t;
|
|
|
|
if (isdigit((UChar) c))
|
|
return c - '0';
|
|
if (isxdigit((UChar) c) && (t = hex_val[c - 'A']))
|
|
return t;
|
|
return NOT_HEX;
|
|
}
|
|
|
|
/*
|
|
* Return the char and move the pointer forward.
|
|
* On entry *s -> at the character after the slash.
|
|
*/
|
|
static int
|
|
escape(char **start_p)
|
|
{
|
|
register char *p = *start_p;
|
|
register unsigned x;
|
|
unsigned xx;
|
|
|
|
switch (*p) {
|
|
case 'n':
|
|
*start_p = p + 1;
|
|
return '\n';
|
|
case 't':
|
|
*start_p = p + 1;
|
|
return '\t';
|
|
case 'f':
|
|
*start_p = p + 1;
|
|
return '\f';
|
|
case 'b':
|
|
*start_p = p + 1;
|
|
return '\b';
|
|
case 'r':
|
|
*start_p = p + 1;
|
|
return '\r';
|
|
case 'a':
|
|
*start_p = p + 1;
|
|
return '\07';
|
|
case 'v':
|
|
*start_p = p + 1;
|
|
return '\013';
|
|
case '\0':
|
|
return '\\';
|
|
case '0':
|
|
case '1':
|
|
case '2':
|
|
case '3':
|
|
case '4':
|
|
case '5':
|
|
case '6':
|
|
case '7':
|
|
x = (unsigned) (*p++ - '0');
|
|
if (isoctal(*p)) {
|
|
x = (x << 3) + (unsigned) (*p++ - '0');
|
|
if (isoctal(*p))
|
|
x = (x << 3) + (unsigned) (*p++ - '0');
|
|
}
|
|
*start_p = p;
|
|
return (int) (x & 0xff);
|
|
case 'x':
|
|
++p;
|
|
if ((x = (unsigned) ctohex(*p)) == NOT_HEX) {
|
|
*start_p = p;
|
|
return 'x';
|
|
}
|
|
|
|
/* look for another hex digit */
|
|
if ((xx = (unsigned) ctohex(*++p)) != NOT_HEX) {
|
|
x = (x << 4) + xx;
|
|
p++;
|
|
}
|
|
|
|
*start_p = p;
|
|
return (int) x;
|
|
default:
|
|
/* anything else \c -> c */
|
|
*start_p = p + 1;
|
|
return (UChar) * p;
|
|
}
|
|
}
|