diff --git a/CHANGES b/CHANGES index 567d263..0b3bc0e 100644 --- a/CHANGES +++ b/CHANGES @@ -1,8 +1,14 @@ --- $MawkId: CHANGES,v 1.56 2009/09/14 09:32:45 tom Exp $ +-- $MawkId: CHANGES,v 1.58 2009/09/16 23:32:59 tom Exp $ Changes by Thomas E Dickey -20090914 +20090916 + correct logic in scan.c to handle expression "[[]" (report by Aleksey + Cheusov). + + add MAWK_LONG_OPTIONS feature to allow mawk to ignore long options + which are not implemented. + modify built-in regular expression functions to accept embedded nulls. modify input reader FINgets() to accept embedded nulls in data read diff --git a/MANIFEST b/MANIFEST index 1f142c4..1af9c71 100644 --- a/MANIFEST +++ b/MANIFEST @@ -1,4 +1,4 @@ -MANIFEST for mawk, version t20090820a +MANIFEST for mawk, version t20090820b -------------------------------------------------------------------------------- MANIFEST this file ACKNOWLEDGMENT acknowledgements diff --git a/cast.c b/cast.c index 066f27c..41010d3 100644 --- a/cast.c +++ b/cast.c @@ -10,7 +10,7 @@ the GNU General Public License, version 2, 1991. ********************************************/ /* - * $MawkId: cast.c,v 1.8 2009/08/20 23:00:13 tom Exp $ + * $MawkId: cast.c,v 1.9 2009/09/16 09:29:51 tom Exp $ * @Log: cast.c,v @ * Revision 1.6 1996/08/11 22:07:50 mike * Fix small bozo in rt_error("overflow converting ...") @@ -316,6 +316,12 @@ cast_for_split(CELL * cp) cp->type = C_SPACE; return; } else if (c == 0) { +#ifdef LOCAL_REGEXP + char temp[1]; + temp[0] = (char) c; + free_STRING(string(cp)); + cp->ptr = (PTR) new_STRING1(temp, 1); +#else /* * A null is not a meta character, but strchr will match it anyway. * For now, there's no reason to compile a null as a regular @@ -327,6 +333,7 @@ cast_for_split(CELL * cp) free_STRING(string(cp)); cp->ptr = (PTR) new_STRING1(temp, 1); return; +#endif } else if (strchr(meta, c)) { xbuff[1] = (char) c; free_STRING(string(cp)); diff --git a/init.c b/init.c index c618485..8c6f577 100644 --- a/init.c +++ b/init.c @@ -10,7 +10,7 @@ the GNU General Public License, version 2, 1991. ********************************************/ /* - * $MawkId: init.c,v 1.11 2009/08/21 00:53:52 tom Exp $ + * $MawkId: init.c,v 1.12 2009/09/16 22:32:17 tom Exp $ * @Log: init.c,v @ * Revision 1.11 1995/08/20 17:35:21 mike * include for MSC, needed for environ decl @@ -169,6 +169,30 @@ process_cmdline(int argc, char **argv) } /* safe to look at argv[i][2] */ + /* + * Check for "long" options and decide how to handle them. + */ + if (strlen(argv[i]) > 2 && !strncmp(argv[i], "--", 2)) { + char *env = getenv("MAWK_LONG_OPTIONS"); + if (env != 0) { + switch (*env) { + default: + case 'e': /* error */ + bad_option(argv[i]); + break; + case 'w': /* warn */ + errmsg(0, "ignored option: %s", argv[i]); + break; + case 'i': /* ignore */ + break; + } + } else { + bad_option(argv[i]); + } + nextarg = i + 1; + continue; + } + if (argv[i][2] == 0) { if (i == argc - 1 && argv[i][1] != '-') { if (strchr("WFvf", argv[i][1])) { diff --git a/rexp0.c b/rexp0.c index a5b8a26..45deccd 100644 --- a/rexp0.c +++ b/rexp0.c @@ -10,7 +10,7 @@ the GNU General Public License, version 2, 1991. ********************************************/ /* - * $MawkId: rexp0.c,v 1.13 2009/09/13 22:38:22 tom Exp $ + * $MawkId: rexp0.c,v 1.16 2009/09/17 22:58:49 tom Exp $ * @Log: rexp0.c,v @ * Revision 1.5 1996/11/08 15:39:27 mike * While cleaning up block_on, I introduced a bug. Now fixed. @@ -84,7 +84,7 @@ static BV *store_bvp(BV *); static const char RE_char2token['|' + 1] = { - 0, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, /*07*/ + T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, /*07*/ T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, /*0f*/ T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, /*17*/ T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, T_CHAR, /*1f*/ @@ -127,6 +127,10 @@ RE_lex(MACHINE * mp) { register int c; + if ((unsigned) (1 + lp - re_str) >= re_len) { + return 0; + } + switch (c = char2token((UChar) (*lp))) { case T_PLUS: case T_STAR: @@ -269,7 +273,7 @@ do_str( *s++ = (char) c; len = 1; - while (1) { + while ((1 + p - re_str) < (int) re_len) { char *save; switch (char2token((UChar) (*p))) { @@ -398,8 +402,9 @@ lookup_cclass(char **start) } } - if (code == CCLASS_NONE) + if (code == CCLASS_NONE) { RE_error_trap(-E3); + } if ((result = cclass_table[item].data) == 0) { int ch = 0; @@ -485,6 +490,38 @@ lookup_cclass(char **start) return result; } +static CCLASS * +get_cclass(char *start, char **next) +{ + CCLASS *result = 0; + + if (start[0] == '[' + && start[1] == ':') { + result = lookup_cclass(&start); + if (next != 0) { + *next = start; + } + } + return result; +} + +/* + * Check if we're pointing to a left square-bracket. If so, return nonzero + * if that is a literal one, not part of character class, etc. + * + * http://www.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap09.html#tag_09_03_05 + */ +static int +literal_leftsq(char *start) +{ + int result = 0; + if (start[0] == '[') { + if (get_cclass(start, 0) == 0) + result = 1; + } + return result; +} + /* build a BV for a character class. *start points at the '[' on exit: *start points at the character after ']' @@ -506,15 +543,16 @@ do_class(char **start, MACHINE * mp) /* []...] puts ] in a class [^]..] negates a class with ] */ - if (*p == ']') + if (literal_leftsq(p) || p[0] == ']') p++; - else if (*p == '^' && *(p + 1) == ']') + else if (p[0] == '^' && (literal_leftsq(p + 1) || p[1] == ']')) p += 2; for (level = 0, q = p; (level != 0) || (*q != ']'); ++q) { if (*q == '[') { - if (q[1] != ':' || ++level > 1) + if (q[1] != ':' || ++level > 1) { RE_error_trap(-E3); + } } else if (*q == ']') { if (level > 0) { if (q[-1] != ':') @@ -554,7 +592,7 @@ do_class(char **start, MACHINE * mp) break; case '[': - if (p[1] == ':' && (cclass = lookup_cclass(&p)) != 0) { + if ((cclass = get_cclass(p, &p)) != 0) { while (cclass->first >= 0) { block_on(*bvp, cclass->first, cclass->last); ++cclass; diff --git a/scan.c b/scan.c index 2a1eef3..8d744aa 100644 --- a/scan.c +++ b/scan.c @@ -10,7 +10,7 @@ the GNU General Public License, version 2, 1991. ********************************************/ /* - * $MawkId: scan.c,v 1.11 2009/07/27 20:39:41 tom Exp $ + * $MawkId: scan.c,v 1.12 2009/09/17 09:35:28 tom Exp $ * @Log: scan.c,v @ * Revision 1.8 1996/07/28 21:47:05 mike * gnuish patch @@ -1003,7 +1003,8 @@ collect_RE(void) string_buff); mawk_exit(2); } - switch (scan_code[(UChar) (*p++ = (char) next())]) { + c = (UChar) (*p++ = (char) next()); + switch (scan_code[c]) { case SC_POW: if (p == first + 1) { first = p; @@ -1016,9 +1017,17 @@ collect_RE(void) * started, so we can make comparisons to handle things like * "[]xxxx]" and "[^]xxxx]". */ - if (!boxed) + if (!boxed) { first = p; - ++boxed; + ++boxed; + } else if (p != first + 1) { + ++boxed; + } else { + if (next() == ':') { + ++boxed; + } + un_next(); + } break; case SC_RBOX: diff --git a/test/reg-awk.out b/test/reg-awk.out index 9daad79..e844811 100644 --- a/test/reg-awk.out +++ b/test/reg-awk.out @@ -4,92 +4,151 @@ reg4.1<<: >> reg4.2<<: >> reg4.3<<: >> +reg4.4<<: >> +reg4.5<<: >> reg4.1<<: >> reg4.2<<: >> reg4.3<<: >> +reg4.4<<: >> +reg4.5<<: >> reg4.1<<: >> reg4.2<<: >> reg4.3<<: >> +reg4.4<<: >> +reg4.5<<: >> reg4.1<<: >> reg4.2<<: >> reg4.3<<: >> +reg4.4<<: >> +reg4.5<<: >> reg4.1<<: >> reg4.2<<: >> reg4.3<<: >> +reg4.4<<: >> +reg4.5<<: >> reg4.1<<: >> reg4.2<<: >> reg4.3<<: >> +reg4.4<<: >> +reg4.5<<: >> reg4.1<<: >> reg4.2<<: >> reg4.3<<: >> +reg4.4<<: >> +reg4.5<<: >> reg4.3<<: to >> +reg4.5<<: to >> reg4.1<<: >> reg4.2<<: >> reg4.3<<: >> +reg4.4<<: >> +reg4.5<<: >> reg4.1<<: >> reg4.2<<: >> reg4.3<<: >> +reg4.4<<: >> +reg4.5<<: >> reg4.3<<: Some care is needed so that things like >> +reg4.5<<: Some care is needed so that things like >> reg4.1<<: >> reg4.2<<: >> reg4.3<<: >> +reg4.4<<: >> +reg4.5<<: >> reg4.3<<: static unsigned last_dhash ; >> +reg4.5<<: static unsigned last_dhash ; >> reg4.1<<: >> reg4.2<<: >> reg4.3<<: >> +reg4.4<<: >> +reg4.5<<: >> reg4.1<<: >> reg4.2<<: >> reg4.3<<: >> +reg4.4<<: >> +reg4.5<<: >> reg4.3<<: ARRAY A ; >> +reg4.5<<: ARRAY A ; >> reg4.3<<: STRING *sval ; >> +reg4.5<<: STRING *sval ; >> reg4.3<<: { >> reg4.1<<: >> reg4.2<<: >> reg4.3<<: >> +reg4.4<<: >> +reg4.5<<: >> reg4.3<<: { >> reg4.1<<: >> reg4.2<<: >> reg4.3<<: >> +reg4.4<<: >> +reg4.5<<: >> reg4.3<<: } >> reg4.1<<: >> reg4.2<<: >> reg4.3<<: >> +reg4.4<<: >> +reg4.5<<: >> reg4.3<<: { >> reg4.3<<: else >> +reg4.5<<: else >> reg4.3<<: } >> reg4.1<<: >> reg4.2<<: >> reg4.3<<: >> +reg4.4<<: >> +reg4.5<<: >> reg4.3<<: } >> reg4.3<<: return p ; >> +reg4.5<<: return p ; >> reg4.3<<: } >> +reg4.4<<: } >> reg4.1<<: >> reg4.2<<: >> reg4.3<<: >> +reg4.4<<: >> +reg4.5<<: >> reg4.1<<: >> reg4.2<<: >> reg4.3<<: >> +reg4.4<<: >> +reg4.5<<: >> reg4.1<<: >> reg4.2<<: >> reg4.3<<: >> +reg4.4<<: >> +reg4.5<<: >> reg4.3<<: ARRAY A ; >> +reg4.5<<: ARRAY A ; >> reg4.3<<: double d ; >> +reg4.5<<: double d ; >> reg4.3<<: int cflag ; >> +reg4.5<<: int cflag ; >> reg4.3<<: { >> +reg4.4<<: { >> reg4.3<<: ANODE *ap ; >> +reg4.5<<: ANODE *ap ; >> reg4.1<<: >> reg4.2<<: >> reg4.3<<: >> +reg4.4<<: >> +reg4.5<<: >> reg4.3<<: break ; >> +reg4.5<<: break ; >> reg4.3<<: } >> reg4.3<<: } >> reg4.3<<: else >> +reg4.5<<: else >> reg4.1<<: >> reg4.2<<: >> reg4.3<<: >> +reg4.4<<: >> +reg4.5<<: >> reg4.1<<: >> reg4.2<<: >> reg4.3<<: >> +reg4.4<<: >> +reg4.5<<: >> 26..12: each array is of size A_HASH_PRIME. reg5.1<> 26..12: each array is of size A_HASH_PRIME. diff --git a/test/reg4.awk b/test/reg4.awk index 05ab715..483708c 100644 --- a/test/reg4.awk +++ b/test/reg4.awk @@ -1,15 +1,23 @@ -# $MawkId: reg4.awk,v 1.4 2009/07/12 22:23:58 tom Exp $ +# $MawkId: reg4.awk,v 1.7 2009/09/17 23:29:01 tom Exp $ { if ($0 ~/^[-+()0-9.,$%/'"]*$/) - { + { print ("reg4.1<<:",$0,">>") } if ($0 ~/^[]+()0-9.,$%/'"-]*$/) - { + { print ("reg4.2<<:",$0,">>") } if ($0 ~/^[^]+()0-9.,$%/'"-]*$/) - { + { print ("reg4.3<<:",$0,">>") } + if ($0 ~/^[[+(){}0-9.,$%/'"-]*$/) + { + print ("reg4.4<<:",$0,">>") + } + if ($0 ~/^[^[+(){}0-9.,$%/'"-]*$/) + { + print ("reg4.5<<:",$0,">>") + } } diff --git a/test/reg5.awk b/test/reg5.awk index 666c405..e74aa9b 100644 --- a/test/reg5.awk +++ b/test/reg5.awk @@ -1,4 +1,4 @@ -# $MawkId: reg5.awk,v 1.1 2009/07/27 18:55:24 tom Exp $ +# $MawkId: reg5.awk,v 1.2 2009/09/17 00:51:34 tom Exp $ BEGIN { pat1="([[:upper:][:digit:]])+(_[[:upper:][:digit:]]+)+" pat2="0x[[:xdigit:]]+" @@ -22,4 +22,5 @@ BEGIN { printf "%d..%d:%s\n", RSTART, RLENGTH, $0 printf ("reg5.3<<%s>>\n",substr($0,RSTART,RLENGTH)) } + # add patterns like those in reg4.awk which exercise [, ] at beginning }