From ea317ccb31f6a2617f030e911668f46874f7dacb Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sat, 4 Feb 2012 17:08:58 -0700 Subject: [PATCH] regcomp.c: Use compiled-in inversion lists This uses the compiled inversion lists to generate Posix character classes and things like \v, \s inside bracketed character classes. This paves the way for future optimizations, and fixes the bug which has no formal bug number that /[[:ascii:]]/i matched non-Ascii characters, such as the Kelvin sign, unlike /\p{ascii}/i. --- pod/perldelta.pod | 7 + regcomp.c | 453 +++++++++++++++++++++++++++++++--------------- t/re/re_tests | 3 +- utf8.c | 15 +- 4 files changed, 332 insertions(+), 146 deletions(-) diff --git a/pod/perldelta.pod b/pod/perldelta.pod index cf38e2fbd8..1345581fa3 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -629,6 +629,13 @@ with "\n". This has been fixed [perl #109206]. =item * +C and C now match identically (when not +under a differing locale). This fixes a regression introduced in 5.14 +in which the first expression could match characters outside of ASCII, +such as the KELVIN SIGN. + +=item * + Method calls whose arguments were all surrounded with C or C (as in C<<$object->method(my($a,$b)) >>) used to force lvalue context on the subroutine. This would prevent lvalue methods from returning certain diff --git a/regcomp.c b/regcomp.c index f55b361881..157e06ed1c 100644 --- a/regcomp.c +++ b/regcomp.c @@ -10029,91 +10029,132 @@ S_checkposixcc(pTHX_ RExC_state_t *pRExC_state) } } -/* No locale test, and always Unicode semantics, no ignore-case differences */ -#define _C_C_T_NOLOC_(NAME,TEST,WORD) \ -ANYOF_##NAME: \ - for (value = 0; value < 256; value++) \ - if (TEST) \ - stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \ - yesno = '+'; \ - what = WORD; \ - break; \ -case ANYOF_N##NAME: \ - for (value = 0; value < 256; value++) \ - if (!TEST) \ - stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \ - yesno = '!'; \ - what = WORD; \ - break +/* Generate the code to add a full posix character to the bracketed + * character class given by . ( is needed only under locale rules) + * destlist is the inversion list for non-locale rules that this class is + * to be added to + * sourcelist is the ASCII-range inversion list to add under /a rules + * Xsourcelist is the full Unicode range list to use otherwise. */ +#define DO_POSIX(node, class, destlist, sourcelist, Xsourcelist) \ + if (LOC) { \ + SV* scratch_list = NULL; \ + \ + /* Set this class in the node for runtime matching */ \ + ANYOF_CLASS_SET(node, class); \ + \ + /* For above Latin1 code points, we use the full Unicode range */ \ + _invlist_intersection(PL_AboveLatin1, \ + Xsourcelist, \ + &scratch_list); \ + /* And set the output to it, adding instead if there already is an \ + * output. Checking if is NULL first saves an extra \ + * clone. Its reference count will be decremented at the next \ + * union, etc, or if this is the only instance, at the end of the \ + * routine */ \ + if (! destlist) { \ + destlist = scratch_list; \ + } \ + else { \ + _invlist_union(destlist, scratch_list, &destlist); \ + SvREFCNT_dec(scratch_list); \ + } \ + } \ + else { \ + /* For non-locale, just add it to any existing list */ \ + _invlist_union(destlist, \ + (AT_LEAST_ASCII_RESTRICTED) \ + ? sourcelist \ + : Xsourcelist, \ + &destlist); \ + } -/* Like the above, but there are differences if we are in uni-8-bit or not, so - * there are two tests passed in, to use depending on that. There aren't any - * cases where the label is different from the name, so no need for that - * parameter. - * Sets 'what' to WORD which is the property name for non-bitmap code points; - * But, uses FOLD_WORD instead if /i has been selected, to allow a different - * property name */ -#define _C_C_T_(NAME, TEST_8, TEST_7, WORD, FOLD_WORD) \ -ANYOF_##NAME: \ - if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME); \ - else if (UNI_SEMANTICS) { \ - for (value = 0; value < 256; value++) { \ - if (TEST_8(value)) stored += \ - set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \ - } \ - } \ - else { \ - for (value = 0; value < 128; value++) { \ - if (TEST_7(UNI_TO_NATIVE(value))) stored += \ - set_regclass_bit(pRExC_state, ret, \ - (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \ - } \ - } \ - yesno = '+'; \ - if (FOLD) { \ - what = FOLD_WORD; \ - } \ - else { \ - what = WORD; \ - } \ - break; \ -case ANYOF_N##NAME: \ - if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \ - else if (UNI_SEMANTICS) { \ - for (value = 0; value < 256; value++) { \ - if (! TEST_8(value)) stored += \ - set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \ - } \ - } \ - else { \ - for (value = 0; value < 128; value++) { \ - if (! TEST_7(UNI_TO_NATIVE(value))) stored += set_regclass_bit( \ - pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \ - } \ - if (AT_LEAST_ASCII_RESTRICTED) { \ - for (value = 128; value < 256; value++) { \ - stored += set_regclass_bit( \ - pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \ - } \ - ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL; \ - } \ - else { \ - /* For a non-ut8 target string with DEPENDS semantics, all above \ - * ASCII Latin1 code points match the complement of any of the \ - * classes. But in utf8, they have their Unicode semantics, so \ - * can't just set them in the bitmap, or else regexec.c will think \ - * they matched when they shouldn't. */ \ - ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL; \ - } \ - } \ - yesno = '!'; \ - if (FOLD) { \ - what = FOLD_WORD; \ - } \ - else { \ - what = WORD; \ - } \ - break +/* Like DO_POSIX, but matches the complement of and . + */ +#define DO_N_POSIX(node, class, destlist, sourcelist, Xsourcelist) \ + if (LOC) { \ + SV* scratch_list = NULL; \ + ANYOF_CLASS_SET(node, class); \ + _invlist_subtract(PL_AboveLatin1, Xsourcelist, &scratch_list); \ + if (! destlist) { \ + destlist = scratch_list; \ + } \ + else { \ + _invlist_union(destlist, scratch_list, &destlist); \ + SvREFCNT_dec(scratch_list); \ + } \ + } \ + else { \ + _invlist_union_complement_2nd(destlist, \ + (AT_LEAST_ASCII_RESTRICTED) \ + ? sourcelist \ + : Xsourcelist, \ + &destlist); \ + /* Under /d, everything in the upper half of the Latin1 range \ + * matches this complement */ \ + if (DEPENDS_SEMANTICS) { \ + ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \ + } \ + } + +/* Generate the code to add a posix character to the bracketed + * character class given by . ( is needed only under locale rules) + * destlist is the inversion list for non-locale rules that this class is + * to be added to + * sourcelist is the ASCII-range inversion list to add under /a rules + * l1_sourcelist is the Latin1 range list to use otherwise. + * Xpropertyname is the name to add to of the property to + * specify the code points above Latin1 that will have to be + * determined at run-time + * run_time_list is a SV* that contains text names of properties that are to + * be computed at run time. This concatenates + * to it, apppropriately + * This is essentially DO_POSIX, but we know only the Latin1 values at compile + * time */ +#define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \ + l1_sourcelist, Xpropertyname, run_time_list) \ + /* If not /a matching, there are going to be code points we will have \ + * to defer to runtime to look-up */ \ + if (! AT_LEAST_ASCII_RESTRICTED) { \ + Perl_sv_catpvf(aTHX_ run_time_list, "+utf8::%s\n", Xpropertyname); \ + } \ + if (LOC) { \ + ANYOF_CLASS_SET(node, class); \ + } \ + else { \ + _invlist_union(destlist, \ + (AT_LEAST_ASCII_RESTRICTED) \ + ? sourcelist \ + : l1_sourcelist, \ + &destlist); \ + } + +/* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement. A combination of + * this and DO_N_POSIX */ +#define DO_N_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \ + l1_sourcelist, Xpropertyname, run_time_list) \ + if (AT_LEAST_ASCII_RESTRICTED) { \ + _invlist_union_complement_2nd(destlist, sourcelist, &destlist); \ + } \ + else { \ + Perl_sv_catpvf(aTHX_ run_time_list, "!utf8::%s\n", Xpropertyname); \ + if (LOC) { \ + ANYOF_CLASS_SET(node, namedclass); \ + } \ + else { \ + SV* scratch_list = NULL; \ + _invlist_subtract(PL_Latin1, l1_sourcelist, &scratch_list); \ + if (! destlist) { \ + destlist = scratch_list; \ + } \ + else { \ + _invlist_union(destlist, scratch_list, &destlist); \ + SvREFCNT_dec(scratch_list); \ + } \ + if (DEPENDS_SEMANTICS) { \ + ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \ + } \ + } \ + } STATIC U8 S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr) @@ -10751,77 +10792,201 @@ parseit: * A similar issue a little earlier when switching on value. * --jhi */ switch ((I32)namedclass) { - - case _C_C_T_(ALNUMC, isALNUMC_L1, isALNUMC, "XPosixAlnum", "XPosixAlnum"); - case _C_C_T_(ALPHA, isALPHA_L1, isALPHA, "XPosixAlpha", "XPosixAlpha"); - case _C_C_T_(BLANK, isBLANK_L1, isBLANK, "XPosixBlank", "XPosixBlank"); - case _C_C_T_(CNTRL, isCNTRL_L1, isCNTRL, "XPosixCntrl", "XPosixCntrl"); - case _C_C_T_(GRAPH, isGRAPH_L1, isGRAPH, "XPosixGraph", "XPosixGraph"); - case _C_C_T_(LOWER, isLOWER_L1, isLOWER, "XPosixLower", "__XPosixLower_i"); - case _C_C_T_(PRINT, isPRINT_L1, isPRINT, "XPosixPrint", "XPosixPrint"); - case _C_C_T_(PSXSPC, isPSXSPC_L1, isPSXSPC, "XPosixSpace", "XPosixSpace"); - case _C_C_T_(PUNCT, isPUNCT_L1, isPUNCT, "XPosixPunct", "XPosixPunct"); - case _C_C_T_(UPPER, isUPPER_L1, isUPPER, "XPosixUpper", "__XPosixUpper_i"); - /* \s, \w match all unicode if utf8. */ - case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "SpacePerl", "SpacePerl"); - case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "Word", "Word"); - case _C_C_T_(XDIGIT, isXDIGIT_L1, isXDIGIT, "XPosixXDigit", "XPosixXDigit"); - case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace"); - case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace"); + + case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */ + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv); + break; + case ANYOF_NALNUMC: + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv); + break; + case ANYOF_ALPHA: + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv); + break; + case ANYOF_NALPHA: + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv); + break; case ANYOF_ASCII: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_ASCII); - else { - for (value = 0; value < 128; value++) - stored += - set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); + if (LOC) { + ANYOF_CLASS_SET(ret, namedclass); } - yesno = '+'; - what = NULL; /* Doesn't match outside ascii, so - don't want to add +utf8:: */ + else { + _invlist_union(properties, PL_ASCII, &properties); + } break; case ANYOF_NASCII: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_NASCII); - else { - for (value = 128; value < 256; value++) - stored += - set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); + if (LOC) { + ANYOF_CLASS_SET(ret, namedclass); } - ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL; - yesno = '!'; - what = "ASCII"; - break; + else { + _invlist_union_complement_2nd(properties, + PL_ASCII, &properties); + if (DEPENDS_SEMANTICS) { + ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL; + } + } + break; + case ANYOF_BLANK: + DO_POSIX(ret, namedclass, properties, + PL_PosixBlank, PL_XPosixBlank); + break; + case ANYOF_NBLANK: + DO_N_POSIX(ret, namedclass, properties, + PL_PosixBlank, PL_XPosixBlank); + break; + case ANYOF_CNTRL: + DO_POSIX(ret, namedclass, properties, + PL_PosixCntrl, PL_XPosixCntrl); + break; + case ANYOF_NCNTRL: + DO_N_POSIX(ret, namedclass, properties, + PL_PosixCntrl, PL_XPosixCntrl); + break; case ANYOF_DIGIT: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_DIGIT); - else { - /* consecutive digits assumed */ - for (value = '0'; value <= '9'; value++) - stored += - set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); - } - yesno = '+'; - what = "Digit"; + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv); break; case ANYOF_NDIGIT: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_NDIGIT); + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv); + break; + case ANYOF_GRAPH: + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv); + break; + case ANYOF_NGRAPH: + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv); + break; + case ANYOF_HORIZWS: + /* For these, we use the nonbitmap, as /d doesn't make a + * difference in what these match. There would be problems + * if these characters had folds other than themselves, as + * nonbitmap is subject to folding */ + _invlist_union(nonbitmap, PL_HorizSpace, &nonbitmap); + break; + case ANYOF_NHORIZWS: + _invlist_union_complement_2nd(nonbitmap, + PL_HorizSpace, &nonbitmap); + break; + case ANYOF_LOWER: + case ANYOF_NLOWER: + { /* These require special handling, as they differ under + folding, matching the corresponding Alpha property */ + + SV* ascii_source; + SV* l1_source; + const char *Xname; + + if (FOLD && ! LOC) { + ascii_source = PL_PosixAlpha; + l1_source = PL_L1PosixAlpha; + Xname = "__XposixAlpha_i"; + } else { - /* consecutive digits assumed */ - for (value = 0; value < '0'; value++) - stored += - set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); - for (value = '9' + 1; value < 256; value++) - stored += - set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); + ascii_source = PL_PosixLower; + l1_source = PL_L1PosixLower; + Xname = "XPosixLower"; } - yesno = '!'; - what = "Digit"; - if (AT_LEAST_ASCII_RESTRICTED ) { - ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL; + if (namedclass == ANYOF_LOWER) { + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + ascii_source, l1_source, Xname, listsv); } - break; + else { + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, + properties, ascii_source, l1_source, Xname, listsv); + } + break; + } + case ANYOF_PRINT: + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv); + break; + case ANYOF_NPRINT: + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv); + break; + case ANYOF_PUNCT: + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv); + break; + case ANYOF_NPUNCT: + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv); + break; + case ANYOF_PSXSPC: + DO_POSIX(ret, namedclass, properties, + PL_PosixSpace, PL_XPosixSpace); + break; + case ANYOF_NPSXSPC: + DO_N_POSIX(ret, namedclass, properties, + PL_PosixSpace, PL_XPosixSpace); + break; + case ANYOF_SPACE: + DO_POSIX(ret, namedclass, properties, + PL_PerlSpace, PL_XPerlSpace); + break; + case ANYOF_NSPACE: + DO_N_POSIX(ret, namedclass, properties, + PL_PerlSpace, PL_XPerlSpace); + break; + case ANYOF_UPPER: /* Same as LOWER, above */ + case ANYOF_NUPPER: + { + SV* ascii_source; + SV* l1_source; + const char *Xname; + + if (FOLD && ! LOC) { + ascii_source = PL_PosixAlpha; + l1_source = PL_L1PosixAlpha; + Xname = "__XposixAlpha_i"; + } + else { + ascii_source = PL_PosixUpper; + l1_source = PL_L1PosixUpper; + Xname = "XPosixUpper"; + } + if (namedclass == ANYOF_UPPER) { + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + ascii_source, l1_source, Xname, listsv); + } + else { + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, + properties, ascii_source, l1_source, Xname, listsv); + } + break; + } + case ANYOF_ALNUM: /* Really is 'Word' */ + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv); + break; + case ANYOF_NALNUM: + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv); + break; + case ANYOF_VERTWS: + /* For these, we use the nonbitmap, as /d doesn't make a + * difference in what these match. There would be problems + * if these characters had folds other than themselves, as + * nonbitmap is subject to folding */ + _invlist_union(nonbitmap, PL_VertSpace, &nonbitmap); + break; + case ANYOF_NVERTWS: + _invlist_union_complement_2nd(nonbitmap, + PL_VertSpace, &nonbitmap); + break; + case ANYOF_XDIGIT: + DO_POSIX(ret, namedclass, properties, + PL_PosixXDigit, PL_XPosixXDigit); + break; + case ANYOF_NXDIGIT: + DO_N_POSIX(ret, namedclass, properties, + PL_PosixXDigit, PL_XPosixXDigit); + break; case ANYOF_MAX: /* this is to handle \p and \P */ break; diff --git a/t/re/re_tests b/t/re/re_tests index e37cfb1cc5..2bdb24cdb8 100644 --- a/t/re/re_tests +++ b/t/re/re_tests @@ -1565,5 +1565,6 @@ abc\N{def - c - \\N{NAME} must be resolved by the lexer /sst/ui s\N{LATIN SMALL LIGATURE LONG S T} y $& s\N{LATIN SMALL LIGATURE LONG S T} # /i properties shouldn't match more than the property itself -/[[:ascii:]]/i \N{KELVIN SIGN} Tn - - +/[[:ascii:]]/i \N{KELVIN SIGN} n - - + # vim: softtabstop=0 noexpandtab diff --git a/utf8.c b/utf8.c index e61f5746ab..b9efb366b0 100644 --- a/utf8.c +++ b/utf8.c @@ -1514,7 +1514,9 @@ Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, const bool flags) return CALL_FOLD_CASE(p, p, lenp, flags); } -/* for now these all assume no locale info available for Unicode > 255 */ +/* for now these all assume no locale info available for Unicode > 255; and + * the corresponding macros in handy.h (like isALNUM_LC_uvchr) should have been + * called instead, so that these don't get called for < 255 */ bool Perl_is_uni_alnum_lc(pTHX_ UV c) @@ -1628,6 +1630,17 @@ static bool S_is_utf8_common(pTHX_ const U8 *const p, SV **swash, const char *const swashname) { + /* returns a boolean giving whether or not the UTF8-encoded character that + * starts at

is in the swash indicated by . + * contains a pointer to where the swash indicated by + * is to be stored; which this routine will do, so that future calls will + * look at <*swash> and only generate a swash if it is not null + * + * Note that it is assumed that the buffer length of

is enough to + * contain all the bytes that comprise the character. Thus, <*p> should + * have been checked before this call for mal-formedness enough to assure + * that. */ + dVAR; PERL_ARGS_ASSERT_IS_UTF8_COMMON;