toke.c: Avoid UTF8SKIPs

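The isID(FIRST|CONT)_lazy_if_safe macros now return the number of bytes to advance past the character they matched (what the callers previously computed with UTF8SKIP). Use that return value instead of re-deriving it.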
2026-01-26 08:38:23 +00:00 · 2025-09-08 10:13:58 -06:00 · 2025-09-08 10:13:58 -06:00 · 5ea209f5a7
commit 5ea209f5a7
parent 08c5ab5861
1 changed files with 51 additions and 30 deletions
--- a/toke.c
+++ b/toke.c
@ -2088,8 +2088,10 @@ S_check_uni(pTHX)
    while (isSPACE(*PL_last_uni))
        PL_last_uni++;
    s = PL_last_uni;
-    while (isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF) || *s == '-')
-        s += UTF ? UTF8SKIP(s) : 1;
+    Size_t advance;
+    while (   (advance = isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF))
+           || (advance = (*s == '-')))
+        s += advance;
    if (s < PL_bufptr && memchr(s, '(', PL_bufptr - s))
        return;

@ -5193,10 +5195,11 @@ S_check_scalar_slice(pTHX_ char *s)
    {
        return;
    }
-    while (    isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF)
-           || (*s && memCHRs(" \t$#+-'\"", *s)))
+    Size_t advance;
+    while (   (advance = isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF))
+           || (advance = (*s && memCHRs(" \t$#+-'\"", *s))))
    {
-        s += UTF ? UTF8SKIP(s) : 1;
+        s += advance;
    }
    if (*s == '}' || *s == ']')
        pl_yylval.ival = OPpSLICEWARNING;
@ -5402,8 +5405,11 @@ yyl_dollar(pTHX_ char *s)
                            while (t < PL_bufend && *t == ' ') t++;

                            /* strip off the name of the var */
-                            while (isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF))
-                                t += UTF ? UTF8SKIP(t) : 1;
+                            Size_t advance;
+                            while ((advance = (isWORDCHAR_lazy_if_safe(t,
+                                                                    PL_bufend,
+                                                                    UTF))))
+                                t += advance;
                            /* consumed a varname */
                        } else if (isDIGIT(*t)) {
                            /* deal with hex constants like 0x11 */
@ -6407,6 +6413,7 @@ yyl_leftcurly(pTHX_ char *s, const U8 formbrack)
             * GSAR 97-07-21
             */
            t = s;
+            Size_t advance;
            if (*s == '\'' || *s == '"' || *s == '`') {
                /* common case: get past first string, handling escapes */
                for (t++; t < PL_bufend && *t != *s;)
@ -6455,20 +6462,24 @@ yyl_leftcurly(pTHX_ char *s, const U8 formbrack)
                    }
                    t++;
                }
-                else
+                else {
                    /* skip plain q word */
-                    while (   t < PL_bufend
-                           && isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF))
+                    while (   t < PL_bufend 
+                           && (advance = isWORDCHAR_lazy_if_safe(t,
+                                                                 PL_bufend,
+                                                                 UTF)))
                    {
-                        t += UTF ? UTF8SKIP(t) : 1;
+                        t += advance;
                    }
+                }
            }
-            else if (isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF)) {
-                t += UTF ? UTF8SKIP(t) : 1;
+            else if ((advance = isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF))) {
+                t += advance;
                while (   t < PL_bufend
-                       && isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF))
+                       && (advance = isWORDCHAR_lazy_if_safe(t, PL_bufend,
+                                                             UTF)))
                {
-                    t += UTF ? UTF8SKIP(t) : 1;
+                    t += advance;
                }
            }
            while (t < PL_bufend && isSPACE(*t))
@ -10125,11 +10136,12 @@ S_checkcomma(pTHX_ const char *s, const char *name, const char *what)
        s++;
    while (s < PL_bufend && isSPACE(*s))
        s++;
-    if (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)) {
+    Size_t advance;
+    if ((advance = isIDFIRST_lazy_if_safe(s, PL_bufend, UTF))) {
        const char * const w = s;
-        s += UTF ? UTF8SKIP(s) : 1;
-        while (isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF))
-            s += UTF ? UTF8SKIP(s) : 1;
+        s += advance;
+        while ((advance = isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF)))
+            s += advance;
        while (s < PL_bufend && isSPACE(*s))
            s++;
        if (*s == ',') {
@ -10281,15 +10293,18 @@ S_parse_ident(pTHX_ char **s, char **d, char * const e, int allow_package,
    while (*s < PL_bufend) {
        if (*d >= e)
            croak("%s", ident_too_long);
-        if (is_utf8 && isIDFIRST_utf8_safe(*s, PL_bufend)) {
+        Size_t advance;
+        if (is_utf8 && (advance = isIDFIRST_utf8_safe(*s, PL_bufend))) {
             /* The UTF-8 case must come first, otherwise things
             * like c\N{COMBINING TILDE} would start failing, as the
             * isWORDCHAR_A case below would gobble the 'c' up.
             */

-            char *t = *s + UTF8SKIP(*s);
-            while (isIDCONT_utf8_safe((const U8*) t, (const U8*) PL_bufend)) {
-                t += UTF8SKIP(t);
+            char *t = *s + advance;
+            while ((advance = isIDCONT_utf8_safe((const U8*) t,
+                                                 (const U8*) PL_bufend)))
+            {
+                t += advance;
            }
            if (*d + (t - *s) > e)
                croak("%s", ident_too_long);
@ -10496,11 +10511,12 @@ S_scan_ident(pTHX_ char *s, char *dest, STRLEN destlen, I32 ck_uni)
            /* note we have to check for a normal identifier first,
             * as it handles utf8 symbols, and only after that has
             * been ruled out can we look at the caret words */
-            if (isIDFIRST_lazy_if_safe(d, e, is_utf8) ) {
+            Size_t advance;
+            if ((advance = isIDFIRST_lazy_if_safe(d, e, is_utf8) )) {
                /* if it starts as a valid identifier, assume that it is one.
                   (the later check for } being at the expected point will trap
                   cases where this doesn't pan out.)  */
-                d += is_utf8 ? UTF8SKIP(d) : 1;
+                d += advance;
                parse_ident(&s, &d, e, 1, is_utf8, TRUE);
                *d = '\0';
            }
@ -10998,8 +11014,9 @@ S_scan_heredoc(pTHX_ char *s)

        peek = s;

-        while (isWORDCHAR_lazy_if_safe(peek, PL_bufend, UTF)) {
-            peek += UTF ? UTF8SKIP(peek) : 1;
+        Size_t advance;
+        while ((advance = isWORDCHAR_lazy_if_safe(peek, PL_bufend, UTF))) {
+            peek += advance;
        }

        len = (peek - s >= e - d) ? (e - d) : (peek - s);
@ -11442,9 +11459,13 @@ S_scan_inputsymbol(pTHX_ char *start)
    if (*d == '$' && d[1]) d++;

    /* allow <Pkg'VALUE> or <Pkg::VALUE> */
-    while (isWORDCHAR_lazy_if_safe(d, e, UTF) || *d == ':'
-           || (*d == '\'' && FEATURE_APOS_AS_NAME_SEP_IS_ENABLED)) {
-        d += UTF ? UTF8SKIP(d) : 1;
+    Size_t advance;
+    while (   (advance = isWORDCHAR_lazy_if_safe(d, e, UTF))
+           || (advance = (   *d == ':'
+                          || (   *d == '\''
+                              && FEATURE_APOS_AS_NAME_SEP_IS_ENABLED))))
+    {
+        d += advance;
    }

    /* If we've tried to read what we allow filehandles to look like, and