utf8.c: Remove unnecessary validation

These two functions are to be called only on strings known to be valid, so we can skip the validation.
2026-01-26 16:39:36 +00:00 · 2012-04-28 18:48:52 -06:00 · 2012-04-28 18:48:52 -06:00 · 3986bb7cb3
commit 3986bb7cb3
parent 979f77b669
1 changed files with 31 additions and 8 deletions
--- a/utf8.c
+++ b/utf8.c
@ -915,16 +915,17 @@ Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
 }

 /* Like L</utf8_to_uvchr_buf>(), but should only be called when it is known that
- * there are no malformations in the input UTF-8 string C<s>.  Currently, some
- * malformations are checked for, but this checking likely will be removed in
- * the future */
+ * there are no malformations in the input UTF-8 string C<s>.  surrogates,
+ * non-character code points, and non-Unicode code points are allowed */

 UV
 Perl_valid_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
 {
+    const UV uv = valid_utf8_to_uvuni(s, retlen);
+
    PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;

-    return utf8_to_uvchr_buf(s, s + UTF8_MAXBYTES, retlen);
+    return UNI_TO_NATIVE(uv);
 }

 /*
@ -993,16 +994,38 @@ Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
 }

 /* Like L</utf8_to_uvuni_buf>(), but should only be called when it is known that
- * there are no malformations in the input UTF-8 string C<s>.  Currently, some
- * malformations are checked for, but this checking likely will be removed in
- * the future */
+ * there are no malformations in the input UTF-8 string C<s>.  surrogates,
+ * non-character code points, and non-Unicode code points are allowed */

 UV
 Perl_valid_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
 {
+    UV expectlen = UTF8SKIP(s);
+    const U8* send = s + expectlen;
+    UV uv = NATIVE_TO_UTF(*s);
+
    PERL_ARGS_ASSERT_VALID_UTF8_TO_UVUNI;

-    return utf8_to_uvuni_buf(s, s + UTF8_MAXBYTES, retlen);
+    if (retlen) {
+	*retlen = expectlen;
+    }
+
+    /* An invariant is trivially returned */
+    if (expectlen == 1) {
+	return uv;
+    }
+
+    /* Remove the leading bits that indicate the number of bytes, leaving just
+     * the bits that are part of the value */
+    uv &= UTF_START_MASK(expectlen);
+
+    /* Now, loop through the remaining bytes, accumulating each into the
+     * working total as we go */
+    for (++s; s < send; s++) {
+	uv = UTF8_ACCUMULATE(uv, *s);
+    }
+
+    return uv;
 }

 /*