Add valid_utf8_to_uv()

This is identical to valid_utf8_to_uvchr(). They are both internal
functions designed for when you are certain that the UTF-8 string to be
translated is well-formed; generally you created it yourself earlier.

The only reason for this new synonym is to lessen the cognitive load on
programmers who should be using the "_uv" suffix functions, and not the
"_uvchr" suffix ones for these sorts of tasks. By having this synonym,
one doesn't have to learn that there are two different suffixes.
This commit is contained in:
Karl Williamson 2025-08-21 13:47:39 -06:00
parent 738383d65e
commit 8543a7ac33
6 changed files with 36 additions and 9 deletions

View File

@ -3906,7 +3906,10 @@ Adp |bool |valid_identifier_pvn \
|U32 flags
Adp |bool |valid_identifier_sv \
|NULLOK SV *sv
CRTdip |UV |valid_utf8_to_uvchr \
CRTdip |UV |valid_utf8_to_uv \
|NN const U8 *s \
|NULLOK STRLEN *retlen
CRTdmp |UV |valid_utf8_to_uvchr \
|NN const U8 *s \
|NULLOK STRLEN *retlen
Adp |int |vcmp |NN SV *lhv \

View File

@ -841,7 +841,8 @@
# define valid_identifier_pve(a,b,c) Perl_valid_identifier_pve(aTHX_ a,b,c)
# define valid_identifier_pvn(a,b,c) Perl_valid_identifier_pvn(aTHX_ a,b,c)
# define valid_identifier_sv(a) Perl_valid_identifier_sv(aTHX_ a)
# define valid_utf8_to_uvchr Perl_valid_utf8_to_uvchr
# define valid_utf8_to_uv Perl_valid_utf8_to_uv
# define Perl_valid_utf8_to_uvchr valid_utf8_to_uvchr
# define vcmp(a,b) Perl_vcmp(aTHX_ a,b)
# define vcroak(a,b) Perl_vcroak(aTHX_ a,b)
# define vdeb(a,b) Perl_vdeb(aTHX_ a,b)

View File

@ -1306,25 +1306,36 @@ Perl_utf8_to_bytes_overwrite(pTHX_ U8 **s_ptr, STRLEN *lenp)
}
/*
=for apidoc valid_utf8_to_uvchr
Like C<L<perlapi/utf8_to_uv>>, but should only be called when it is
=for apidoc valid_utf8_to_uv
=for apidoc_item valid_utf8_to_uvchr
These are synonymous.
These are like C<L<perlapi/utf8_to_uv>>, but should only be called when it is
known that the next character in the input UTF-8 string C<s> is well-formed
(I<e.g.>, it passes C<L<perlapi/isUTF8_CHAR>>). Surrogates, non-character code
points, and non-Unicode code points are allowed.
The only use for these is that they should run slightly faster than
C<utf8_to_uv> because no error checking is done.
The C<_uv> form is slightly preferred so as to have a consistent spelling with
the other C<_uv> forms that are definitely preferred over the older and
problematic C<_uvchr> forms.
=cut
*/
PERL_STATIC_INLINE UV
Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen)
Perl_valid_utf8_to_uv(const U8 *s, STRLEN *retlen)
{
PERL_ARGS_ASSERT_VALID_UTF8_TO_UV;
const UV expectlen = UTF8SKIP(s);
const U8* send = s + expectlen;
UV uv = *s;
PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
if (retlen) {
*retlen = expectlen;
}

View File

@ -350,6 +350,13 @@ well.
XXX
=item *
A new function C<valid_utf8_to_uv> has been added. This is synonymous
with C<valid_utf8_to_uvchr>; its reason for existence is to have
consistent spelling with the names of the other functions that translate
from UTF-8, so you don't have to remember a different spelling.
=back
=head1 Selected Bug Fixes

View File

@ -5428,6 +5428,10 @@ PERL_CALLCONV bool
Perl_valid_identifier_sv(pTHX_ SV *sv);
#define PERL_ARGS_ASSERT_VALID_IDENTIFIER_SV
/* PERL_CALLCONV UV
Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen)
__attribute__warn_unused_result__; */
#define PERL_ARGS_ASSERT_VALIDATE_PROTO \
assert(name)
@ -10298,9 +10302,9 @@ Perl_uv_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags);
assert(d)
PERL_STATIC_INLINE UV
Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen)
Perl_valid_utf8_to_uv(const U8 *s, STRLEN *retlen)
__attribute__warn_unused_result__;
# define PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR \
# define PERL_ARGS_ASSERT_VALID_UTF8_TO_UV \
assert(s)
PERL_STATIC_INLINE void

1
utf8.h
View File

@ -191,6 +191,7 @@ For details, see the description for L<perlapi/uv_to_utf8_flags>.
#define c9strict_utf8_to_uv(s, e, cp_p, advance_p) \
utf8_to_uv_flags( s, e, cp_p, advance_p, \
UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
#define valid_utf8_to_uvchr(s, advance_p) valid_utf8_to_uv(s, advance_p)
#define utf16_to_utf8(p, d, bytelen, newlen) \
utf16_to_utf8_base(p, d, bytelen, newlen, 0, 1)