From 9c2eed89a71ad4aa624f782ac01805da8f75cff0 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Mon, 2 Dec 2024 18:07:03 -0700 Subject: [PATCH] Document uv_to_utf8_family --- pod/perldelta.pod | 11 +++++++++++ utf8.c | 26 +++++++++++++++++--------- 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/pod/perldelta.pod b/pod/perldelta.pod index bd202557c4..1dcc264c90 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -414,6 +414,11 @@ L<perlapi/C<utf8_to_uv>> replaces L<perlapi/C<utf8_to_uvchr>> (which is retained for backwards compatibility), but you should convert to use the new form, as likely you aren't using the old one safely. +To convert in the opposite direction, you can now use +L<perlapi/C<uv_to_utf8>>. This is not a new function, but a new synonym +for L<perlapi/C<uvchr_to_utf8>>. It is added so you don't have to learn +two sets of names. + There are also two new functions, L<perlapi/C<strict_utf8_to_uv>> and L<perlapi/C<c9strict_utf8_to_uv>> which do the same thing except when the input string represents a code point that Unicode doesn't accept as @@ -440,6 +445,12 @@ L<perlapi/C<utf8_to_uv_errors>> replaces L<perlapi/C<utf8n_to_uvchr_error>>. L<perlapi/C<utf8_to_uv_msgs>> replaces L<perlapi/C<utf8n_to_uvchr_msgs>>. +Also added are the inverse functions L<perlapi/C<uv_to_utf8_flags>> +and L<perlapi/C<uv_to_utf8_msgs>>, which are synonyms for the existing +functions, L<perlapi/C<uvchr_to_utf8_flags>> and +L<perlapi/C<uvchr_to_utf8_flags_msgs>> respectively. These are provided only +so you don't have to learn two sets of names. + =item * Three new API functions are introduced to convert strings encoded in diff --git a/utf8.c b/utf8.c index 74eea0c060..50c5accfa6 100644 --- a/utf8.c +++ b/utf8.c @@ -121,14 +121,14 @@ S_new_msg_hv(pTHX_ const char * const message, /* The message text */ =for apidoc uvoffuni_to_utf8_flags THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES. -Instead, B<Almost all code should use L<perlapi/uvchr_to_utf8> or -L<perlapi/uvchr_to_utf8_flags>>. +Instead, B<Almost all code should use L<perlapi/uv_to_utf8> or +L<perlapi/uv_to_utf8_flags>>. This function is like them, but the input is a strict Unicode (as opposed to native) code point. Only in very rare circumstances should code not be using the native code point. -For details, see the description for L<perlapi/uvchr_to_utf8_flags>. +For details, see the description for L<perlapi/uv_to_utf8_flags>. 
=cut */ @@ -155,9 +155,11 @@ const char super_cp_format[] = "Code point 0x%" UVXf " is not Unicode," #define MASK UTF_CONTINUATION_MASK /* -=for apidoc uvchr_to_utf8_flags_msgs +=for apidoc uv_to_utf8_msgs +=for apidoc_item uvchr_to_utf8_flags_msgs -THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES. +These functions are identical. THEY SHOULD BE USED IN ONLY VERY SPECIALIZED +CIRCUMSTANCES. Most code should use C<L</uvchr_to_utf8_flags>()> rather than call this directly. @@ -367,7 +369,9 @@ Perl_uvoffuni_to_utf8_flags_msgs(pTHX_ U8 *d, UV input_uv, UV flags, HV** msgs) } /* -=for apidoc uvchr_to_utf8 +=for apidoc uv_to_utf8 +=for apidoc_item uv_to_utf8_flags +=for apidoc_item uvchr_to_utf8 =for apidoc_item uvchr_to_utf8_flags These each add the UTF-8 representation of the native code point C<uv> to the @@ -375,18 +379,22 @@ end of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to the byte after the end of the new character. In other words, - d = uvchr_to_utf8(d, uv); + d = uv_to_utf8(d, uv); This is the Unicode-aware way of saying *(d++) = uv; -C<uvchr_to_utf8_flags> is used to make some classes of code points problematic in some way. -C<uvchr_to_utf8> is effectively the same as calling C<uvchr_to_utf8_flags> +(C<uvchr_to_utf8> is a synonym for C<uv_to_utf8>.) + +C<uv_to_utf8_flags> is used to make some classes of code points problematic in +some way. C<uv_to_utf8> is effectively the same as calling C<uv_to_utf8_flags> with C<flags> set to 0, meaning no class of code point is considered problematic. That means any input code point from 0..C<IV_MAX> is considered to be fine. C<IV_MAX> is typically 0x7FFF_FFFF in a 32-bit word. +(C<uvchr_to_utf8_flags> is a synonym for C<uv_to_utf8_flags>). + A code point can be problematic in one of two ways. Its use could just raise a warning, and/or it could be forbidden with the function failing, and returning NULL.