mirror of
https://github.com/Perl/perl5.git
synced 2026-01-27 01:44:43 +00:00
Fixup POSIX::mbtowc, wctomb
This commit enhances these functions so that on threaded perls, they use mbrtowc and wcrtomb when available, making them thread safe. The substitution isn't completely transparent, as no effort is made to hide any differences in errno setting upon error. And there may be slight differences in edge case behavior on some platforms. This commit also changes the behaviors so that they take a scalar parameter instead of a char *, and this might be 'undef' or not be forceable into a valid PV. If not a PV, the functions initialize the shift state. Previously the shift state was always reinitialized with every call, which meant these could not work on locales with shift states. In addition, there were several issues in mbtowc and wctomb that this commit fixes. mbtowc and wctomb, when used, are now run with a semaphore. This avoids races if called at the same time in another thread. The returned wide character from mbtowc() could well have been garbage. The final parameter to mbtowc is now optional, as passing an SV allows us to determine the length without the need for an extra parameter. It is now used only to restrict the parsing of the string to shorter than the actual length. wctomb would segfault if the string parameter was shared or hadn't been pre-allocated with a string of sufficient length to hold the result.
This commit is contained in:
parent
63bebc1439
commit
5a6637f01a
@ -204,6 +204,7 @@
|
||||
#define PL_max_intro_pending (vTHX->Imax_intro_pending)
|
||||
#define PL_maxsysfd (vTHX->Imaxsysfd)
|
||||
#define PL_mbrlen_ps (vTHX->Imbrlen_ps)
|
||||
#define PL_mbrtowc_ps (vTHX->Imbrtowc_ps)
|
||||
#define PL_memory_debug_header (vTHX->Imemory_debug_header)
|
||||
#define PL_mess_sv (vTHX->Imess_sv)
|
||||
#define PL_min_intro_pending (vTHX->Imin_intro_pending)
|
||||
@ -370,6 +371,7 @@
|
||||
#define PL_warnhook (vTHX->Iwarnhook)
|
||||
#define PL_watchaddr (vTHX->Iwatchaddr)
|
||||
#define PL_watchok (vTHX->Iwatchok)
|
||||
#define PL_wcrtomb_ps (vTHX->Iwcrtomb_ps)
|
||||
#define PL_xsubfilename (vTHX->Ixsubfilename)
|
||||
|
||||
#endif /* MULTIPLICITY */
|
||||
|
||||
@ -1545,7 +1545,7 @@ END_EXTERN_C
|
||||
#if ! defined(HAS_MBLEN) && ! defined(HAS_MBRLEN)
|
||||
#define mblen(a,b) not_here("mblen")
|
||||
#endif
|
||||
#ifndef HAS_MBTOWC
|
||||
#if ! defined(HAS_MBTOWC) && ! defined(HAS_MBRTOWC)
|
||||
#define mbtowc(pwc, s, n) not_here("mbtowc")
|
||||
#endif
|
||||
#ifndef HAS_WCTOMB
|
||||
@ -3392,31 +3392,103 @@ mblen(s, n = ~0)
|
||||
OUTPUT:
|
||||
RETVAL
|
||||
|
||||
int
|
||||
mbtowc(pwc, s, n)
|
||||
wchar_t * pwc
|
||||
char * s
|
||||
size_t n
|
||||
PREINIT:
|
||||
#if defined(USE_ITHREADS) && defined(HAS_MBRTOWC)
|
||||
mbstate_t ps;
|
||||
#endif
|
||||
CODE:
|
||||
#if defined(USE_ITHREADS) && defined(HAS_MBRTOWC)
|
||||
memset(&ps, 0, sizeof(ps));;
|
||||
PERL_UNUSED_RESULT(mbrtowc(pwc, NULL, 0, &ps));/* Reset any shift state */
|
||||
errno = 0;
|
||||
RETVAL = mbrtowc(pwc, s, n, &ps); /* Prefer reentrant version */
|
||||
#if defined(HAS_MBRTOWC) && (defined(USE_ITHREADS) || ! defined(HAS_MBTOWC))
|
||||
# define USE_MBRTOWC
|
||||
#else
|
||||
RETVAL = mbtowc(pwc, s, n);
|
||||
# undef USE_MBRTOWC
|
||||
#endif
|
||||
|
||||
int
|
||||
mbtowc(pwc, s, n = ~0)
|
||||
SV * pwc
|
||||
SV * s
|
||||
size_t n
|
||||
CODE:
|
||||
errno = 0;
|
||||
SvGETMAGIC(s);
|
||||
if (! SvOK(s)) { /* Initialize state */
|
||||
#ifdef USE_MBRTOWC
|
||||
/* Initialize the shift state to all zeros in PL_mbrtowc_ps. */
|
||||
memzero(&PL_mbrtowc_ps, sizeof(PL_mbrtowc_ps));
|
||||
RETVAL = 0;
|
||||
#else
|
||||
LOCALE_LOCK;
|
||||
RETVAL = mbtowc(NULL, NULL, 0);
|
||||
LOCALE_UNLOCK;
|
||||
#endif
|
||||
}
|
||||
else { /* Not resetting state */
|
||||
wchar_t wc;
|
||||
SV * byte_s = sv_2mortal(newSVsv_nomg(s));
|
||||
if (! sv_utf8_downgrade_nomg(byte_s, TRUE)) {
|
||||
SETERRNO(EINVAL, LIB_INVARG);
|
||||
RETVAL = -1;
|
||||
}
|
||||
else {
|
||||
size_t len;
|
||||
char * string = SvPV(byte_s, len);
|
||||
if (n < len) len = n;
|
||||
#ifdef USE_MBRTOWC
|
||||
RETVAL = (SSize_t) mbrtowc(&wc, string, len, &PL_mbrtowc_ps);
|
||||
#else
|
||||
/* Locking prevents races, but locales can be switched out
|
||||
* without locking, so this isn't a cure all */
|
||||
LOCALE_LOCK;
|
||||
RETVAL = mbtowc(&wc, string, len);
|
||||
LOCALE_UNLOCK;
|
||||
#endif
|
||||
if (RETVAL >= 0) {
|
||||
sv_setiv_mg(pwc, wc);
|
||||
}
|
||||
else { /* Use mbtowc() ret code for transparency */
|
||||
RETVAL = -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
OUTPUT:
|
||||
RETVAL
|
||||
|
||||
#if defined(HAS_WCRTOMB) && (defined(USE_ITHREADS) || ! defined(HAS_WCTOMB))
|
||||
# define USE_WCRTOMB
|
||||
#else
|
||||
# undef USE_WCRTOMB
|
||||
#endif
|
||||
|
||||
int
|
||||
wctomb(s, wchar)
|
||||
char * s
|
||||
SV * s
|
||||
wchar_t wchar
|
||||
CODE:
|
||||
errno = 0;
|
||||
SvGETMAGIC(s);
|
||||
if (s == &PL_sv_undef) {
|
||||
#ifdef USE_WCRTOMB
|
||||
/* The man pages khw looked at are in agreement that this works.
|
||||
* But probably memzero would too */
|
||||
RETVAL = wcrtomb(NULL, L'\0', &PL_wcrtomb_ps);
|
||||
#else
|
||||
LOCALE_LOCK;
|
||||
RETVAL = wctomb(NULL, L'\0');
|
||||
LOCALE_UNLOCK;
|
||||
#endif
|
||||
}
|
||||
else { /* Not resetting state */
|
||||
char buffer[MB_LEN_MAX];
|
||||
#ifdef USE_WCRTOMB
|
||||
RETVAL = wcrtomb(buffer, wchar, &PL_wcrtomb_ps);
|
||||
#else
|
||||
/* Locking prevents races, but locales can be switched out without
|
||||
* locking, so this isn't a cure all */
|
||||
LOCALE_LOCK;
|
||||
RETVAL = wctomb(buffer, wchar);
|
||||
LOCALE_UNLOCK;
|
||||
#endif
|
||||
if (RETVAL >= 0) {
|
||||
sv_setpvn_mg(s, buffer, RETVAL);
|
||||
}
|
||||
}
|
||||
OUTPUT:
|
||||
RETVAL
|
||||
|
||||
int
|
||||
strcoll(s1, s2)
|
||||
|
||||
@ -1098,9 +1098,35 @@ actual length of the first parameter string.
|
||||
|
||||
=item C<mbtowc>
|
||||
|
||||
This is identical to the C function C<mbtowc()>.
|
||||
This is the same as the C function C<mbtowc()> on unthreaded perls. On
|
||||
threaded perls, it transparently (almost) substitutes the more
|
||||
thread-safe L<C<mbrtowc>(3)>, if available, instead of C<mbtowc>.
|
||||
|
||||
See L</mblen>.
|
||||
Core Perl does not have any support for wide and multibyte locales,
|
||||
except Unicode UTF-8 locales. This function, in conjunction with
|
||||
L</mblen> and L</wctomb> may be used to roll your own decoding/encoding
|
||||
of other types of multi-byte locales.
|
||||
|
||||
The first parameter is a scalar into which, upon success, the wide
|
||||
character represented by the multi-byte string contained in the second
|
||||
parameter is stored. The optional third parameter is ignored if it is
|
||||
larger than the actual length of the second parameter string.
|
||||
|
||||
Use C<undef> as the second parameter to this function to get the effect
|
||||
of passing NULL as the second parameter to C<mbtowc>. This resets any
|
||||
shift state to its initial value. The return value is undefined if
|
||||
C<mbrtowc> was substituted, so you should never rely on it.
|
||||
|
||||
When the second parameter is a scalar containing a value that either is
|
||||
a PV string or can be forced into one, the return value is the number of
|
||||
bytes occupied by the first character of that string; or 0 if that first
|
||||
character is the wide NUL character; or negative if there is an error.
|
||||
This is based on the locale that currently underlies the program,
|
||||
regardless of whether or not the function is called from Perl code that
|
||||
is within the scope of S<C<use locale>>. Perl makes no attempt at
|
||||
hiding from your code any differences in the C<errno> setting between
|
||||
C<mbtowc> and C<mbrtowc>. It does set C<errno> to 0 before calling
|
||||
them.
|
||||
|
||||
=item C<memchr>
|
||||
|
||||
@ -2131,9 +2157,30 @@ See L</mblen>.
|
||||
|
||||
=item C<wctomb>
|
||||
|
||||
This is identical to the C function C<wctomb()>.
|
||||
This is the same as the C function C<wctomb()> on unthreaded perls. On
|
||||
threaded perls, it transparently (almost) substitutes the more
|
||||
thread-safe L<C<wcrtomb>(3)>, if available, instead of C<wctomb>.
|
||||
|
||||
See L</mblen>.
|
||||
Core Perl does not have any support for wide and multibyte locales,
|
||||
except Unicode UTF-8 locales. This function, in conjunction with
|
||||
L</mblen> and L</mbtowc> may be used to roll your own decoding/encoding
|
||||
of other types of multi-byte locales.
|
||||
|
||||
Use C<undef> as the first parameter to this function to get the effect
|
||||
of passing NULL as the first parameter to C<wctomb>. This resets any
|
||||
shift state to its initial value. The return value is undefined if
|
||||
C<wcrtomb> was substituted, so you should never rely on it.
|
||||
|
||||
When the first parameter is a scalar, the code point contained in the
|
||||
scalar second parameter is converted into a multi-byte string and stored
|
||||
into the first parameter scalar. This is based on the locale that
|
||||
currently underlies the program, regardless of whether or not the
|
||||
function is called from Perl code that is within the scope of S<C<use
|
||||
locale>>. The return value is the number of bytes stored; or negative
|
||||
if the code point isn't representable in the current locale. Perl makes
|
||||
no attempt at hiding from your code any differences in the C<errno>
|
||||
setting between C<wctomb> and C<wcrtomb>. It does set C<errno> to 0
|
||||
before calling them.
|
||||
|
||||
=item C<write>
|
||||
|
||||
|
||||
@ -19,8 +19,9 @@ BEGIN {
|
||||
require 'test.pl';
|
||||
}
|
||||
|
||||
plan tests => 6;
|
||||
my $utf8_locale = find_utf8_ctype_locale();
|
||||
|
||||
plan tests => 13;
|
||||
|
||||
use POSIX qw();
|
||||
|
||||
@ -33,7 +34,6 @@ SKIP: {
|
||||
skip("LC_CTYPE locale support not available", 4)
|
||||
unless locales_enabled('LC_CTYPE');
|
||||
|
||||
my $utf8_locale = find_utf8_ctype_locale();
|
||||
skip("no utf8 locale available", 4) unless $utf8_locale;
|
||||
|
||||
local $ENV{LC_CTYPE} = $utf8_locale;
|
||||
@ -69,3 +69,79 @@ SKIP: {
|
||||
-1, {}, 'mblen() returns -1 when input length is too short');
|
||||
}
|
||||
}
|
||||
|
||||
SKIP: {
|
||||
skip("mbtowc() not present", 5) unless $Config{d_mbtowc};
|
||||
|
||||
my $wide;
|
||||
|
||||
is(&POSIX::mbtowc($wide, "a"), 1, 'mbtowc() returns correct length on ASCII input');
|
||||
is($wide , ord "a", 'mbtowc() returns correct ordinal on ASCII input');
|
||||
|
||||
skip("LC_CTYPE locale support not available", 3)
|
||||
unless locales_enabled('LC_CTYPE');
|
||||
|
||||
skip("no utf8 locale available", 3) unless $utf8_locale;
|
||||
|
||||
local $ENV{LC_CTYPE} = $utf8_locale;
|
||||
local $ENV{LC_ALL};
|
||||
delete $ENV{LC_ALL};
|
||||
local $ENV{PERL_UNICODE};
|
||||
delete $ENV{PERL_UNICODE};
|
||||
|
||||
SKIP: {
|
||||
my ($major, $minor, $rest) = $Config{osvers} =~ / (\d+) \. (\d+) .* /x;
|
||||
skip("mbtowc() broken (at least for c.utf8) on early HP-UX", 3)
|
||||
if $Config{osname} eq 'hpux'
|
||||
&& $major < 11 || ($major == 11 && $minor < 31);
|
||||
|
||||
fresh_perl_is(
|
||||
'use POSIX; &POSIX::mbtowc(undef, undef,0); my $wide; print &POSIX::mbtowc($wide, "'
|
||||
. I8_to_native("\x{c3}\x{28}")
|
||||
. '", 2)',
|
||||
-1, {}, 'mbtowc() recognizes invalid multibyte characters');
|
||||
|
||||
fresh_perl_is(
|
||||
'use POSIX; &POSIX::mbtowc(undef,undef,0);
|
||||
my $sigma = "\N{GREEK SMALL LETTER SIGMA}";
|
||||
utf8::encode($sigma);
|
||||
my $wide; my $len = &POSIX::mbtowc($wide, $sigma, 2);
|
||||
print "$len:$wide"',
|
||||
"2:963", {}, 'mbtowc() works on UTF-8 characters');
|
||||
|
||||
fresh_perl_is(
|
||||
'use POSIX; &POSIX::mbtowc(undef,undef,0);
|
||||
my $wide; print &POSIX::mbtowc($wide, "\N{GREEK SMALL LETTER SIGMA}", 1);',
|
||||
-1, {}, 'mbtowc() returns -1 when input length is too short');
|
||||
}
|
||||
}
|
||||
|
||||
SKIP: {
|
||||
skip("mbtowc or wctomb() not present", 2) unless $Config{d_mbtowc} && $Config{d_wctomb};
|
||||
|
||||
fresh_perl_is('use POSIX; &POSIX::wctomb(undef,0); my $string; my $len = &POSIX::wctomb($string, ord "A"); print "$len:$string"',
|
||||
"1:A", {}, 'wctomb() works on ASCII input');
|
||||
|
||||
skip("LC_CTYPE locale support not available", 1)
|
||||
unless locales_enabled('LC_CTYPE');
|
||||
|
||||
skip("no utf8 locale available", 1) unless $utf8_locale;
|
||||
|
||||
local $ENV{LC_CTYPE} = $utf8_locale;
|
||||
local $ENV{LC_ALL};
|
||||
delete $ENV{LC_ALL};
|
||||
local $ENV{PERL_UNICODE};
|
||||
delete $ENV{PERL_UNICODE};
|
||||
|
||||
SKIP: {
|
||||
my ($major, $minor, $rest) = $Config{osvers} =~ / (\d+) \. (\d+) .* /x;
|
||||
skip("wctomb() broken (at least for c.utf8) on early HP-UX", 1)
|
||||
if $Config{osname} eq 'hpux'
|
||||
&& $major < 11 || ($major == 11 && $minor < 31);
|
||||
|
||||
fresh_perl_is('use POSIX; &POSIX::wctomb(undef,0); my $string; my $len = &POSIX::wctomb($string, 0x100); print "$len:$string"',
|
||||
"2:" . I8_to_native("\x{c4}\x{80}"),
|
||||
{}, 'wctomb() works on UTF-8 characters');
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -941,6 +941,12 @@ PERLVAR(I, Private_Use, SV *)
|
||||
#ifdef HAS_MBRLEN
|
||||
PERLVAR(I, mbrlen_ps, mbstate_t)
|
||||
#endif
|
||||
#ifdef HAS_MBRTOWC
|
||||
PERLVAR(I, mbrtowc_ps, mbstate_t)
|
||||
#endif
|
||||
#ifdef HAS_WCRTOMB
|
||||
PERLVAR(I, wcrtomb_ps, mbstate_t)
|
||||
#endif
|
||||
|
||||
/* If you are adding a U8 or U16, check to see if there are 'Space' comments
|
||||
* above on where there are gaps which currently will be structure padding. */
|
||||
|
||||
10
locale.c
10
locale.c
@ -3461,11 +3461,17 @@ Perl_init_i18nl10n(pTHX_ int printwarn)
|
||||
# endif
|
||||
# endif /* DEBUGGING */
|
||||
|
||||
/* Initialize the per-thread mbrFOO() state variable. See POSIX.xs for
|
||||
* why this particular incantation is used. */
|
||||
/* Initialize the per-thread mbrFOO() state variables. See POSIX.xs for
|
||||
* why these particular incantations are used. */
|
||||
#ifdef HAS_MBRLEN
|
||||
memzero(&PL_mbrlen_ps, sizeof(PL_mbrlen_ps));
|
||||
#endif
|
||||
#ifdef HAS_MBRTOWC
|
||||
memzero(&PL_mbrtowc_ps, sizeof(PL_mbrtowc_ps));
|
||||
#endif
|
||||
#ifdef HAS_WCTOMBR
|
||||
wcrtomb(NULL, L'\0', &PL_wcrtomb_ps);
|
||||
#endif
|
||||
|
||||
/* Initialize the cache of the program's UTF-8ness for the always known
|
||||
* locales C and POSIX */
|
||||
|
||||
@ -54,26 +54,25 @@ patterns using the above syntaxes, as an alternative to C<\N{...}>.
|
||||
A comparison of the two methods is given in
|
||||
L<perlunicode/Comparison of \N{...} and \p{name=...}>.
|
||||
|
||||
=head2 The C<POSIX::mblen()> function now works on shift state locales
|
||||
and is thread-safe on C99 and above compilers; the length parameter is
|
||||
now optional
|
||||
=head2 The C<POSIX::mblen()>, C<mbtowc>, and C<wctomb> functions now
|
||||
work on shift state locales and are thread-safe on C99 and above
|
||||
compilers when executed on a platform that has locale thread-safety; the
|
||||
length parameters are now optional.
|
||||
|
||||
This function is always executed under the current C language locale.
|
||||
These functions are always executed under the current C language locale.
|
||||
(See L<perllocale>.) Most locales are stateless, but a few, notably the
|
||||
very rarely encountered ISO 2022, maintain a state between calls to this
|
||||
function. Previously the state was cleared on every call to this
|
||||
function, but now the state is not reset unless the first parameter is
|
||||
C<undef>.
|
||||
very rarely encountered ISO 2022, maintain a state between calls to
|
||||
these functions. Previously the state was cleared on every call, but
|
||||
now the state is not reset unless the appropriate parameter is C<undef>.
|
||||
|
||||
On threaded perls, the C99 function L<mbrlen(3)>,
|
||||
when available, is substituted for plain
|
||||
C<mblen>.
|
||||
This makes this function thread-safe when executing on a locale
|
||||
On threaded perls, the C99 functions L<mbrlen(3)>, L<mbrtowc(3)>, and
|
||||
L<wcrtomb(3)>, when available, are substituted for the plain functions.
|
||||
This makes these functions thread-safe when executing on a locale
|
||||
thread-safe platform.
|
||||
|
||||
The string length parameter is now optional; useful only if you wish to
|
||||
restrict the length parsed in the source string to less than the actual
|
||||
length.
|
||||
The string length parameters in C<mblen> and C<mbtowc> are now optional;
|
||||
useful only if you wish to restrict the length parsed in the source
|
||||
string to less than the actual length.
|
||||
|
||||
=head1 Security
|
||||
|
||||
@ -477,6 +476,19 @@ made:
|
||||
F<t/run/switches.t> no longer uses (and re-uses) the F<tmpinplace/>
|
||||
directory under F<t/>. This may prevent spurious failures. [GH #17424]
|
||||
|
||||
=item *
|
||||
|
||||
Various bugs in C<POSIX::mbtowc> were fixed. Potential races with
|
||||
other threads are now avoided, and previously the returned wide
|
||||
character could well be garbage.
|
||||
|
||||
=item *
|
||||
|
||||
Various bugs in C<POSIX::wctomb> were fixed. Potential races with other
|
||||
threads are now avoided, and previously it would segfault if the string
|
||||
parameter was shared or hadn't been pre-allocated with a string of
|
||||
sufficient length to hold the result.
|
||||
|
||||
=back
|
||||
|
||||
=head1 Platform Support
|
||||
|
||||
6
sv.c
6
sv.c
@ -15691,6 +15691,12 @@ perl_clone_using(PerlInterpreter *proto_perl, UV flags,
|
||||
#ifdef HAS_MBRLEN
|
||||
PL_mbrlen_ps = proto_perl->Imbrlen_ps;
|
||||
#endif
|
||||
#ifdef HAS_MBRTOWC
|
||||
PL_mbrtowc_ps = proto_perl->Imbrtowc_ps;
|
||||
#endif
|
||||
#ifdef HAS_WCRTOMB
|
||||
PL_wcrtomb_ps = proto_perl->Iwcrtomb_ps;
|
||||
#endif
|
||||
|
||||
PL_langinfo_buf = NULL;
|
||||
PL_langinfo_bufsize = 0;
|
||||
|
||||
@ -201,6 +201,7 @@ Math::Random::MT::Perl
|
||||
Math::Random::Secure
|
||||
Math::TrulyRandom
|
||||
mbrlen(3)
|
||||
mbrtowc(3)
|
||||
md5sum(1)
|
||||
Method::Signatures
|
||||
mmap(2)
|
||||
@ -350,6 +351,7 @@ wait4(2)
|
||||
waitpid(2)
|
||||
waitpid(3)
|
||||
Want
|
||||
wcrtomb(3)
|
||||
wget(1)
|
||||
Win32::Locale
|
||||
write(2)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user