mirror of
https://github.com/Perl/perl5.git
synced 2026-01-26 16:39:36 +00:00
Use the new utf8 to code point functions
These functions should be used in preference to the old ones which can read beyond the end of the input string.
This commit is contained in:
parent
27d6c58a7e
commit
4b88fb76ef
@ -176,7 +176,7 @@ utf8_to_wide(const char *buf)
|
||||
|
||||
setlocale(LC_CTYPE, "utf-8");
|
||||
wbuf = (wchar_t *) safemalloc(wlen);
|
||||
/* utf8_to_uvuni(pathname, wpath) or Encoding::_utf8_to_bytes(sv, "UCS-2BE"); */
|
||||
/* utf8_to_uvuni_buf(pathname, pathname + wlen, wpath) or Encoding::_utf8_to_bytes(sv, "UCS-2BE"); */
|
||||
wlen = mbsrtowcs(wbuf, (const char**)&buf, wlen, &mbs);
|
||||
|
||||
if (oldlocale) setlocale(LC_CTYPE, oldlocale);
|
||||
@ -283,7 +283,7 @@ XS(XS_Cygwin_win_to_posix_path)
|
||||
mbstate_t mbs;
|
||||
char *oldlocale = setlocale(LC_CTYPE, NULL);
|
||||
setlocale(LC_CTYPE, "utf-8");
|
||||
/* utf8_to_uvuni(src_path, wpath) or Encoding::_utf8_to_bytes(sv, "UCS-2BE"); */
|
||||
/* utf8_to_uvuni_buf(src_path, src_path + wlen, wpath) or Encoding::_utf8_to_bytes(sv, "UCS-2BE"); */
|
||||
wlen = mbsrtowcs(wpath, (const char**)&src_path, wlen, &mbs);
|
||||
if (wlen > 0)
|
||||
err = cygwin_conv_path(what, wpath, wbuf, wlen);
|
||||
@ -370,7 +370,7 @@ XS(XS_Cygwin_posix_to_win_path)
|
||||
setlocale(LC_CTYPE, "utf-8");
|
||||
if (!IN_BYTES) {
|
||||
mbstate_t mbs;
|
||||
/* utf8_to_uvuni(src_path, wpath) or Encoding::_utf8_to_bytes(sv, "UCS-2BE"); */
|
||||
/* utf8_to_uvuni_buf(src_path, src_path + wlen, wpath) or Encoding::_utf8_to_bytes(sv, "UCS-2BE"); */
|
||||
wlen = mbsrtowcs(wpath, (const char**)&src_path, wlen, &mbs);
|
||||
if (wlen > 0)
|
||||
err = cygwin_conv_path(what, wpath, wbuf, wlen);
|
||||
|
||||
4
dist/Data-Dumper/Dumper.pm
vendored
4
dist/Data-Dumper/Dumper.pm
vendored
@ -10,7 +10,7 @@
|
||||
package Data::Dumper;
|
||||
|
||||
BEGIN {
|
||||
$VERSION = '2.135_05'; # Don't forget to set version and release
|
||||
$VERSION = '2.135_06'; # Don't forget to set version and release
|
||||
} # date in POD!
|
||||
|
||||
#$| = 1;
|
||||
@ -1332,7 +1332,7 @@ modify it under the same terms as Perl itself.
|
||||
|
||||
=head1 VERSION
|
||||
|
||||
Version 2.135_05 (February 18 2012)
|
||||
Version 2.135_06 (March 20 2012)
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
|
||||
12
dist/Data-Dumper/Dumper.xs
vendored
12
dist/Data-Dumper/Dumper.xs
vendored
@ -37,17 +37,17 @@ static I32 DD_dump (pTHX_ SV *val, const char *name, STRLEN namelen, SV *retval,
|
||||
# endif
|
||||
|
||||
UV
|
||||
Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
|
||||
Perl_utf8_to_uvchr_buf(pTHX_ U8 *s, U8 *send, STRLEN *retlen)
|
||||
{
|
||||
const UV uv = utf8_to_uv(s, UTF8_MAXLEN, retlen,
|
||||
const UV uv = utf8_to_uv(s, send - s, retlen,
|
||||
ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
|
||||
return UNI_TO_NATIVE(uv);
|
||||
}
|
||||
|
||||
# if !defined(PERL_IMPLICIT_CONTEXT)
|
||||
# define utf8_to_uvchr Perl_utf8_to_uvchr
|
||||
# define utf8_to_uvchr_buf Perl_utf8_to_uvchr_buf
|
||||
# else
|
||||
# define utf8_to_uvchr(a,b) Perl_utf8_to_uvchr(aTHX_ a,b)
|
||||
# define utf8_to_uvchr_buf(a,b) Perl_utf8_to_uvchr_buf(aTHX_ a,b)
|
||||
# endif
|
||||
|
||||
#endif /* PERL_VERSION <= 6 */
|
||||
@ -147,7 +147,7 @@ esc_q_utf8(pTHX_ SV* sv, register const char *src, register STRLEN slen)
|
||||
|
||||
/* this will need EBCDICification */
|
||||
for (s = src; s < send; s += increment) {
|
||||
const UV k = utf8_to_uvchr((U8*)s, NULL);
|
||||
const UV k = utf8_to_uvchr_buf((U8*)s, (U8*) send, NULL);
|
||||
|
||||
/* check for invalid utf8 */
|
||||
increment = (k == 0 && *s != '\0') ? 1 : UTF8SKIP(s);
|
||||
@ -184,7 +184,7 @@ esc_q_utf8(pTHX_ SV* sv, register const char *src, register STRLEN slen)
|
||||
*r++ = '"';
|
||||
|
||||
for (s = src; s < send; s += UTF8SKIP(s)) {
|
||||
const UV k = utf8_to_uvchr((U8*)s, NULL);
|
||||
const UV k = utf8_to_uvchr_buf((U8*)s, (U8*) send, NULL);
|
||||
|
||||
if (k == '"' || k == '\\' || k == '$' || k == '@') {
|
||||
*r++ = '\\';
|
||||
|
||||
4
dump.c
4
dump.c
@ -281,7 +281,7 @@ Perl_pv_escape( pTHX_ SV *dsv, char const * const str,
|
||||
isuni = 1;
|
||||
|
||||
for ( ; (pv < end && (!max || (wrote < max))) ; pv += readsize ) {
|
||||
const UV u= (isuni) ? utf8_to_uvchr((U8*)pv, &readsize) : (U8)*pv;
|
||||
const UV u= (isuni) ? utf8_to_uvchr_buf((U8*)pv, (U8*) end, &readsize) : (U8)*pv;
|
||||
const U8 c = (U8)u & 0xFF;
|
||||
|
||||
if ( ( u > 255 )
|
||||
@ -2420,7 +2420,7 @@ Perl_sv_catxmlpvn(pTHX_ SV *dsv, const char *pv, STRLEN len, int utf8)
|
||||
retry:
|
||||
while (pv < e) {
|
||||
if (utf8) {
|
||||
c = utf8_to_uvchr((U8*)pv, &cl);
|
||||
c = utf8_to_uvchr_buf((U8*)pv, (U8*)e, &cl);
|
||||
if (cl == 0) {
|
||||
SvCUR(dsv) = dsvcur;
|
||||
pv = start;
|
||||
|
||||
@ -5,7 +5,7 @@ use strict;
|
||||
use warnings;
|
||||
use Carp;
|
||||
|
||||
our $VERSION = '0.36';
|
||||
our $VERSION = '0.37';
|
||||
|
||||
require XSLoader;
|
||||
|
||||
|
||||
@ -148,7 +148,7 @@ bitflip_key(pTHX_ IV action, SV *field) {
|
||||
const char *const end = p + len;
|
||||
while (p < end) {
|
||||
STRLEN len;
|
||||
UV chr = utf8_to_uvuni((U8 *)p, &len);
|
||||
UV chr = utf8_to_uvuni_buf((U8 *)p, (U8 *) end, &len);
|
||||
new_p = (char *)uvuni_to_utf8((U8 *)new_p, chr ^ 32);
|
||||
p += len;
|
||||
}
|
||||
|
||||
27
handy.h
27
handy.h
@ -949,7 +949,8 @@ EXTCONST U32 PL_charclass[];
|
||||
*((p)+1))) \
|
||||
: function(p))
|
||||
|
||||
/* Note that all ignore 'use bytes' */
|
||||
/* Note that all assume that the utf8 has been validated, and ignore 'use
|
||||
* bytes' */
|
||||
|
||||
#define isALNUM_utf8(p) generic_utf8(isWORDCHAR, is_utf8_alnum, p)
|
||||
/* To prevent S_scan_word in toke.c from hanging, we have to make sure that
|
||||
@ -992,18 +993,18 @@ EXTCONST U32 PL_charclass[];
|
||||
: isSPACE_utf8(p)))
|
||||
#define isBLANK_utf8(c) isBLANK(c) /* could be wrong */
|
||||
|
||||
#define isALNUM_LC_utf8(p) isALNUM_LC_uvchr(utf8_to_uvchr(p, 0))
|
||||
#define isIDFIRST_LC_utf8(p) isIDFIRST_LC_uvchr(utf8_to_uvchr(p, 0))
|
||||
#define isALPHA_LC_utf8(p) isALPHA_LC_uvchr(utf8_to_uvchr(p, 0))
|
||||
#define isSPACE_LC_utf8(p) isSPACE_LC_uvchr(utf8_to_uvchr(p, 0))
|
||||
#define isDIGIT_LC_utf8(p) isDIGIT_LC_uvchr(utf8_to_uvchr(p, 0))
|
||||
#define isUPPER_LC_utf8(p) isUPPER_LC_uvchr(utf8_to_uvchr(p, 0))
|
||||
#define isLOWER_LC_utf8(p) isLOWER_LC_uvchr(utf8_to_uvchr(p, 0))
|
||||
#define isALNUMC_LC_utf8(p) isALNUMC_LC_uvchr(utf8_to_uvchr(p, 0))
|
||||
#define isCNTRL_LC_utf8(p) isCNTRL_LC_uvchr(utf8_to_uvchr(p, 0))
|
||||
#define isGRAPH_LC_utf8(p) isGRAPH_LC_uvchr(utf8_to_uvchr(p, 0))
|
||||
#define isPRINT_LC_utf8(p) isPRINT_LC_uvchr(utf8_to_uvchr(p, 0))
|
||||
#define isPUNCT_LC_utf8(p) isPUNCT_LC_uvchr(utf8_to_uvchr(p, 0))
|
||||
#define isALNUM_LC_utf8(p) isALNUM_LC_uvchr(valid_utf8_to_uvchr(p, 0))
|
||||
#define isIDFIRST_LC_utf8(p) isIDFIRST_LC_uvchr(valid_utf8_to_uvchr(p, 0))
|
||||
#define isALPHA_LC_utf8(p) isALPHA_LC_uvchr(valid_utf8_to_uvchr(p, 0))
|
||||
#define isSPACE_LC_utf8(p) isSPACE_LC_uvchr(valid_utf8_to_uvchr(p, 0))
|
||||
#define isDIGIT_LC_utf8(p) isDIGIT_LC_uvchr(valid_utf8_to_uvchr(p, 0))
|
||||
#define isUPPER_LC_utf8(p) isUPPER_LC_uvchr(valid_utf8_to_uvchr(p, 0))
|
||||
#define isLOWER_LC_utf8(p) isLOWER_LC_uvchr(valid_utf8_to_uvchr(p, 0))
|
||||
#define isALNUMC_LC_utf8(p) isALNUMC_LC_uvchr(valid_utf8_to_uvchr(p, 0))
|
||||
#define isCNTRL_LC_utf8(p) isCNTRL_LC_uvchr(valid_utf8_to_uvchr(p, 0))
|
||||
#define isGRAPH_LC_utf8(p) isGRAPH_LC_uvchr(valid_utf8_to_uvchr(p, 0))
|
||||
#define isPRINT_LC_utf8(p) isPRINT_LC_uvchr(valid_utf8_to_uvchr(p, 0))
|
||||
#define isPUNCT_LC_utf8(p) isPUNCT_LC_uvchr(valid_utf8_to_uvchr(p, 0))
|
||||
|
||||
#define isPSXSPC_LC_utf8(c) (isSPACE_LC_utf8(c) ||(c) == '\f')
|
||||
#define isBLANK_LC_utf8(c) isBLANK(c) /* could be wrong */
|
||||
|
||||
@ -2670,17 +2670,18 @@ character like this (the UTF8_IS_INVARIANT() is a macro that tests
|
||||
whether the byte can be encoded as a single byte even in UTF-8):
|
||||
|
||||
U8 *utf;
|
||||
U8 *utf_end; /* 1 beyond buffer pointed to by utf */
|
||||
UV uv; /* Note: a UV, not a U8, not a char */
|
||||
STRLEN len; /* length of character in bytes */
|
||||
|
||||
if (!UTF8_IS_INVARIANT(*utf))
|
||||
/* Must treat this as UTF-8 */
|
||||
uv = utf8_to_uvchr(utf, &len);
|
||||
uv = utf8_to_uvchr_buf(utf, utf_end, &len);
|
||||
else
|
||||
/* OK to treat this character as a byte */
|
||||
uv = *utf;
|
||||
|
||||
You can also see in that example that we use C<utf8_to_uvchr> to get the
|
||||
You can also see in that example that we use C<utf8_to_uvchr_buf> to get the
|
||||
value of the character; the inverse function C<uvchr_to_utf8> is available
|
||||
for putting a UV into UTF-8:
|
||||
|
||||
@ -2792,7 +2793,7 @@ it's not - if you pass on the PV to somewhere, pass on the flag too.
|
||||
|
||||
=item *
|
||||
|
||||
If a string is UTF-8, B<always> use C<utf8_to_uvchr> to get at the value,
|
||||
If a string is UTF-8, B<always> use C<utf8_to_uvchr_buf> to get at the value,
|
||||
unless C<UTF8_IS_INVARIANT(*s)> in which case you can use C<*s>.
|
||||
|
||||
=item *
|
||||
|
||||
@ -1514,7 +1514,8 @@ pointing after the UTF-8 bytes. It works appropriately on EBCDIC machines.
|
||||
|
||||
=item *
|
||||
|
||||
C<utf8_to_uvchr(buf, lenp)> reads UTF-8 encoded bytes from a buffer and
|
||||
C<utf8_to_uvchr_buf(buf, bufend, lenp)> reads UTF-8 encoded bytes from a
|
||||
buffer and
|
||||
returns the Unicode character code point and, optionally, the length of
|
||||
the UTF-8 byte sequence. It works appropriately on EBCDIC machines.
|
||||
|
||||
|
||||
6
pp.c
6
pp.c
@ -3383,7 +3383,7 @@ PP(pp_chr)
|
||||
sv_recode_to_utf8(TARG, PL_encoding);
|
||||
tmps = SvPVX(TARG);
|
||||
if (SvCUR(TARG) == 0 || !is_utf8_string((U8*)tmps, SvCUR(TARG)) ||
|
||||
UNICODE_IS_REPLACEMENT(utf8_to_uvchr((U8*)tmps, NULL))) {
|
||||
UNICODE_IS_REPLACEMENT(utf8_to_uvchr_buf((U8*)tmps, (U8*) tmps + SvCUR(TARG), NULL))) {
|
||||
SvGROW(TARG, 2);
|
||||
tmps = SvPVX(TARG);
|
||||
SvCUR_set(TARG, 1);
|
||||
@ -3795,7 +3795,7 @@ PP(pp_uc)
|
||||
uv = _to_utf8_upper_flags(s, tmpbuf, &ulen,
|
||||
cBOOL(IN_LOCALE_RUNTIME), &tainted);
|
||||
if (uv == GREEK_CAPITAL_LETTER_IOTA
|
||||
&& utf8_to_uvchr(s, 0) == COMBINING_GREEK_YPOGEGRAMMENI)
|
||||
&& utf8_to_uvchr_buf(s, send, 0) == COMBINING_GREEK_YPOGEGRAMMENI)
|
||||
{
|
||||
in_iota_subscript = TRUE;
|
||||
}
|
||||
@ -5344,7 +5344,7 @@ PP(pp_reverse)
|
||||
continue;
|
||||
}
|
||||
else {
|
||||
if (!utf8_to_uvchr(s, 0))
|
||||
if (!utf8_to_uvchr_buf(s, send, 0))
|
||||
break;
|
||||
up = (char*)s;
|
||||
s += UTF8SKIP(s);
|
||||
|
||||
11
regcomp.c
11
regcomp.c
@ -3480,8 +3480,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
|
||||
UV uc;
|
||||
if (UTF) {
|
||||
const U8 * const s = (U8*)STRING(scan);
|
||||
uc = utf8_to_uvchr_buf(s, s + l, NULL);
|
||||
l = utf8_length(s, s + l);
|
||||
uc = utf8_to_uvchr(s, NULL);
|
||||
} else {
|
||||
uc = *((U8*)STRING(scan));
|
||||
}
|
||||
@ -3575,8 +3575,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
|
||||
}
|
||||
if (UTF) {
|
||||
const U8 * const s = (U8 *)STRING(scan);
|
||||
uc = utf8_to_uvchr_buf(s, s + l, NULL);
|
||||
l = utf8_length(s, s + l);
|
||||
uc = utf8_to_uvchr(s, NULL);
|
||||
}
|
||||
else if (has_exactf_sharp_s) {
|
||||
RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
|
||||
@ -9822,7 +9822,10 @@ tryagain:
|
||||
for (foldbuf = tmpbuf;
|
||||
foldlen;
|
||||
foldlen -= numlen) {
|
||||
ender = utf8_to_uvchr(foldbuf, &numlen);
|
||||
|
||||
/* tmpbuf has been constructed by us, so we
|
||||
* know it is valid utf8 */
|
||||
ender = valid_utf8_to_uvchr(foldbuf, &numlen);
|
||||
if (numlen > 0) {
|
||||
const STRLEN unilen = reguni(pRExC_state, ender, s);
|
||||
s += unilen;
|
||||
@ -9858,7 +9861,7 @@ tryagain:
|
||||
for (foldbuf = tmpbuf;
|
||||
foldlen;
|
||||
foldlen -= numlen) {
|
||||
ender = utf8_to_uvchr(foldbuf, &numlen);
|
||||
ender = valid_utf8_to_uvchr(foldbuf, &numlen);
|
||||
if (numlen > 0) {
|
||||
const STRLEN unilen = reguni(pRExC_state, ender, s);
|
||||
len += unilen;
|
||||
|
||||
@ -364,7 +364,9 @@ int CPerlBase::ConsoleRead(const int fd, char* buf, int n)
|
||||
#else
|
||||
dTHX;
|
||||
for (i = 0; i < nUtf8; i+= UTF8SKIP(pUtf8 + i)) {
|
||||
unsigned long u = utf8_to_uvchr((U8*)(pUtf8 + i), 0);
|
||||
unsigned long u = utf8_to_uvchr_buf((U8*)(pUtf8 + i),
|
||||
(U8*)(pUtf8 + nUtf8),
|
||||
0);
|
||||
if (u > 0xFF) {
|
||||
iConsole->Printf(_L("(keycode > 0xFF)\n"));
|
||||
buf[i] = 0;
|
||||
@ -401,7 +403,7 @@ int CPerlBase::ConsoleWrite(const int fd, const char* buf, int n)
|
||||
dTHX;
|
||||
if (is_utf8_string((U8*)buf, n)) {
|
||||
for (int i = 0; i < n; i += UTF8SKIP(buf + i)) {
|
||||
TChar u = utf8_to_uvchr((U8*)(buf + i), 0);
|
||||
TChar u = valid_utf8_to_uvchr((U8*)(buf + i), 0);
|
||||
iConsole->Printf(_L("%c"), u);
|
||||
wrote++;
|
||||
}
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
|
||||
utf8.c AOK
|
||||
|
||||
[utf8_to_uvchr]
|
||||
[utf8_to_uvchr_buf]
|
||||
Malformed UTF-8 character
|
||||
my $a = ord "\x80" ;
|
||||
|
||||
@ -14,7 +14,7 @@
|
||||
<<<<<< Add a test when something actually calls utf16_to_utf8
|
||||
|
||||
__END__
|
||||
# utf8.c [utf8_to_uvchr] -W
|
||||
# utf8.c [utf8_to_uvchr_buf] -W
|
||||
BEGIN {
|
||||
if (ord('A') == 193) {
|
||||
print "SKIPPED\n# ebcdic platforms do not generate Malformed UTF-8 warnings.";
|
||||
|
||||
2
toke.c
2
toke.c
@ -9883,7 +9883,7 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims)
|
||||
termlen = 1;
|
||||
}
|
||||
else {
|
||||
termcode = utf8_to_uvchr((U8*)s, &termlen);
|
||||
termcode = utf8_to_uvchr_buf((U8*)s, (U8*)PL_bufend, &termlen);
|
||||
Copy(s, termstr, termlen, U8);
|
||||
if (!UTF8_IS_INVARIANT(term))
|
||||
has_utf8 = TRUE;
|
||||
|
||||
46
utf8.c
46
utf8.c
@ -839,7 +839,11 @@ Returns the native code point of the first character in the string C<s>
|
||||
which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
|
||||
length, in bytes, of that character.
|
||||
|
||||
If C<s> does not point to a well-formed UTF-8 character, zero is
|
||||
Some, but not all, UTF-8 malformations are detected, and in fact, some
|
||||
malformed input could cause reading beyond the end of the input buffer.
|
||||
Use L</utf8_to_uvchr_buf> instead.
|
||||
|
||||
If C<s> points to one of the detected malformations, zero is
|
||||
returned and C<retlen> is set, if possible, to -1.
|
||||
|
||||
=cut
|
||||
@ -850,8 +854,7 @@ Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
|
||||
{
|
||||
PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
|
||||
|
||||
return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
|
||||
ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
|
||||
return valid_utf8_to_uvchr(s, retlen);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -902,10 +905,11 @@ Returns the Unicode code point of the first character in the string C<s>
|
||||
which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
|
||||
length, in bytes, of that character.
|
||||
|
||||
This function should only be used when the returned UV is considered
|
||||
an index into the Unicode semantic tables (e.g. swashes).
|
||||
Some, but not all, UTF-8 malformations are detected, and in fact, some
|
||||
malformed input could cause reading beyond the end of the input buffer.
|
||||
Use L</utf8_to_uvuni_buf> instead.
|
||||
|
||||
If C<s> does not point to a well-formed UTF-8 character, zero is
|
||||
If C<s> points to one of the detected malformations, zero is
|
||||
returned and C<retlen> is set, if possible, to -1.
|
||||
|
||||
=cut
|
||||
@ -916,9 +920,7 @@ Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
|
||||
{
|
||||
PERL_ARGS_ASSERT_UTF8_TO_UVUNI;
|
||||
|
||||
/* Call the low level routine asking for checks */
|
||||
return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
|
||||
ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
|
||||
return valid_utf8_to_uvuni(s, retlen);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -1128,7 +1130,7 @@ Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
|
||||
d = s = save;
|
||||
while (s < send) {
|
||||
STRLEN ulen;
|
||||
*d++ = (U8)utf8_to_uvchr(s, &ulen);
|
||||
*d++ = (U8)utf8_to_uvchr_buf(s, send, &ulen);
|
||||
s += ulen;
|
||||
}
|
||||
*d = '\0';
|
||||
@ -2154,7 +2156,7 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
|
||||
dVAR;
|
||||
U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
|
||||
STRLEN len = 0;
|
||||
const UV uv0 = utf8_to_uvchr(p, NULL);
|
||||
const UV uv0 = valid_utf8_to_uvchr(p, NULL);
|
||||
/* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
|
||||
* are necessary in EBCDIC, they are redundant no-ops
|
||||
* in ASCII-ish platforms, and hopefully optimized away. */
|
||||
@ -2302,7 +2304,7 @@ S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* c
|
||||
bad_crossing:
|
||||
|
||||
/* Failed, have to return the original */
|
||||
original = utf8_to_uvchr(p, lenp);
|
||||
original = valid_utf8_to_uvchr(p, lenp);
|
||||
Copy(p, ustrp, *lenp, char);
|
||||
return original;
|
||||
}
|
||||
@ -3508,7 +3510,7 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash)
|
||||
"unexpectedly is not a string, flags=%lu",
|
||||
(unsigned long)SvFLAGS(sv_to));
|
||||
}
|
||||
/*DEBUG_U(PerlIO_printf(Perl_debug_log, "Found mapping from %"UVXf", First char of to is %"UVXf"\n", utf8_to_uvchr((U8*) char_from, 0), utf8_to_uvchr((U8*) SvPVX(sv_to), 0)));*/
|
||||
/*DEBUG_U(PerlIO_printf(Perl_debug_log, "Found mapping from %"UVXf", First char of to is %"UVXf"\n", valid_utf8_to_uvchr((U8*) char_from, 0), valid_utf8_to_uvchr((U8*) SvPVX(sv_to), 0)));*/
|
||||
|
||||
/* Each key in the inverse list is a mapped-to value, and the key's
|
||||
* hash value is a list of the strings (each in utf8) that map to
|
||||
@ -3575,7 +3577,7 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash)
|
||||
Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
|
||||
}
|
||||
|
||||
/* For debugging: UV u = utf8_to_uvchr((U8*) SvPVX(*entryp), 0);*/
|
||||
/* For debugging: UV u = valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0);*/
|
||||
for (j = 0; j <= av_len(from_list); j++) {
|
||||
entryp = av_fetch(from_list, j, FALSE);
|
||||
if (entryp == NULL) {
|
||||
@ -3583,9 +3585,11 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash)
|
||||
}
|
||||
|
||||
/* When i==j this adds itself to the list */
|
||||
av_push(i_list, newSVuv(utf8_to_uvchr(
|
||||
(U8*) SvPVX(*entryp), 0)));
|
||||
/*DEBUG_U(PerlIO_printf(Perl_debug_log, "Adding %"UVXf" to list for %"UVXf"\n", utf8_to_uvchr((U8*) SvPVX(*entryp), 0), u));*/
|
||||
av_push(i_list, newSVuv(utf8_to_uvchr_buf(
|
||||
(U8*) SvPVX(*entryp),
|
||||
(U8*) SvPVX(*entryp) + SvCUR(*entryp),
|
||||
0)));
|
||||
/*DEBUG_U(PerlIO_printf(Perl_debug_log, "Adding %"UVXf" to list for %"UVXf"\n", valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0), u));*/
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -3931,7 +3935,7 @@ Perl_check_utf8_print(pTHX_ register const U8* s, const STRLEN len)
|
||||
STRLEN char_len;
|
||||
if (UTF8_IS_SUPER(s)) {
|
||||
if (ckWARN_d(WARN_NON_UNICODE)) {
|
||||
UV uv = utf8_to_uvchr(s, &char_len);
|
||||
UV uv = utf8_to_uvchr_buf(s, e, &char_len);
|
||||
Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
|
||||
"Code point 0x%04"UVXf" is not Unicode, may not be portable", uv);
|
||||
ok = FALSE;
|
||||
@ -3939,7 +3943,7 @@ Perl_check_utf8_print(pTHX_ register const U8* s, const STRLEN len)
|
||||
}
|
||||
else if (UTF8_IS_SURROGATE(s)) {
|
||||
if (ckWARN_d(WARN_SURROGATE)) {
|
||||
UV uv = utf8_to_uvchr(s, &char_len);
|
||||
UV uv = utf8_to_uvchr_buf(s, e, &char_len);
|
||||
Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
|
||||
"Unicode surrogate U+%04"UVXf" is illegal in UTF-8", uv);
|
||||
ok = FALSE;
|
||||
@ -3949,7 +3953,7 @@ Perl_check_utf8_print(pTHX_ register const U8* s, const STRLEN len)
|
||||
((UTF8_IS_NONCHAR_GIVEN_THAT_NON_SUPER_AND_GE_PROBLEMATIC(s))
|
||||
&& (ckWARN_d(WARN_NONCHAR)))
|
||||
{
|
||||
UV uv = utf8_to_uvchr(s, &char_len);
|
||||
UV uv = utf8_to_uvchr_buf(s, e, &char_len);
|
||||
Perl_warner(aTHX_ packWARN(WARN_NONCHAR),
|
||||
"Unicode non-character U+%04"UVXf" is illegal for open interchange", uv);
|
||||
ok = FALSE;
|
||||
@ -3999,7 +4003,7 @@ Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV f
|
||||
truncated++;
|
||||
break;
|
||||
}
|
||||
u = utf8_to_uvchr((U8*)s, 0);
|
||||
u = utf8_to_uvchr_buf((U8*)s, (U8*)e, 0);
|
||||
if (u < 256) {
|
||||
const unsigned char c = (unsigned char)u & 0xFF;
|
||||
if (flags & UNI_DISPLAY_BACKSLASH) {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user