mirror of
https://github.com/Perl/perl5.git
synced 2026-01-27 01:44:43 +00:00
utf8.c: Allow Changed behavior of utf8 under locale
This changes the 4 case changing functions to take extra parameters to specify if the utf8 string is to be processed under locale rules when the code points are < 256. The current functions are changed to macros that call the new versions so that current behavior is unchanged. An additional, static, function is created that makes sure that the 255/256 boundary is not crossed during the case change.
This commit is contained in:
parent
4b59338969
commit
051a06d4bf
12
embed.fnc
12
embed.fnc
@ -1386,11 +1386,14 @@ Ap |void |taint_env
|
||||
Ap |void |taint_proper |NULLOK const char* f|NN const char *const s
|
||||
Apd |UV |to_utf8_case |NN const U8 *p|NN U8* ustrp|NULLOK STRLEN *lenp \
|
||||
|NN SV **swashp|NN const char *normal|NULLOK const char *special
|
||||
Apd |UV |to_utf8_lower |NN const U8 *p|NN U8* ustrp|NULLOK STRLEN *lenp
|
||||
Apd |UV |to_utf8_upper |NN const U8 *p|NN U8* ustrp|NULLOK STRLEN *lenp
|
||||
Apd |UV |to_utf8_title |NN const U8 *p|NN U8* ustrp|NULLOK STRLEN *lenp
|
||||
Apdm |UV |to_utf8_lower |NN const U8 *p|NN U8* ustrp|NULLOK STRLEN *lenp
|
||||
EXMp |UV |_to_utf8_lower_flags |NN const U8 *p|NN U8* ustrp|NULLOK STRLEN *lenp|const bool flags|NULLOK bool* tainted_ptr
|
||||
Apdm |UV |to_utf8_upper |NN const U8 *p|NN U8* ustrp|NULLOK STRLEN *lenp
|
||||
EXMp |UV |_to_utf8_upper_flags |NN const U8 *p|NN U8* ustrp|NULLOK STRLEN *lenp|const bool flags|NULLOK bool* tainted_ptr
|
||||
Apdm |UV |to_utf8_title |NN const U8 *p|NN U8* ustrp|NULLOK STRLEN *lenp
|
||||
EXMp |UV |_to_utf8_title_flags |NN const U8 *p|NN U8* ustrp|NULLOK STRLEN *lenp|const bool flags|NULLOK bool* tainted_ptr
|
||||
Ampd |UV |to_utf8_fold |NN const U8 *p|NN U8* ustrp|NULLOK STRLEN *lenp
|
||||
AMp |UV |_to_utf8_fold_flags|NN const U8 *p|NN U8* ustrp|NULLOK STRLEN *lenp|U8 flags
|
||||
EXMp |UV |_to_utf8_fold_flags|NN const U8 *p|NN U8* ustrp|NULLOK STRLEN *lenp|U8 flags|NULLOK bool* tainted_ptr
|
||||
#if defined(PERL_IN_MG_C) || defined(PERL_IN_PP_C)
|
||||
p |bool |translate_substr_offsets|STRLEN curlen|IV pos1_iv \
|
||||
|bool pos1_is_uv|IV len_iv \
|
||||
@ -2142,6 +2145,7 @@ sn |NV|mulexp10 |NV value|I32 exponent
|
||||
|
||||
#if defined(PERL_IN_UTF8_C)
|
||||
sRn |STRLEN |is_utf8_char_slow|NN const U8 *s|const STRLEN len
|
||||
sRM |UV |check_locale_boundary_crossing|NN const U8* const p|const UV result|NN U8* const ustrp|NN STRLEN *lenp
|
||||
sR |bool |is_utf8_common |NN const U8 *const p|NN SV **swash|NN const char * const swashname
|
||||
sR |SV* |swash_get |NN SV* swash|UV start|UV span
|
||||
#endif
|
||||
|
||||
9
embed.h
9
embed.h
@ -28,7 +28,6 @@
|
||||
|
||||
#define Gv_AMupdate(a,b) Perl_Gv_AMupdate(aTHX_ a,b)
|
||||
#define _to_uni_fold_flags(a,b,c,d) Perl__to_uni_fold_flags(aTHX_ a,b,c,d)
|
||||
#define _to_utf8_fold_flags(a,b,c,d) Perl__to_utf8_fold_flags(aTHX_ a,b,c,d)
|
||||
#define amagic_call(a,b,c,d) Perl_amagic_call(aTHX_ a,b,c,d)
|
||||
#define amagic_deref_call(a,b) Perl_amagic_deref_call(aTHX_ a,b)
|
||||
#define apply_attrs_string(a,b,c,d) Perl_apply_attrs_string(aTHX_ a,b,c,d)
|
||||
@ -657,9 +656,6 @@
|
||||
#define to_uni_upper(a,b,c) Perl_to_uni_upper(aTHX_ a,b,c)
|
||||
#define to_uni_upper_lc(a) Perl_to_uni_upper_lc(aTHX_ a)
|
||||
#define to_utf8_case(a,b,c,d,e,f) Perl_to_utf8_case(aTHX_ a,b,c,d,e,f)
|
||||
#define to_utf8_lower(a,b,c) Perl_to_utf8_lower(aTHX_ a,b,c)
|
||||
#define to_utf8_title(a,b,c) Perl_to_utf8_title(aTHX_ a,b,c)
|
||||
#define to_utf8_upper(a,b,c) Perl_to_utf8_upper(aTHX_ a,b,c)
|
||||
#define unpack_str(a,b,c,d,e,f,g,h) Perl_unpack_str(aTHX_ a,b,c,d,e,f,g,h)
|
||||
#define unpackstring(a,b,c,d,e) Perl_unpackstring(aTHX_ a,b,c,d,e)
|
||||
#define unsharepvn(a,b,c) Perl_unsharepvn(aTHX_ a,b,c)
|
||||
@ -849,6 +845,10 @@
|
||||
#endif
|
||||
#if defined(PERL_CORE) || defined(PERL_EXT)
|
||||
#define _is_utf8__perl_idstart(a) Perl__is_utf8__perl_idstart(aTHX_ a)
|
||||
#define _to_utf8_fold_flags(a,b,c,d,e) Perl__to_utf8_fold_flags(aTHX_ a,b,c,d,e)
|
||||
#define _to_utf8_lower_flags(a,b,c,d,e) Perl__to_utf8_lower_flags(aTHX_ a,b,c,d,e)
|
||||
#define _to_utf8_title_flags(a,b,c,d,e) Perl__to_utf8_title_flags(aTHX_ a,b,c,d,e)
|
||||
#define _to_utf8_upper_flags(a,b,c,d,e) Perl__to_utf8_upper_flags(aTHX_ a,b,c,d,e)
|
||||
#define av_reify(a) Perl_av_reify(aTHX_ a)
|
||||
#define is_utf8_X_L(a) Perl_is_utf8_X_L(aTHX_ a)
|
||||
#define is_utf8_X_LV(a) Perl_is_utf8_X_LV(aTHX_ a)
|
||||
@ -1584,6 +1584,7 @@
|
||||
# endif
|
||||
# if defined(PERL_IN_UTF8_C)
|
||||
#define _to_fold_latin1(a,b,c,d) Perl__to_fold_latin1(aTHX_ a,b,c,d)
|
||||
#define check_locale_boundary_crossing(a,b,c,d) S_check_locale_boundary_crossing(aTHX_ a,b,c,d)
|
||||
#define is_utf8_char_slow S_is_utf8_char_slow
|
||||
#define is_utf8_common(a,b,c) S_is_utf8_common(aTHX_ a,b,c)
|
||||
#define swash_get(a,b,c) S_swash_get(aTHX_ a,b,c)
|
||||
|
||||
46
proto.h
46
proto.h
@ -35,12 +35,30 @@ PERL_CALLCONV UV Perl__to_uni_fold_flags(pTHX_ UV c, U8 *p, STRLEN *lenp, U8 fla
|
||||
#define PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS \
|
||||
assert(p); assert(lenp)
|
||||
|
||||
PERL_CALLCONV UV Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags)
|
||||
PERL_CALLCONV UV Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, bool* tainted_ptr)
|
||||
__attribute__nonnull__(pTHX_1)
|
||||
__attribute__nonnull__(pTHX_2);
|
||||
#define PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS \
|
||||
assert(p); assert(ustrp)
|
||||
|
||||
PERL_CALLCONV UV Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags, bool* tainted_ptr)
|
||||
__attribute__nonnull__(pTHX_1)
|
||||
__attribute__nonnull__(pTHX_2);
|
||||
#define PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS \
|
||||
assert(p); assert(ustrp)
|
||||
|
||||
PERL_CALLCONV UV Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags, bool* tainted_ptr)
|
||||
__attribute__nonnull__(pTHX_1)
|
||||
__attribute__nonnull__(pTHX_2);
|
||||
#define PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS \
|
||||
assert(p); assert(ustrp)
|
||||
|
||||
PERL_CALLCONV UV Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags, bool* tainted_ptr)
|
||||
__attribute__nonnull__(pTHX_1)
|
||||
__attribute__nonnull__(pTHX_2);
|
||||
#define PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS \
|
||||
assert(p); assert(ustrp)
|
||||
|
||||
PERL_CALLCONV PADOFFSET Perl_allocmy(pTHX_ const char *const name, const STRLEN len, const U32 flags)
|
||||
__attribute__nonnull__(pTHX_1);
|
||||
#define PERL_ARGS_ASSERT_ALLOCMY \
|
||||
@ -4438,23 +4456,17 @@ PERL_CALLCONV UV Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, S
|
||||
__attribute__nonnull__(pTHX_1)
|
||||
__attribute__nonnull__(pTHX_2); */
|
||||
|
||||
PERL_CALLCONV UV Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
|
||||
/* PERL_CALLCONV UV Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
|
||||
__attribute__nonnull__(pTHX_1)
|
||||
__attribute__nonnull__(pTHX_2);
|
||||
#define PERL_ARGS_ASSERT_TO_UTF8_LOWER \
|
||||
assert(p); assert(ustrp)
|
||||
__attribute__nonnull__(pTHX_2); */
|
||||
|
||||
PERL_CALLCONV UV Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
|
||||
/* PERL_CALLCONV UV Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
|
||||
__attribute__nonnull__(pTHX_1)
|
||||
__attribute__nonnull__(pTHX_2);
|
||||
#define PERL_ARGS_ASSERT_TO_UTF8_TITLE \
|
||||
assert(p); assert(ustrp)
|
||||
__attribute__nonnull__(pTHX_2); */
|
||||
|
||||
PERL_CALLCONV UV Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
|
||||
/* PERL_CALLCONV UV Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
|
||||
__attribute__nonnull__(pTHX_1)
|
||||
__attribute__nonnull__(pTHX_2);
|
||||
#define PERL_ARGS_ASSERT_TO_UTF8_UPPER \
|
||||
assert(p); assert(ustrp)
|
||||
__attribute__nonnull__(pTHX_2); */
|
||||
|
||||
PERL_CALLCONV bool Perl_try_amagic_bin(pTHX_ int method, int flags);
|
||||
PERL_CALLCONV bool Perl_try_amagic_un(pTHX_ int method, int flags);
|
||||
@ -7025,6 +7037,14 @@ PERL_CALLCONV UV Perl__to_fold_latin1(pTHX_ const U8 c, U8 *p, STRLEN *lenp, con
|
||||
#define PERL_ARGS_ASSERT__TO_FOLD_LATIN1 \
|
||||
assert(p); assert(lenp)
|
||||
|
||||
STATIC UV S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* const ustrp, STRLEN *lenp)
|
||||
__attribute__warn_unused_result__
|
||||
__attribute__nonnull__(pTHX_1)
|
||||
__attribute__nonnull__(pTHX_3)
|
||||
__attribute__nonnull__(pTHX_4);
|
||||
#define PERL_ARGS_ASSERT_CHECK_LOCALE_BOUNDARY_CROSSING \
|
||||
assert(p); assert(ustrp); assert(lenp)
|
||||
|
||||
STATIC STRLEN S_is_utf8_char_slow(const U8 *s, const STRLEN len)
|
||||
__attribute__warn_unused_result__
|
||||
__attribute__nonnull__(1);
|
||||
|
||||
239
utf8.c
239
utf8.c
@ -2108,6 +2108,53 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
|
||||
return len ? utf8_to_uvchr(ustrp, 0) : 0;
|
||||
}
|
||||
|
||||
STATIC UV
|
||||
S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* const ustrp, STRLEN *lenp)
|
||||
{
|
||||
/* This is called when changing the case of a utf8-encoded character above
|
||||
* the Latin1 range, and the operation is in locale. If the result
|
||||
* contains a character that crosses the 255/256 boundary, disallow the
|
||||
* change, and return the original code point. See L<perlfunc/lc> for why;
|
||||
*
|
||||
* p points to the original string whose case was changed
|
||||
* result the code point of the first character in the changed-case string
|
||||
* ustrp points to the changed-case string (<result> represents its first char)
|
||||
* lenp points to the length of <ustrp> */
|
||||
|
||||
UV original; /* To store the first code point of <p> */
|
||||
|
||||
PERL_ARGS_ASSERT_CHECK_LOCALE_BOUNDARY_CROSSING;
|
||||
|
||||
assert(! UTF8_IS_INVARIANT(*p) && ! UTF8_IS_DOWNGRADEABLE_START(*p));
|
||||
|
||||
/* We know immediately if the first character in the string crosses the
|
||||
* boundary, so can skip */
|
||||
if (result > 255) {
|
||||
|
||||
/* Look at every character in the result; if any cross the
|
||||
* boundary, the whole thing is disallowed */
|
||||
U8* s = ustrp + UTF8SKIP(ustrp);
|
||||
U8* e = ustrp + *lenp;
|
||||
while (s < e) {
|
||||
if (UTF8_IS_INVARIANT(*s) || UTF8_IS_DOWNGRADEABLE_START(*s))
|
||||
{
|
||||
goto bad_crossing;
|
||||
}
|
||||
s += UTF8SKIP(s);
|
||||
}
|
||||
|
||||
/* Here, no characters crossed, result is ok as-is */
|
||||
return result;
|
||||
}
|
||||
|
||||
bad_crossing:
|
||||
|
||||
/* Failed, have to return the original */
|
||||
original = utf8_to_uvchr(p, lenp);
|
||||
Copy(p, ustrp, *lenp, char);
|
||||
return original;
|
||||
}
|
||||
|
||||
/*
|
||||
=for apidoc to_utf8_upper
|
||||
|
||||
@ -2121,22 +2168,61 @@ The first character of the uppercased version is returned
|
||||
|
||||
=cut */
|
||||
|
||||
/* Not currently externally documented, and subject to change:
|
||||
* <flags> is set iff locale semantics are to be used for code points < 256
|
||||
* <tainted_ptr> if non-null, *tainted_ptr will be set TRUE iff locale rules
|
||||
* were used in the calculation; otherwise unchanged. */
|
||||
|
||||
UV
|
||||
Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
|
||||
Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags, bool* tainted_ptr)
|
||||
{
|
||||
dVAR;
|
||||
|
||||
PERL_ARGS_ASSERT_TO_UTF8_UPPER;
|
||||
UV result;
|
||||
|
||||
PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS;
|
||||
|
||||
if (UTF8_IS_INVARIANT(*p)) {
|
||||
if (flags) {
|
||||
result = toUPPER_LC(*p);
|
||||
}
|
||||
else {
|
||||
return _to_upper_title_latin1(*p, ustrp, lenp, 'S');
|
||||
}
|
||||
}
|
||||
else if UTF8_IS_DOWNGRADEABLE_START(*p) {
|
||||
if (flags) {
|
||||
result = toUPPER_LC(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)));
|
||||
}
|
||||
else {
|
||||
return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)),
|
||||
ustrp, lenp, 'S');
|
||||
}
|
||||
}
|
||||
else { /* utf8, ord above 255 */
|
||||
result = CALL_UPPER_CASE(p, ustrp, lenp);
|
||||
|
||||
if (flags) {
|
||||
result = check_locale_boundary_crossing(p, result, ustrp, lenp);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
return CALL_UPPER_CASE(p, ustrp, lenp);
|
||||
/* Here, used locale rules. Convert back to utf8 */
|
||||
if (UTF8_IS_INVARIANT(result)) {
|
||||
*ustrp = (U8) result;
|
||||
*lenp = 1;
|
||||
}
|
||||
else {
|
||||
*ustrp = UTF8_EIGHT_BIT_HI(result);
|
||||
*(ustrp + 1) = UTF8_EIGHT_BIT_LO(result);
|
||||
*lenp = 2;
|
||||
}
|
||||
|
||||
if (tainted_ptr) {
|
||||
*tainted_ptr = TRUE;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -2152,22 +2238,63 @@ The first character of the titlecased version is returned
|
||||
|
||||
=cut */
|
||||
|
||||
/* Not currently externally documented, and subject to change:
|
||||
* <flags> is set iff locale semantics are to be used for code points < 256
|
||||
* Since titlecase is not defined in POSIX, uppercase is used instead
|
||||
* for these/
|
||||
* <tainted_ptr> if non-null, *tainted_ptr will be set TRUE iff locale rules
|
||||
* were used in the calculation; otherwise unchanged. */
|
||||
|
||||
UV
|
||||
Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
|
||||
Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags, bool* tainted_ptr)
|
||||
{
|
||||
dVAR;
|
||||
|
||||
PERL_ARGS_ASSERT_TO_UTF8_TITLE;
|
||||
UV result;
|
||||
|
||||
PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
|
||||
|
||||
if (UTF8_IS_INVARIANT(*p)) {
|
||||
if (flags) {
|
||||
result = toUPPER_LC(*p);
|
||||
}
|
||||
else {
|
||||
return _to_upper_title_latin1(*p, ustrp, lenp, 's');
|
||||
}
|
||||
}
|
||||
else if UTF8_IS_DOWNGRADEABLE_START(*p) {
|
||||
if (flags) {
|
||||
result = toUPPER_LC(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)));
|
||||
}
|
||||
else {
|
||||
return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)),
|
||||
ustrp, lenp, 's');
|
||||
}
|
||||
}
|
||||
else { /* utf8, ord above 255 */
|
||||
result = CALL_TITLE_CASE(p, ustrp, lenp);
|
||||
|
||||
if (flags) {
|
||||
result = check_locale_boundary_crossing(p, result, ustrp, lenp);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
return CALL_TITLE_CASE(p, ustrp, lenp);
|
||||
/* Here, used locale rules. Convert back to utf8 */
|
||||
if (UTF8_IS_INVARIANT(result)) {
|
||||
*ustrp = (U8) result;
|
||||
*lenp = 1;
|
||||
}
|
||||
else {
|
||||
*ustrp = UTF8_EIGHT_BIT_HI(result);
|
||||
*(ustrp + 1) = UTF8_EIGHT_BIT_LO(result);
|
||||
*lenp = 2;
|
||||
}
|
||||
|
||||
if (tainted_ptr) {
|
||||
*tainted_ptr = TRUE;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -2183,21 +2310,61 @@ The first character of the lowercased version is returned
|
||||
|
||||
=cut */
|
||||
|
||||
/* Not currently externally documented, and subject to change:
|
||||
* <flags> is set iff locale semantics are to be used for code points < 256
|
||||
* <tainted_ptr> if non-null, *tainted_ptr will be set TRUE iff locale rules
|
||||
* were used in the calculation; otherwise unchanged. */
|
||||
|
||||
UV
|
||||
Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
|
||||
Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags, bool* tainted_ptr)
|
||||
{
|
||||
UV result;
|
||||
|
||||
dVAR;
|
||||
|
||||
PERL_ARGS_ASSERT_TO_UTF8_LOWER;
|
||||
PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
|
||||
|
||||
if (UTF8_IS_INVARIANT(*p)) {
|
||||
if (flags) {
|
||||
result = toLOWER_LC(*p);
|
||||
}
|
||||
else {
|
||||
return to_lower_latin1(*p, ustrp, lenp);
|
||||
}
|
||||
}
|
||||
else if UTF8_IS_DOWNGRADEABLE_START(*p) {
|
||||
if (flags) {
|
||||
result = toLOWER_LC(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)));
|
||||
}
|
||||
else {
|
||||
return to_lower_latin1(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)), ustrp, lenp);
|
||||
}
|
||||
}
|
||||
else { /* utf8, ord above 255 */
|
||||
result = CALL_LOWER_CASE(p, ustrp, lenp);
|
||||
|
||||
if (flags) {
|
||||
result = check_locale_boundary_crossing(p, result, ustrp, lenp);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
return CALL_LOWER_CASE(p, ustrp, lenp);
|
||||
/* Here, used locale rules. Convert back to utf8 */
|
||||
if (UTF8_IS_INVARIANT(result)) {
|
||||
*ustrp = (U8) result;
|
||||
*lenp = 1;
|
||||
}
|
||||
else {
|
||||
*ustrp = UTF8_EIGHT_BIT_HI(result);
|
||||
*(ustrp + 1) = UTF8_EIGHT_BIT_LO(result);
|
||||
*lenp = 2;
|
||||
}
|
||||
|
||||
if (tainted_ptr) {
|
||||
*tainted_ptr = TRUE;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -2214,25 +2381,67 @@ The first character of the foldcased version is returned
|
||||
|
||||
=cut */
|
||||
|
||||
/* Not currently externally documented is 'flags', which currently is non-zero
|
||||
* if full case folds are to be used; otherwise simple folds */
|
||||
/* Not currently externally documented, and subject to change,
|
||||
* in <flags>
|
||||
* bit FOLD_FLAGS_LOCALE is set iff locale semantics are to be used for code
|
||||
* points < 256. Since foldcase is not defined in
|
||||
* POSIX, lowercase is used instead
|
||||
* bit FOLD_FLAGS_FULL is set iff full case folds are to be used;
|
||||
* otherwise simple folds
|
||||
* <tainted_ptr> if non-null, *tainted_ptr will be set TRUE iff locale rules
|
||||
* were used in the calculation; otherwise unchanged. */
|
||||
|
||||
UV
|
||||
Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags)
|
||||
Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, bool* tainted_ptr)
|
||||
{
|
||||
dVAR;
|
||||
|
||||
UV result;
|
||||
|
||||
PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
|
||||
|
||||
if (UTF8_IS_INVARIANT(*p)) {
|
||||
return _to_fold_latin1(*p, ustrp, lenp, flags);
|
||||
if (flags & FOLD_FLAGS_LOCALE) {
|
||||
result = toLOWER_LC(*p);
|
||||
}
|
||||
else {
|
||||
return _to_fold_latin1(*p, ustrp, lenp, cBOOL(flags & FOLD_FLAGS_FULL));
|
||||
}
|
||||
}
|
||||
else if UTF8_IS_DOWNGRADEABLE_START(*p) {
|
||||
if (flags & FOLD_FLAGS_LOCALE) {
|
||||
result = toLOWER_LC(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)));
|
||||
}
|
||||
else {
|
||||
return _to_fold_latin1(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)),
|
||||
ustrp, lenp, flags);
|
||||
ustrp, lenp, cBOOL(flags & FOLD_FLAGS_FULL));
|
||||
}
|
||||
}
|
||||
else { /* utf8, ord above 255 */
|
||||
result = CALL_FOLD_CASE(p, ustrp, lenp, flags);
|
||||
|
||||
if ((flags & FOLD_FLAGS_LOCALE)) {
|
||||
result = check_locale_boundary_crossing(p, result, ustrp, lenp);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
return CALL_FOLD_CASE(p, ustrp, lenp, flags);
|
||||
/* Here, used locale rules. Convert back to utf8 */
|
||||
if (UTF8_IS_INVARIANT(result)) {
|
||||
*ustrp = (U8) result;
|
||||
*lenp = 1;
|
||||
}
|
||||
else {
|
||||
*ustrp = UTF8_EIGHT_BIT_HI(result);
|
||||
*(ustrp + 1) = UTF8_EIGHT_BIT_LO(result);
|
||||
*lenp = 2;
|
||||
}
|
||||
|
||||
if (tainted_ptr) {
|
||||
*tainted_ptr = TRUE;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Note:
|
||||
|
||||
10
utf8.h
10
utf8.h
@ -16,8 +16,16 @@
|
||||
# define USE_UTF8_IN_NAMES (PL_hints & HINT_UTF8)
|
||||
#endif
|
||||
|
||||
/* For to_utf8_fold_flags, q.v. */
|
||||
#define FOLD_FLAGS_LOCALE 0x1
|
||||
#define FOLD_FLAGS_FULL 0x2
|
||||
|
||||
#define to_uni_fold(c, p, lenp) _to_uni_fold_flags(c, p, lenp, 1)
|
||||
#define to_utf8_fold(c, p, lenp) _to_utf8_fold_flags(c, p, lenp, 1)
|
||||
#define to_utf8_fold(c, p, lenp) _to_utf8_fold_flags(c, p, lenp, \
|
||||
FOLD_FLAGS_FULL, NULL)
|
||||
#define to_utf8_lower(a,b,c) _to_utf8_lower_flags(a,b,c,0, NULL)
|
||||
#define to_utf8_upper(a,b,c) _to_utf8_upper_flags(a,b,c,0, NULL)
|
||||
#define to_utf8_title(a,b,c) _to_utf8_title_flags(a,b,c,0, NULL)
|
||||
|
||||
/* Source backward compatibility. */
|
||||
#define uvuni_to_utf8(d, uv) uvuni_to_utf8_flags(d, uv, 0)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user