my $utf8here, our $utf8here, and package variable $utf8here.

The actual minimal fix is in utf8.c and from NI-S,
the rest are the tests (in fresh_perl since I couldn't get
them easily to work elsewhere) and a slight behaviour change:
previously UTF-8 identifiers had to start with an alphabetic
character.  No more so, now they can start with an (Unicode)
ID_Continue character (which however is not a (Unicode) digit).
(Limiting the first character to ID_Start would be rather
restrictive, since ID_Start allows only alphabetic letters.)

TODO: use vars qw($utf8here).  This I don't find to be
a showstopper.

p4raw-id: //depot/perl@15943
This commit is contained in:
Jarkko Hietaniemi 2002-04-16 03:59:00 +00:00
parent 58858581d2
commit 82686b017b
12 changed files with 84 additions and 13 deletions

View File

@ -342,6 +342,7 @@ Apd |bool |is_utf8_string |U8 *s|STRLEN len
Ap |bool |is_utf8_alnum |U8 *p
Ap |bool |is_utf8_alnumc |U8 *p
Ap |bool |is_utf8_idfirst|U8 *p
Ap |bool |is_utf8_idcont |U8 *p
Ap |bool |is_utf8_alpha |U8 *p
Ap |bool |is_utf8_ascii |U8 *p
Ap |bool |is_utf8_space |U8 *p

View File

@ -325,6 +325,7 @@
#define is_utf8_alnum Perl_is_utf8_alnum
#define is_utf8_alnumc Perl_is_utf8_alnumc
#define is_utf8_idfirst Perl_is_utf8_idfirst
#define is_utf8_idcont Perl_is_utf8_idcont
#define is_utf8_alpha Perl_is_utf8_alpha
#define is_utf8_ascii Perl_is_utf8_ascii
#define is_utf8_space Perl_is_utf8_space
@ -1892,6 +1893,7 @@
#define is_utf8_alnum(a) Perl_is_utf8_alnum(aTHX_ a)
#define is_utf8_alnumc(a) Perl_is_utf8_alnumc(aTHX_ a)
#define is_utf8_idfirst(a) Perl_is_utf8_idfirst(aTHX_ a)
#define is_utf8_idcont(a) Perl_is_utf8_idcont(aTHX_ a)
#define is_utf8_alpha(a) Perl_is_utf8_alpha(aTHX_ a)
#define is_utf8_ascii(a) Perl_is_utf8_ascii(aTHX_ a)
#define is_utf8_space(a) Perl_is_utf8_space(aTHX_ a)

View File

@ -427,6 +427,8 @@
#define PL_utf8_cntrl (PERL_GET_INTERP->Iutf8_cntrl)
#define PL_utf8_digit (PERL_GET_INTERP->Iutf8_digit)
#define PL_utf8_graph (PERL_GET_INTERP->Iutf8_graph)
#define PL_utf8_idcont (PERL_GET_INTERP->Iutf8_idcont)
#define PL_utf8_idstart (PERL_GET_INTERP->Iutf8_idstart)
#define PL_utf8_lower (PERL_GET_INTERP->Iutf8_lower)
#define PL_utf8_mark (PERL_GET_INTERP->Iutf8_mark)
#define PL_utf8_print (PERL_GET_INTERP->Iutf8_print)
@ -726,6 +728,8 @@
#define PL_utf8_cntrl (vTHX->Iutf8_cntrl)
#define PL_utf8_digit (vTHX->Iutf8_digit)
#define PL_utf8_graph (vTHX->Iutf8_graph)
#define PL_utf8_idcont (vTHX->Iutf8_idcont)
#define PL_utf8_idstart (vTHX->Iutf8_idstart)
#define PL_utf8_lower (vTHX->Iutf8_lower)
#define PL_utf8_mark (vTHX->Iutf8_mark)
#define PL_utf8_print (vTHX->Iutf8_print)
@ -1028,6 +1032,8 @@
#define PL_Iutf8_cntrl PL_utf8_cntrl
#define PL_Iutf8_digit PL_utf8_digit
#define PL_Iutf8_graph PL_utf8_graph
#define PL_Iutf8_idcont PL_utf8_idcont
#define PL_Iutf8_idstart PL_utf8_idstart
#define PL_Iutf8_lower PL_utf8_lower
#define PL_Iutf8_mark PL_utf8_mark
#define PL_Iutf8_print PL_utf8_print

View File

@ -460,7 +460,10 @@ Converts the specified character to lowercase.
#define isBLANK_LC_uni(c) isBLANK(c) /* could be wrong */
#define isALNUM_utf8(p) is_utf8_alnum(p)
#define isIDFIRST_utf8(p) is_utf8_idfirst(p)
/* The ID_Start of Unicode is quite limiting: it assumes a L-class
* character (meaning that you cannot have, say, a CJK character).
* Instead, let's allow ID_Continue but not digits. */
#define isIDFIRST_utf8(p) (is_utf8_idcont(p) && !is_utf8_digit(p))
#define isALPHA_utf8(p) is_utf8_alpha(p)
#define isSPACE_utf8(p) is_utf8_space(p)
#define isDIGIT_utf8(p) is_utf8_digit(p)

View File

@ -517,6 +517,9 @@ PERLVAR(IOpSlab,I32 *)
PERLVAR(Iwantutf8, bool) /* want utf8 as the default discipline */
PERLVAR(Iutf8_idstart, SV *)
PERLVAR(Iutf8_idcont, SV *)
/* New variables must be added to the very end for binary compatibility.
* XSUB.h provides wrapper functions via perlapi.h that make this
* irrelevant, but not all code may be expected to #include XSUB.h. */

View File

@ -12,6 +12,10 @@ sub import {
my ($pack, @imports) = @_;
my ($sym, $ch);
foreach (@imports) {
# TODO: UTF-8 names: (the unpack is quite wrong,
# /^(.)(.*)/ would probably be better.) While you
# are at it, until declaring empty package is made
# to work the * is too lenient.
($ch, $sym) = unpack('a1a*', $_);
if ($sym =~ tr/A-Za-z_0-9//c) {
# time for a more-detailed check-up
@ -20,10 +24,9 @@ sub import {
Carp::croak("Can't declare individual elements of hash or array");
} elsif (warnings::enabled() and length($sym) == 1 and $sym !~ tr/a-zA-Z//) {
warnings::warn("No need to declare built-in vars");
} elsif (($^H &= strict::bits('vars')) &&
# Either no 'use utf8' or if utf8, no non-word
($^H & 0x00800000 == 0 || # matches $utf8::hint_bits
$sym =~ /\W/) ) {
} elsif (($^H &= strict::bits('vars'))) {
# TODO: UTF-8 names: be careful to load the UTF-8
# machinery only if the symbol requires it.
require Carp;
Carp::croak("'$_' is not a valid variable name under strict vars");
}

4
perl.c
View File

@ -686,6 +686,8 @@ perl_destruct(pTHXx)
SvREFCNT_dec(PL_utf8_totitle);
SvREFCNT_dec(PL_utf8_tolower);
SvREFCNT_dec(PL_utf8_tofold);
SvREFCNT_dec(PL_utf8_idstart);
SvREFCNT_dec(PL_utf8_idcont);
PL_utf8_alnum = Nullsv;
PL_utf8_alnumc = Nullsv;
PL_utf8_ascii = Nullsv;
@ -704,6 +706,8 @@ perl_destruct(pTHXx)
PL_utf8_totitle = Nullsv;
PL_utf8_tolower = Nullsv;
PL_utf8_tofold = Nullsv;
PL_utf8_idstart = Nullsv;
PL_utf8_idcont = Nullsv;
if (!specialWARN(PL_compiling.cop_warnings))
SvREFCNT_dec(PL_compiling.cop_warnings);

View File

@ -582,6 +582,10 @@ END_EXTERN_C
#define PL_utf8_digit (*Perl_Iutf8_digit_ptr(aTHX))
#undef PL_utf8_graph
#define PL_utf8_graph (*Perl_Iutf8_graph_ptr(aTHX))
#undef PL_utf8_idcont
#define PL_utf8_idcont (*Perl_Iutf8_idcont_ptr(aTHX))
#undef PL_utf8_idstart
#define PL_utf8_idstart (*Perl_Iutf8_idstart_ptr(aTHX))
#undef PL_utf8_lower
#define PL_utf8_lower (*Perl_Iutf8_lower_ptr(aTHX))
#undef PL_utf8_mark

View File

@ -377,6 +377,7 @@ PERL_CALLCONV bool Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len);
PERL_CALLCONV bool Perl_is_utf8_alnum(pTHX_ U8 *p);
PERL_CALLCONV bool Perl_is_utf8_alnumc(pTHX_ U8 *p);
PERL_CALLCONV bool Perl_is_utf8_idfirst(pTHX_ U8 *p);
PERL_CALLCONV bool Perl_is_utf8_idcont(pTHX_ U8 *p);
PERL_CALLCONV bool Perl_is_utf8_alpha(pTHX_ U8 *p);
PERL_CALLCONV bool Perl_is_utf8_ascii(pTHX_ U8 *p);
PERL_CALLCONV bool Perl_is_utf8_space(pTHX_ U8 *p);

2
sv.c
View File

@ -10312,6 +10312,8 @@ perl_clone_using(PerlInterpreter *proto_perl, UV flags,
PL_utf8_totitle = sv_dup_inc(proto_perl->Iutf8_totitle, param);
PL_utf8_tolower = sv_dup_inc(proto_perl->Iutf8_tolower, param);
PL_utf8_tofold = sv_dup_inc(proto_perl->Iutf8_tofold, param);
PL_utf8_idstart = sv_dup_inc(proto_perl->Iutf8_idstart, param);
PL_utf8_idcont = sv_dup_inc(proto_perl->Iutf8_idcont, param);
/* swatch cache */
PL_last_swash_hv = Nullhv; /* reinits on demand */

View File

@ -788,3 +788,26 @@ package main;
$test = Foo->new(); # must be package var
EXPECT
ok
######## example from Camel 5, ch. 15, pp.406 (with my)
use strict;
use utf8;
my $ = 2; # 0xe4 0xba 0xba: U+4eba, "human" in CJK ideograph
$++; # a child is born
print $, "\n";
EXPECT
3
######## example from Camel 5, ch. 15, pp.406 (with our)
use strict;
use utf8;
our $ = 2; # 0xe4 0xba 0xba: U+4eba, "human" in CJK ideograph
$++; # a child is born
print $, "\n";
EXPECT
3
######## example from Camel 5, ch. 15, pp.406 (with package vars)
use utf8;
$ = 2; # 0xe4 0xba 0xba: U+4eba, "human" in CJK ideograph
$++; # a child is born
print $, "\n";
EXPECT
3

35
utf8.c
View File

@ -170,12 +170,11 @@ Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
=for apidoc A|STRLEN|is_utf8_char|U8 *s
Tests if some arbitrary number of bytes begins in a valid UTF-8
character. Note that an INVARIANT (i.e. ASCII) character is a valid UTF-8 character.
The actual number of bytes in the UTF-8 character will be returned if
it is valid, otherwise 0.
character. Note that an INVARIANT (i.e. ASCII) character is a valid
UTF-8 character. The actual number of bytes in the UTF-8 character
will be returned if it is valid, otherwise 0.
=cut
*/
=cut */
STRLEN
Perl_is_utf8_char(pTHX_ U8 *s)
{
@ -1156,9 +1155,27 @@ Perl_is_utf8_alnumc(pTHX_ U8 *p)
}
bool
Perl_is_utf8_idfirst(pTHX_ U8 *p)
Perl_is_utf8_idfirst(pTHX_ U8 *p) /* The naming is historical. */
{
return *p == '_' || is_utf8_alpha(p);
if (*p == '_')
return TRUE;
if (!is_utf8_char(p))
return FALSE;
if (!PL_utf8_idstart) /* is_utf8_idstart would be more logical. */
PL_utf8_idstart = swash_init("utf8", "IdStart", &PL_sv_undef, 0, 0);
return swash_fetch(PL_utf8_idstart, p, TRUE);
}
bool
Perl_is_utf8_idcont(pTHX_ U8 *p)
{
if (*p == '_')
return TRUE;
if (!is_utf8_char(p))
return FALSE;
if (!PL_utf8_idcont)
PL_utf8_idcont = swash_init("utf8", "IdContinue", &PL_sv_undef, 0, 0);
return swash_fetch(PL_utf8_idcont, p, TRUE);
}
bool
@ -1514,9 +1531,11 @@ Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
SAVEI32(PL_hints);
PL_hints = 0;
save_re_context();
if (PL_curcop == &PL_compiling)
if (PL_curcop == &PL_compiling) {
/* XXX ought to be handled by lex_start */
SAVEI32(PL_in_my);
sv_setpv(tokenbufsv, PL_tokenbuf);
}
errsv_save = newSVsv(ERRSV);
if (call_method("SWASHNEW", G_SCALAR))
retval = newSVsv(*PL_stack_sp--);