regcomp.c: Optimize a single Unicode property in a [character class]

All Unicode properties actually turn into bracketed character classes,
whether explicitly done or not.  A swash is generated for each property
in the class.  If that is the only thing not in the class's bitmap, it
specifies completely the non-bitmap behavior of the class, and can be
passed explicitly to regexec.c.  This avoids having to regenerate the
swash.  It also means that the same swash is used for multiple instances
of a property.  And that means the number of duplicated data structures
is greatly reduced.  This currently doesn't extend to cases where
multiple Unicode properties are used in the same class
[\p{greek}\p{latin}] will not share the same swash as another character
class with the same components.  This is because I don't know of a
an efficient method to determine if a new class being parsed has the
same components as one already generated.  I suppose some sort of
checksum could be generated, but that is for future consideration.
This commit is contained in:
Karl Williamson 2011-11-28 12:32:02 -07:00
parent dc415b4a4c
commit 88d45d285b
4 changed files with 29 additions and 12 deletions

View File

@ -10088,7 +10088,11 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
UV n;
/* Unicode properties are stored in a swash; this holds the current one
* being parsed. */
* being parsed. If this swash is the only above-latin1 component of the
* character class, an optimization is to pass it directly on to the
* execution engine. Otherwise, it is set to NULL to indicate that there
* are other things in the class that have to be dealt with at execution
* time */
SV* swash = NULL; /* Code points that match \p{} \P{} */
/* Set if a component of this character class is user-defined; just passed
@ -11099,11 +11103,17 @@ parseit:
av_store(av, 0, (SvCUR(listsv) == initial_listsv_len)
? &PL_sv_undef
: listsv);
av_store(av, 1, NULL); /* Placeholder for generated swash */
if (swash) {
av_store(av, 1, swash);
SvREFCNT_dec(nonbitmap);
}
else {
av_store(av, 1, NULL);
if (nonbitmap) {
av_store(av, 3, nonbitmap);
av_store(av, 4, newSVuv(has_user_defined_property));
}
}
/* Store any computed multi-char folds only if we are allowing
* them */

View File

@ -198,7 +198,9 @@ Code point 0x110000 is not Unicode, all \p{} matches fail; all \P{} matches succ
require "../test.pl";
use warnings 'utf8';
sub Is_Super { return '!utf8::Any' }
print "\x{1100000}" =~ /\p{Is_Super}/, "\n";
# The extra char is to avoid an optimization that avoids the problem when the
# property is the only non-latin1 char in a class
print "\x{1100000}" =~ /^[\p{Is_Super}\x{100}]$/, "\n";
EXPECT
1
########

View File

@ -20,9 +20,14 @@ BEGIN { # Make sure catches compile time references
my $s = 'foo';
$s =~ m/[\p{Hiragana}]/;
$s =~ m/[\p{Hiragana}]/;
$s =~ m/[\p{Hiragana}]/;
$s =~ m/[\p{Hiragana}]/;
# The second value is to prevent an optimization that exists at the time this
# is written to re-use a property without trying to look it up if it is the
# only thing in a character class. They differ in order to make sure that any
# future optimizations that don't re-use identical character classes don't come
# into play
$s =~ m/[\p{Hiragana}\x{101}]/;
$s =~ m/[\p{Hiragana}\x{102}]/;
$s =~ m/[\p{Hiragana}\x{103}]/;
$s =~ m/[\p{Hiragana}\x{104}]/;
is($::count, 1, "Swatch hash caching kept us from reloading swatch hash.");

10
utf8.c
View File

@ -2588,7 +2588,7 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m
SV** swash_invlistsvp = NULL;
SV* swash_invlist = NULL;
bool invlist_in_swash_is_valid = FALSE;
HV* swash_hv;
HV* swash_hv = NULL;
/* If this operation fetched a swash, get its already existing
* inversion list or create one for it */
@ -2945,10 +2945,10 @@ S_swatch_get(pTHX_ SV* swash, UV start, UV span)
HV *const hv = MUTABLE_HV(SvRV(swash));
SV** const invlistsvp = hv_fetchs(hv, "INVLIST", FALSE);
SV** listsvp; /* The string containing the main body of the table */
SV** extssvp;
SV** invert_it_svp;
U8* typestr;
SV** listsvp = NULL; /* The string containing the main body of the table */
SV** extssvp = NULL;
SV** invert_it_svp = NULL;
U8* typestr = NULL;
STRLEN bits;
STRLEN octets; /* if bits == 1, then octets == 0 */
UV none;