mirror of
https://github.com/ruby/ruby.git
synced 2026-01-26 12:14:51 +00:00
Fix regexp performance regression for patterns starting with s/k
Commit 981ee02c7c ("Fix performance problem with /k/i and /s/i") was
merged for Ruby 4.0 to enable partial Boyer-Moore optimization for
patterns containing 's' or 'k' by using the prefix before those
characters.
However, when 's' or 'k' appears at the start of a pattern (no usable
prefix), set_bm_skip() returns 0 and the code returned early without
setting any optimization mode, leaving reg->optimize at
ONIG_OPTIMIZE_NONE. This caused up to 30x slowdown for patterns like
/slackware/i when matched against strings with non-ASCII characters.
This patch keeps the improvement from 981ee02c7c for patterns whose
usable prefix is at least 3 characters long, and fixes the regression
by falling back to ONIG_OPTIMIZE_EXACT_IC with the full pattern when
the usable prefix is shorter than 3 characters.
Before: /\bslackware\b/i with non-ASCII string: 2.24 us/op
After: /\bslackware\b/i with non-ASCII string: 0.70 us/op (3.2x faster)
[Bug #21824]
This commit is contained in:
parent
09cd13114a
commit
5de4cc5608
Notes:
git
2026-01-13 01:20:10 +00:00
14
regcomp.c
14
regcomp.c
@ -5264,18 +5264,24 @@ set_optimize_exact_info(regex_t* reg, OptExactInfo* e)
|
||||
|
||||
if (e->ignore_case > 0) {
|
||||
if (e->len >= 3 || (e->len >= 2 && allow_reverse)) {
|
||||
int orig_len = e->len;
|
||||
e->len = set_bm_skip(reg->exact, reg->exact_end, reg,
|
||||
reg->map, 1);
|
||||
reg->exact_end = reg->exact + e->len;
|
||||
if (e->len >= 3) {
|
||||
reg->exact_end = reg->exact + e->len;
|
||||
reg->optimize = (allow_reverse != 0
|
||||
? ONIG_OPTIMIZE_EXACT_BM_IC : ONIG_OPTIMIZE_EXACT_BM_NOT_REV_IC);
|
||||
}
|
||||
else if (e->len > 0) {
|
||||
else {
|
||||
/* Even if BM skip table can't be built (e.g., pattern starts with
|
||||
's' or 'k' which have multi-byte case fold variants), we should
|
||||
still use EXACT_IC optimization with the original pattern.
|
||||
Without this fallback, patterns like /slackware/i have no
|
||||
optimization at all, causing severe performance regression
|
||||
especially with non-ASCII strings. See [Bug #21824] */
|
||||
e->len = orig_len; /* Restore original length for EXACT_IC */
|
||||
reg->optimize = ONIG_OPTIMIZE_EXACT_IC;
|
||||
}
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
else {
|
||||
reg->optimize = ONIG_OPTIMIZE_EXACT_IC;
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user