Improve performance of UnicodeNormalize.canonical_ordering_one

Use array_of_integer.sort! instead of a bubble-sort-like algorithm
This commit is contained in:
tompng 2025-09-09 21:21:22 +09:00 committed by Mari Imaizumi
parent 31e14ac7da
commit 377aa2a336
Notes: git 2025-10-25 12:20:02 +00:00
2 changed files with 33 additions and 8 deletions

View File

@ -82,16 +82,22 @@ module UnicodeNormalize # :nodoc:
## Canonical Ordering
# NOTE(review): this span is a rendered diff hunk whose +/- markers were lost, so the
# removed and the added body of canonical_ordering_one are interleaved and the text
# does not parse as one Ruby method. Judging by the hunk header (-82,16 +82,22) and
# the commit message, the pairwise-swap loop is the old code and the sort!-based
# loop is its replacement -- confirm against the real diff before relying on this.
def self.canonical_ordering_one(string)
# --- presumably the REMOVED implementation starts here ---
# Pair every character with its CLASS_TABLE entry, then reorder the pairs in place.
sorting = string.each_char.collect { |c| [c, CLASS_TABLE[c]] }
(sorting.length-2).downto(0) do |i| # almost, but not exactly bubble sort
(0..i).each do |j|
later_class = sorting[j+1].last
# Swap only when the later character's class is non-zero and strictly smaller
# than the class of the character before it.
if 0<later_class and later_class<sorting[j].last
sorting[j], sorting[j+1] = sorting[j+1], sorting[j]
end
# --- presumably the ADDED implementation starts here ---
# result: output string being built; unordered: sort keys of the pending run of
# characters whose class is non-zero.
result = ''
unordered = []
chars = string.chars
n = chars.size
chars.each_with_index do |char, i|
ccc = CLASS_TABLE[char]
if ccc == 0
# A class-0 character ends the pending run: emit the run in sorted order,
# reset it, then emit the class-0 character itself.
unordered.sort!.each { result << chars[it % n] }
unordered.clear
result << char
else
# Pack class and original index into one Integer. Because i < n, sorting these
# keys orders by class first and by original position second, and `key % n`
# recovers the index into chars.
unordered << ccc * n + i
end
end
# Presumably removed: return value of the old swap-based implementation.
return sorting.collect(&:first).join('')
# Presumably added: flush the run still pending at the end of the string.
unordered.sort!.each { result << chars[it % n] }
result
end
## Normalization Forms for Patterns (not whole Strings)

View File

@ -209,4 +209,23 @@ class TestUnicodeNormalize
assert_equal true, ascii_string.unicode_normalized?(:nfkc)
assert_equal true, ascii_string.unicode_normalized?(:nfkd)
end
# Feeds UnicodeNormalize.canonical_ordering_one a series of strings and checks each
# result against the expected ordering. Every row below is [expected, input].
def test_canonical_ordering
  alpha = "\u03B1\u0313\u0300\u0345"
  alpha_shuffled_a = "\u03B1\u0345\u0313\u0300"
  alpha_shuffled_b = "\u03B1\u0313\u0345\u0300"
  upper_u_a = "U\u0308\u0304"
  upper_u_b = "U\u0304\u0308"
  ess = "s\u0323\u0307"
  ess_shuffled = "s\u0307\u0323"
  other = "\u{1611e}\u{1611e}\u{1611f}"

  expectations = [
    # Actual cases called through String#unicode_normalize
    [ess + other, ess_shuffled + other],
    [alpha[1..], alpha_shuffled_a[1..]],
    [alpha[1..] + other, alpha_shuffled_b[1..] + other],
    # Artificial cases
    [
      alpha + upper_u_a + other + upper_u_b + ess,
      alpha + upper_u_a + other + upper_u_b + ess
    ],
    [
      ess[1..] + alpha + alpha,
      ess_shuffled[1..] + alpha_shuffled_a + alpha_shuffled_b
    ],
    [
      other + ess + upper_u_a + alpha + other + alpha + upper_u_b + other,
      other + ess_shuffled + upper_u_a + alpha_shuffled_a + other + alpha_shuffled_b + upper_u_b + other
    ]
  ]

  expectations.each do |expected, input|
    assert_equal(expected, UnicodeNormalize.canonical_ordering_one(input))
  end
end
end