[ruby/json] Use __builtin_memcpy, if available, to copy overlapping byte ranges in copy_remaining_bytes to avoid a branch to MEMCPY. Additionally use a space as padding byte instead of an 'X' so it can be represented diretly on AArch64 with a single instruction.

https://github.com/ruby/json/commit/643ee11fed
2026-01-26 12:14:51 +00:00 · 2026-01-15 19:12:41 -06:00 · 2026-01-15 19:12:41 -06:00 · 456ef9140a
commit 456ef9140a
parent bc6c895d7b
3 changed files with 50 additions and 4 deletions
--- a/ext/json/generator/generator.c
+++ b/ext/json/generator/generator.c
@ -297,13 +297,43 @@ ALWAYS_INLINE(static) char *copy_remaining_bytes(search_state *search, unsigned
    char *s = (buf->ptr + buf->len);

    // Pad the buffer with dummy characters that won't need escaping.
-    // This seem wateful at first sight, but memset of vector length is very fast.
-    memset(s, 'X', vec_len);
+    // This seem wasteful at first sight, but memset of vector length is very fast.
+    // This is a space as it can be directly represented as an immediate on AArch64.
+    memset(s, ' ', vec_len);

    // Optimistically copy the remaining 'len' characters to the output FBuffer. If there are no characters
    // to escape, then everything ends up in the correct spot. Otherwise it was convenient temporary storage.
-    MEMCPY(s, search->ptr, char, len);
+#if defined(__has_builtin) && __has_builtin(__builtin_memcpy)

+#ifdef RBIMPL_ASSERT_OR_ASSUME
+    RBIMPL_ASSERT_OR_ASSUME(len < 16);
+#endif
+
+    if (vec_len == 16 && len >= 4) {
+        // If __builtin_memcpy is available, use it to copy between SIMD_MINIMUM_THRESHOLD and vec_len-1 bytes.
+        // These copies overlap. The first copy will copy the first 8 (or 4) bytes. The second copy will copy
+        // the last 8 (or 4) bytes but overlap with the first copy. The overlapping bytes will be in the correct
+        // position in both copies.
+
+        // Please do not attempt to replace __builtin_memcpy with memcpy without profiling and/or looking at the
+        // generated assembly. On clang-specifically (tested on Apple clang version 17.0.0 (clang-1700.0.13.3)),
+        // when using memcpy, the compiler will notice the only difference is a 4 or 8 and generate a conditional
+        // select instruction instead of direct loads and stores with a branch. This ends up slower than the branch
+        // plus two loads and stores generated when using __builtin_memcpy.
+        if (len >= 8) {
+            __builtin_memcpy(s, search->ptr, 8);
+            __builtin_memcpy(s + len - 8, search->ptr + len - 8, 8);
+        } else {
+            __builtin_memcpy(s, search->ptr, 4);
+            __builtin_memcpy(s + len - 4, search->ptr + len - 4, 4);
+        }
+    } else {
+        MEMCPY(s, search->ptr, char, len);
+    }
+#else
+    MEMCPY(s, search->ptr, char, len);
+#endif
+    
    return s;
 }

--- a/ext/json/simd/simd.h
+++ b/ext/json/simd/simd.h
@ -58,7 +58,7 @@ static inline int trailing_zeros(int input)

 #ifdef JSON_ENABLE_SIMD

-#define SIMD_MINIMUM_THRESHOLD 6
+#define SIMD_MINIMUM_THRESHOLD 4

 #if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(__aarch64__) || defined(_M_ARM64)
 #include <arm_neon.h>
--- a/test/json/json_generator_test.rb
+++ b/test/json/json_generator_test.rb
@ -650,6 +650,22 @@ class JSONGeneratorTest < Test::Unit::TestCase
    json = '"\\nabc"'
    assert_equal json, generate(data)
    #
+    data = "\n"
+    json = '"\\n"'
+    assert_equal json, generate(data)
+    #
+    (0..16).each do |i|
+      data = ('a' * i) + "\n"
+      json = '"' + ('a' * i) + '\\n"'
+      assert_equal json, generate(data)
+    end
+    #
+    (0..16).each do |i|
+      data =  "\n" + ('a' * i)
+      json = '"' + '\\n' + ('a' * i) + '"'
+      assert_equal json, generate(data)
+    end
+    #
    data = ["'"]
    json = '["\\\'"]'
    assert_equal '["\'"]', generate(data)