[ruby/json] Extract json_fast_memcpy16 for readability

https://github.com/ruby/json/commit/1b276c8623
This commit is contained in:
Jean Boussier 2026-01-16 18:43:08 +01:00 committed by git
parent 456ef9140a
commit 3164d4e8a2
3 changed files with 37 additions and 28 deletions

View File

@ -288,6 +288,8 @@ static inline void escape_UTF8_char(search_state *search, unsigned char ch_len)
ALWAYS_INLINE(static) char *copy_remaining_bytes(search_state *search, unsigned long vec_len, unsigned long len)
{
RBIMPL_ASSERT_OR_ASSUME(len < vec_len);
// Flush the buffer so everything up until the last 'len' characters are unflushed.
search_flush(search);
@ -303,37 +305,13 @@ ALWAYS_INLINE(static) char *copy_remaining_bytes(search_state *search, unsigned
// Optimistically copy the remaining 'len' characters to the output FBuffer. If there are no characters
// to escape, then everything ends up in the correct spot. Otherwise it was convenient temporary storage.
#if defined(__has_builtin) && __has_builtin(__builtin_memcpy)
#ifdef RBIMPL_ASSERT_OR_ASSUME
RBIMPL_ASSERT_OR_ASSUME(len < 16);
#endif
if (vec_len == 16 && len >= 4) {
// If __builtin_memcpy is available, use it to copy between SIMD_MINIMUM_THRESHOLD and vec_len-1 bytes.
// These copies overlap. The first copy will copy the first 8 (or 4) bytes. The second copy will copy
// the last 8 (or 4) bytes but overlap with the first copy. The overlapping bytes will be in the correct
// position in both copies.
// Please do not attempt to replace __builtin_memcpy with memcpy without profiling and/or looking at the
// generated assembly. On clang-specifically (tested on Apple clang version 17.0.0 (clang-1700.0.13.3)),
// when using memcpy, the compiler will notice the only difference is a 4 or 8 and generate a conditional
// select instruction instead of direct loads and stores with a branch. This ends up slower than the branch
// plus two loads and stores generated when using __builtin_memcpy.
if (len >= 8) {
__builtin_memcpy(s, search->ptr, 8);
__builtin_memcpy(s + len - 8, search->ptr + len - 8, 8);
} else {
__builtin_memcpy(s, search->ptr, 4);
__builtin_memcpy(s + len - 4, search->ptr + len - 4, 4);
}
if (vec_len == 16) {
RBIMPL_ASSERT_OR_ASSUME(len >= SIMD_MINIMUM_THRESHOLD);
json_fast_memcpy16(s, search->ptr, len);
} else {
MEMCPY(s, search->ptr, char, len);
}
#else
MEMCPY(s, search->ptr, char, len);
#endif
return s;
}

View File

@ -5,6 +5,10 @@
#include "ruby/encoding.h"
#include <stdint.h>
#ifndef RBIMPL_ASSERT_OR_ASSUME
# define RBIMPL_ASSERT_OR_ASSUME(x)
#endif
#if defined(RUBY_DEBUG) && RUBY_DEBUG
# define JSON_ASSERT RUBY_ASSERT
#else

View File

@ -60,6 +60,33 @@ static inline int trailing_zeros(int input)
#define SIMD_MINIMUM_THRESHOLD 4
// Copy `len` bytes from `src` to `dst`, where SIMD_MINIMUM_THRESHOLD (4) <= len < 16.
// Instead of a byte-granular loop, this performs two overlapping fixed-size copies
// (8+8 or 4+4 bytes): the first covers the head of the range, the second is anchored
// to the tail, and any bytes copied twice land in the same position both times, so
// the destination ends up with a correct copy of all `len` bytes.
// NOTE: `src` and `dst` must not overlap (both copies assume memcpy semantics).
ALWAYS_INLINE(static) void json_fast_memcpy16(char *dst, const char *src, size_t len)
{
    // Caller contract: strictly fewer bytes than one 16-byte vector...
    RBIMPL_ASSERT_OR_ASSUME(len < 16);
    // ...and at least 4 bytes, so the smallest (4+4) overlapping-copy pair is valid.
    RBIMPL_ASSERT_OR_ASSUME(len >= SIMD_MINIMUM_THRESHOLD); // 4
#if defined(__has_builtin) && __has_builtin(__builtin_memcpy)
    // If __builtin_memcpy is available, use it to copy between SIMD_MINIMUM_THRESHOLD (4) and vec_len-1 (15) bytes.
    // These copies overlap. The first copy will copy the first 8 (or 4) bytes. The second copy will copy
    // the last 8 (or 4) bytes but overlap with the first copy. The overlapping bytes will be in the correct
    // position in both copies.
    // Please do not attempt to replace __builtin_memcpy with memcpy without profiling and/or looking at the
    // generated assembly. On clang specifically (tested on Apple clang version 17.0.0 (clang-1700.0.13.3)),
    // when using memcpy, the compiler will notice the only difference is a 4 or 8 and generate a conditional
    // select instruction instead of direct loads and stores with a branch. This ends up slower than the branch
    // plus two loads and stores generated when using __builtin_memcpy.
    if (len >= 8) {
        __builtin_memcpy(dst, src, 8);
        __builtin_memcpy(dst + len - 8, src + len - 8, 8);
    } else {
        __builtin_memcpy(dst, src, 4);
        __builtin_memcpy(dst + len - 4, src + len - 4, 4);
    }
#else
    // Fallback when __builtin_memcpy is unavailable: Ruby's MEMCPY macro
    // performs a plain element-wise copy of `len` chars.
    MEMCPY(dst, src, char, len);
#endif
}
#if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(__aarch64__) || defined(_M_ARM64)
#include <arm_neon.h>