mirror of
https://github.com/ruby/ruby.git
synced 2026-01-27 20:44:20 +00:00
1673 lines
53 KiB
C
1673 lines
53 KiB
C
#include "../json.h"
|
|
#include "../vendor/ryu.h"
|
|
#include "../simd/simd.h"
|
|
|
|
static VALUE mJSON, eNestingError, Encoding_UTF_8;
|
|
static VALUE CNaN, CInfinity, CMinusInfinity;
|
|
|
|
static ID i_new, i_try_convert, i_uminus, i_encode;
|
|
|
|
static VALUE sym_max_nesting, sym_allow_nan, sym_allow_trailing_comma, sym_allow_control_characters, sym_symbolize_names, sym_freeze,
|
|
sym_decimal_class, sym_on_load, sym_allow_duplicate_key;
|
|
|
|
static int binary_encindex;
|
|
static int utf8_encindex;
|
|
|
|
#ifndef HAVE_RB_HASH_BULK_INSERT
|
|
// For TruffleRuby
|
|
static void
|
|
rb_hash_bulk_insert(long count, const VALUE *pairs, VALUE hash)
|
|
{
|
|
long index = 0;
|
|
while (index < count) {
|
|
VALUE name = pairs[index++];
|
|
VALUE value = pairs[index++];
|
|
rb_hash_aset(hash, name, value);
|
|
}
|
|
RB_GC_GUARD(hash);
|
|
}
|
|
#endif
|
|
|
|
#ifndef HAVE_RB_HASH_NEW_CAPA
|
|
#define rb_hash_new_capa(n) rb_hash_new()
|
|
#endif
|
|
|
|
#ifndef HAVE_RB_STR_TO_INTERNED_STR
|
|
/*
 * Fallback for rb_str_to_interned_str: freezes `str` and returns the result
 * of calling the method identified by i_uminus on it.
 */
static VALUE rb_str_to_interned_str(VALUE str)
{
    VALUE frozen = rb_str_freeze(str);
    return rb_funcall(frozen, i_uminus, 0);
}
|
|
#endif
|
|
|
|
/* name cache */
|
|
|
|
#include <string.h>
|
|
#include <ctype.h>
|
|
|
|
// Object names are likely to be repeated, and are frozen.
|
|
// As such we can re-use them if we keep a cache of the ones we've seen so far,
|
|
// and save much more expensive lookups into the global fstring table.
|
|
// This cache implementation is deliberately simple, as we're optimizing for compactness,
|
|
// to be able to fit safely on the stack.
|
|
// As such, binary search into a sorted array gives a good tradeoff between compactness and
|
|
// performance.
|
|
#define JSON_RVALUE_CACHE_CAPA 63
|
|
typedef struct rvalue_cache_struct {
|
|
int length;
|
|
VALUE entries[JSON_RVALUE_CACHE_CAPA];
|
|
} rvalue_cache;
|
|
|
|
static rb_encoding *enc_utf8;
|
|
|
|
#define JSON_RVALUE_CACHE_MAX_ENTRY_LENGTH 55
|
|
|
|
static inline VALUE build_interned_string(const char *str, const long length)
|
|
{
|
|
# ifdef HAVE_RB_ENC_INTERNED_STR
|
|
return rb_enc_interned_str(str, length, enc_utf8);
|
|
# else
|
|
VALUE rstring = rb_utf8_str_new(str, length);
|
|
return rb_funcall(rb_str_freeze(rstring), i_uminus, 0);
|
|
# endif
|
|
}
|
|
|
|
/* Returns the Symbol corresponding to the `length` bytes at `str`. */
static inline VALUE build_symbol(const char *str, const long length)
{
    VALUE interned = build_interned_string(str, length);
    return rb_str_intern(interned);
}
|
|
|
|
/*
 * Inserts `rstring` at position `index`, shifting the following entries up
 * by one slot. The caller is responsible for checking that the cache has
 * room for one more entry.
 */
static void rvalue_cache_insert_at(rvalue_cache *cache, int index, VALUE rstring)
{
    VALUE *slot = &cache->entries[index];

    MEMMOVE(slot + 1, slot, VALUE, cache->length - index);
    *slot = rstring;
    cache->length++;
}
|
|
|
|
#define rstring_cache_memcmp memcmp
|
|
|
|
#if JSON_CPU_LITTLE_ENDIAN_64BITS
|
|
#if __has_builtin(__builtin_bswap64)
|
|
#undef rstring_cache_memcmp
|
|
ALWAYS_INLINE(static) int rstring_cache_memcmp(const char *str, const char *rptr, const long length)
|
|
{
|
|
// The libc memcmp has numerous complex optimizations, but in this particular case,
|
|
// we know the string is small (JSON_RVALUE_CACHE_MAX_ENTRY_LENGTH), so being able to
|
|
// inline a simpler memcmp outperforms calling the libc version.
|
|
long i = 0;
|
|
|
|
for (; i + 8 <= length; i += 8) {
|
|
uint64_t a, b;
|
|
memcpy(&a, str + i, 8);
|
|
memcpy(&b, rptr + i, 8);
|
|
if (a != b) {
|
|
a = __builtin_bswap64(a);
|
|
b = __builtin_bswap64(b);
|
|
return (a < b) ? -1 : 1;
|
|
}
|
|
}
|
|
|
|
for (; i < length; i++) {
|
|
if (str[i] != rptr[i]) {
|
|
return (str[i] < rptr[i]) ? -1 : 1;
|
|
}
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
#endif
|
|
#endif
|
|
|
|
/*
 * Orders the byte range (`str`, `length`) against the Ruby string `rstring`:
 * first by length, then by content for equal lengths.
 */
ALWAYS_INLINE(static) int rstring_cache_cmp(const char *str, const long length, VALUE rstring)
{
    const char *cached_ptr;
    long cached_length;

    RSTRING_GETMEM(rstring, cached_ptr, cached_length);

    if (length != cached_length) {
        return (int)(length - cached_length);
    }
    return rstring_cache_memcmp(str, cached_ptr, length);
}
|
|
|
|
/*
 * Looks up (`str`, `length`) in the cache by binary search. On a miss, an
 * interned string is built and, if the cache is not full, inserted at the
 * position where the search ended. Returns the cached or newly built string.
 */
ALWAYS_INLINE(static) VALUE rstring_cache_fetch(rvalue_cache *cache, const char *str, const long length)
{
    int lo = 0;
    int hi = cache->length - 1;

    while (lo <= hi) {
        const int probe = (lo + hi) / 2;
        VALUE candidate = cache->entries[probe];
        const int order = rstring_cache_cmp(str, length, candidate);

        if (order > 0) {
            lo = probe + 1;
        } else if (order < 0) {
            hi = probe - 1;
        } else {
            return candidate;
        }
    }

    VALUE interned = build_interned_string(str, length);

    if (cache->length < JSON_RVALUE_CACHE_CAPA) {
        rvalue_cache_insert_at(cache, lo, interned);
    }
    return interned;
}
|
|
|
|
/*
 * Symbol flavour of the cache lookup: entries are Symbols and are compared
 * through rb_sym2str(). On a miss, a Symbol is built and, if the cache is not
 * full, inserted at the position where the search ended.
 */
static VALUE rsymbol_cache_fetch(rvalue_cache *cache, const char *str, const long length)
{
    int lo = 0;
    int hi = cache->length - 1;

    while (lo <= hi) {
        const int probe = (lo + hi) / 2;
        VALUE candidate = cache->entries[probe];
        const int order = rstring_cache_cmp(str, length, rb_sym2str(candidate));

        if (order > 0) {
            lo = probe + 1;
        } else if (order < 0) {
            hi = probe - 1;
        } else {
            return candidate;
        }
    }

    VALUE symbol = build_symbol(str, length);

    if (cache->length < JSON_RVALUE_CACHE_CAPA) {
        rvalue_cache_insert_at(cache, lo, symbol);
    }
    return symbol;
}
|
|
|
|
/* rvalue stack */
|
|
|
|
#define RVALUE_STACK_INITIAL_CAPA 128
|
|
|
|
/* Tells whether an rvalue_stack's storage may be reallocated or must first be spilled. */
enum rvalue_stack_type {
    RVALUE_STACK_HEAP_ALLOCATED = 0,  /* grown in place with REALLOC_N */
    RVALUE_STACK_STACK_ALLOCATED = 1, /* copied into a new stack by rvalue_stack_spill */
};
|
|
|
|
typedef struct rvalue_stack_struct {
|
|
enum rvalue_stack_type type;
|
|
long capa;
|
|
long head;
|
|
VALUE *ptr;
|
|
} rvalue_stack;
|
|
|
|
static rvalue_stack *rvalue_stack_spill(rvalue_stack *old_stack, VALUE *handle, rvalue_stack **stack_ref);
|
|
|
|
/*
 * Doubles the capacity of `stack`. A stack-allocated stack is spilled into a
 * new one (see rvalue_stack_spill); otherwise the storage is reallocated in
 * place. Returns the stack to use from now on.
 */
static rvalue_stack *rvalue_stack_grow(rvalue_stack *stack, VALUE *handle, rvalue_stack **stack_ref)
{
    if (stack->type == RVALUE_STACK_STACK_ALLOCATED) {
        return rvalue_stack_spill(stack, handle, stack_ref);
    }

    const long doubled = stack->capa * 2;
    REALLOC_N(stack->ptr, VALUE, doubled);
    stack->capa = doubled;
    return stack;
}
|
|
|
|
/* Pushes `value` on the stack, growing it first when it is full. Returns `value`. */
static VALUE rvalue_stack_push(rvalue_stack *stack, VALUE value, VALUE *handle, rvalue_stack **stack_ref)
{
    if (RB_UNLIKELY(stack->head >= stack->capa)) {
        stack = rvalue_stack_grow(stack, handle, stack_ref);
    }

    stack->ptr[stack->head++] = value;
    return value;
}
|
|
|
|
/* Returns a pointer to the `count` most recently pushed values, oldest first. */
static inline VALUE *rvalue_stack_peek(rvalue_stack *stack, long count)
{
    const long first = stack->head - count;
    return &stack->ptr[first];
}
|
|
|
|
/* Discards the `count` most recently pushed values. */
static inline void rvalue_stack_pop(rvalue_stack *stack, long count)
{
    stack->head = stack->head - count;
}
|
|
|
|
static void rvalue_stack_mark(void *ptr)
|
|
{
|
|
rvalue_stack *stack = (rvalue_stack *)ptr;
|
|
long index;
|
|
for (index = 0; index < stack->head; index++) {
|
|
rb_gc_mark(stack->ptr[index]);
|
|
}
|
|
}
|
|
|
|
static void rvalue_stack_free(void *ptr)
|
|
{
|
|
rvalue_stack *stack = (rvalue_stack *)ptr;
|
|
if (stack) {
|
|
ruby_xfree(stack->ptr);
|
|
ruby_xfree(stack);
|
|
}
|
|
}
|
|
|
|
static size_t rvalue_stack_memsize(const void *ptr)
|
|
{
|
|
const rvalue_stack *stack = (const rvalue_stack *)ptr;
|
|
return sizeof(rvalue_stack) + sizeof(VALUE) * stack->capa;
|
|
}
|
|
|
|
static const rb_data_type_t JSON_Parser_rvalue_stack_type = {
|
|
"JSON::Ext::Parser/rvalue_stack",
|
|
{
|
|
.dmark = rvalue_stack_mark,
|
|
.dfree = rvalue_stack_free,
|
|
.dsize = rvalue_stack_memsize,
|
|
},
|
|
0, 0,
|
|
RUBY_TYPED_FREE_IMMEDIATELY,
|
|
};
|
|
|
|
/*
 * Moves a stack-allocated stack into a new heap-allocated one with twice the
 * capacity.
 *
 * - `old_stack`: the stack to copy from; it is left untouched.
 * - `handle`: receives the TypedData object that owns the new stack.
 * - `stack_ref`: receives the pointer to the new stack.
 *
 * Returns the new stack.
 */
static rvalue_stack *rvalue_stack_spill(rvalue_stack *old_stack, VALUE *handle, rvalue_stack **stack_ref)
{
    rvalue_stack *stack;
    // TypedData_Make_Struct zero-fills the struct, so until the fields are
    // assigned below the new stack is empty (head == 0, ptr == NULL).
    *handle = TypedData_Make_Struct(0, rvalue_stack, &JSON_Parser_rvalue_stack_type, stack);
    *stack_ref = stack;

    // Allocate the new storage BEFORE publishing anything in `stack`: if
    // ALLOC_N raises, the struct must not hold old_stack->ptr, which
    // rvalue_stack_free() would otherwise pass to ruby_xfree() even though
    // it was not allocated by this stack.
    const long capa = old_stack->capa << 1;
    VALUE *ptr = ALLOC_N(VALUE, capa);
    MEMCPY(ptr, old_stack->ptr, VALUE, old_stack->head);

    stack->type = RVALUE_STACK_HEAP_ALLOCATED;
    stack->capa = capa;
    stack->ptr = ptr;
    stack->head = old_stack->head;
    return stack;
}
|
|
|
|
/*
 * Frees the stack owned by `handle` right away and detaches it from the
 * object by clearing its data pointer. Does nothing when `handle` is 0.
 */
static void rvalue_stack_eagerly_release(VALUE handle)
{
    if (!handle) {
        return;
    }

    rvalue_stack *owned;
    TypedData_Get_Struct(handle, rvalue_stack, &JSON_Parser_rvalue_stack_type, owned);
    RTYPEDDATA_DATA(handle) = NULL;
    rvalue_stack_free(owned);
}
|
|
|
|
/*
 * Encodes the code point `ch` as UTF-8 into `buf` and returns the number of
 * bytes written (1 to 4). Values above 0x1fffff are written as a single '?'.
 * `buf` must have room for 4 bytes.
 */
static int convert_UTF32_to_UTF8(char *buf, uint32_t ch)
{
    if (ch <= 0x7F) {
        buf[0] = (char) ch;
        return 1;
    }

    if (ch <= 0x07FF) {
        buf[0] = (char) (0xC0 | (ch >> 6));
        buf[1] = (char) (0x80 | (ch & 0x3F));
        return 2;
    }

    if (ch <= 0xFFFF) {
        buf[0] = (char) (0xE0 | (ch >> 12));
        buf[1] = (char) (0x80 | ((ch >> 6) & 0x3F));
        buf[2] = (char) (0x80 | (ch & 0x3F));
        return 3;
    }

    if (ch <= 0x1fffff) {
        buf[0] = (char) (0xF0 | (ch >> 18));
        buf[1] = (char) (0x80 | ((ch >> 12) & 0x3F));
        buf[2] = (char) (0x80 | ((ch >> 6) & 0x3F));
        buf[3] = (char) (0x80 | (ch & 0x3F));
        return 4;
    }

    buf[0] = '?';
    return 1;
}
|
|
|
|
/* What json_decode_object does when an object contains the same key twice. */
enum duplicate_key_action {
    JSON_DEPRECATED = 0, /* emit a deprecation warning */
    JSON_IGNORE,         /* do nothing */
    JSON_RAISE,          /* raise a parse error */
};
|
|
|
|
typedef struct JSON_ParserStruct {
|
|
VALUE on_load_proc;
|
|
VALUE decimal_class;
|
|
ID decimal_method_id;
|
|
enum duplicate_key_action on_duplicate_key;
|
|
int max_nesting;
|
|
bool allow_nan;
|
|
bool allow_trailing_comma;
|
|
bool allow_control_characters;
|
|
bool symbolize_names;
|
|
bool freeze;
|
|
} JSON_ParserConfig;
|
|
|
|
typedef struct JSON_ParserStateStruct {
|
|
VALUE stack_handle;
|
|
const char *start;
|
|
const char *cursor;
|
|
const char *end;
|
|
rvalue_stack *stack;
|
|
rvalue_cache name_cache;
|
|
int in_array;
|
|
int current_nesting;
|
|
} JSON_ParserState;
|
|
|
|
/* Number of bytes between the cursor and the end of the document. */
static inline size_t rest(JSON_ParserState *state)
{
    const char *from = state->cursor;
    return state->end - from;
}
|
|
|
|
/* True once the cursor has reached (or passed) the end of the document. */
static inline bool eos(JSON_ParserState *state)
{
    return !(state->cursor < state->end);
}
|
|
|
|
/* Returns the byte under the cursor, or 0 at the end of the document. */
static inline char peek(JSON_ParserState *state)
{
    return RB_UNLIKELY(eos(state)) ? 0 : *state->cursor;
}
|
|
|
|
/*
 * Computes the position of the cursor for error and warning messages.
 *
 * Walks backwards from the cursor: `*column_out` receives the number of bytes
 * visited before reaching a newline (or the start of the document), and
 * `*line_out` receives 1 plus the number of newlines found before that point.
 *
 * The walk uses an index relative to `state->start` rather than decrementing
 * a pointer, so that no pointer before the start of the document is formed.
 */
static void cursor_position(JSON_ParserState *state, long *line_out, long *column_out)
{
    const char *base = state->start;
    long pos = (long)(state->cursor - base);
    long column = 0;
    long line = 1;

    // Count the bytes of the current line, including the byte under the cursor.
    while (pos >= 0) {
        const char ch = base[pos];
        pos--;
        if (ch == '\n') {
            break;
        }
        column++;
    }

    // Count the newlines in the rest of the document before that line.
    while (pos >= 0) {
        if (base[pos] == '\n') {
            line++;
        }
        pos--;
    }

    *line_out = line;
    *column_out = column;
}
|
|
|
|
/*
 * Sends `message`, suffixed with the current line and column, to
 * JSON.deprecation_warning.
 */
static void emit_parse_warning(const char *message, JSON_ParserState *state)
{
    long line;
    long column;
    cursor_position(state, &line, &column);

    VALUE text = rb_sprintf("%s at line %ld column %ld", message, line, column);
    ID warn_method = rb_intern("deprecation_warning");
    rb_funcall(mJSON, warn_method, 1, text);
}
|
|
|
|
#define PARSE_ERROR_FRAGMENT_LEN 32
|
|
|
|
#ifdef RBIMPL_ATTR_NORETURN
|
|
RBIMPL_ATTR_NORETURN()
|
|
#endif
|
|
static void raise_parse_error(const char *format, JSON_ParserState *state)
|
|
{
|
|
unsigned char buffer[PARSE_ERROR_FRAGMENT_LEN + 3];
|
|
long line, column;
|
|
cursor_position(state, &line, &column);
|
|
|
|
const char *ptr = "EOF";
|
|
if (state->cursor && state->cursor < state->end) {
|
|
ptr = state->cursor;
|
|
size_t len = 0;
|
|
while (len < PARSE_ERROR_FRAGMENT_LEN) {
|
|
char ch = ptr[len];
|
|
if (!ch || ch == '\n' || ch == ' ' || ch == '\t' || ch == '\r') {
|
|
break;
|
|
}
|
|
len++;
|
|
}
|
|
|
|
if (len) {
|
|
buffer[0] = '\'';
|
|
MEMCPY(buffer + 1, ptr, char, len);
|
|
|
|
while (buffer[len] >= 0x80 && buffer[len] < 0xC0) { // Is continuation byte
|
|
len--;
|
|
}
|
|
|
|
if (buffer[len] >= 0xC0) { // multibyte character start
|
|
len--;
|
|
}
|
|
|
|
buffer[len + 1] = '\'';
|
|
buffer[len + 2] = '\0';
|
|
ptr = (const char *)buffer;
|
|
}
|
|
}
|
|
|
|
VALUE msg = rb_sprintf(format, ptr);
|
|
VALUE message = rb_enc_sprintf(enc_utf8, "%s at line %ld column %ld", RSTRING_PTR(msg), line, column);
|
|
RB_GC_GUARD(msg);
|
|
|
|
VALUE exc = rb_exc_new_str(rb_path2class("JSON::ParserError"), message);
|
|
rb_ivar_set(exc, rb_intern("@line"), LONG2NUM(line));
|
|
rb_ivar_set(exc, rb_intern("@column"), LONG2NUM(column));
|
|
rb_exc_raise(exc);
|
|
}
|
|
|
|
#ifdef RBIMPL_ATTR_NORETURN
|
|
RBIMPL_ATTR_NORETURN()
|
|
#endif
|
|
static void raise_parse_error_at(const char *format, JSON_ParserState *state, const char *at)
|
|
{
|
|
state->cursor = at;
|
|
raise_parse_error(format, state);
|
|
}
|
|
|
|
/* unicode */
|
|
|
|
/*
 * Value of each byte when read as a hexadecimal digit ('0'-'9', 'A'-'F',
 * 'a'-'f'), or -1 when the byte is not a hexadecimal digit.
 * One row per 16 byte values.
 */
static const signed char digit_values[256] = {
    /* 0x00 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0x10 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0x20 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0x30 */  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, -1, -1, -1, -1, -1, -1,
    /* 0x40 */ -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0x50 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0x60 */ -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0x70 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0x80 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0x90 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0xA0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0xB0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0xC0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0xD0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0xE0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0xF0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
};
|
|
|
|
/*
 * Decodes the four hexadecimal digits of a \uXXXX escape.
 *
 * `sp` points at the first digit and `spe` at the end of the string. Raises a
 * parse error, positioned two bytes before `sp`, when fewer than four bytes
 * are available or when one of them is not a hexadecimal digit.
 */
static uint32_t unescape_unicode(JSON_ParserState *state, const char *sp, const char *spe)
{
    if (RB_UNLIKELY(sp > spe - 4)) {
        raise_parse_error_at("incomplete unicode character escape sequence at %s", state, sp - 2);
    }

    const unsigned char *hex = (const unsigned char *)sp;
    signed char nibbles[4];
    signed char combined = 0;

    for (int i = 0; i < 4; i++) {
        nibbles[i] = digit_values[hex[i]];
        combined = (signed char)(combined | nibbles[i]);
    }

    // Invalid digits map to -1, so the OR of all four is negative as soon as
    // one of them is invalid.
    if (RB_UNLIKELY(combined < 0)) {
        raise_parse_error_at("incomplete unicode character escape sequence at %s", state, sp - 2);
    }

    uint32_t code = 0;
    for (int i = 0; i < 4; i++) {
        code = (code << 4) | (uint32_t)nibbles[i];
    }
    return code;
}
|
|
|
|
#define GET_PARSER_CONFIG \
|
|
JSON_ParserConfig *config; \
|
|
TypedData_Get_Struct(self, JSON_ParserConfig, &JSON_ParserConfig_type, config)
|
|
|
|
static const rb_data_type_t JSON_ParserConfig_type;
|
|
|
|
static void
|
|
json_eat_comments(JSON_ParserState *state)
|
|
{
|
|
const char *start = state->cursor;
|
|
state->cursor++;
|
|
|
|
switch (peek(state)) {
|
|
case '/': {
|
|
state->cursor = memchr(state->cursor, '\n', state->end - state->cursor);
|
|
if (!state->cursor) {
|
|
state->cursor = state->end;
|
|
} else {
|
|
state->cursor++;
|
|
}
|
|
break;
|
|
}
|
|
case '*': {
|
|
state->cursor++;
|
|
|
|
while (true) {
|
|
const char *next_match = memchr(state->cursor, '*', state->end - state->cursor);
|
|
if (!next_match) {
|
|
raise_parse_error_at("unterminated comment, expected closing '*/'", state, start);
|
|
}
|
|
|
|
state->cursor = next_match + 1;
|
|
if (peek(state) == '/') {
|
|
state->cursor++;
|
|
break;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
default:
|
|
raise_parse_error_at("unexpected token %s", state, start);
|
|
break;
|
|
}
|
|
}
|
|
|
|
ALWAYS_INLINE(static) void
|
|
json_eat_whitespace(JSON_ParserState *state)
|
|
{
|
|
while (true) {
|
|
switch (peek(state)) {
|
|
case ' ':
|
|
state->cursor++;
|
|
break;
|
|
case '\n':
|
|
state->cursor++;
|
|
|
|
// Heuristic: if we see a newline, there is likely consecutive spaces after it.
|
|
#if JSON_CPU_LITTLE_ENDIAN_64BITS
|
|
while (rest(state) > 8) {
|
|
uint64_t chunk;
|
|
memcpy(&chunk, state->cursor, sizeof(uint64_t));
|
|
if (chunk == 0x2020202020202020) {
|
|
state->cursor += 8;
|
|
continue;
|
|
}
|
|
|
|
uint32_t consecutive_spaces = trailing_zeros64(chunk ^ 0x2020202020202020) / CHAR_BIT;
|
|
state->cursor += consecutive_spaces;
|
|
break;
|
|
}
|
|
#endif
|
|
break;
|
|
case '\t':
|
|
case '\r':
|
|
state->cursor++;
|
|
break;
|
|
case '/':
|
|
json_eat_comments(state);
|
|
break;
|
|
|
|
default:
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
 * Builds a UTF-8 string from the bytes in [start, end).
 * The string is interned when `intern` or `symbolize` is set, and converted
 * to a Symbol when `symbolize` is set.
 */
static inline VALUE build_string(const char *start, const char *end, bool intern, bool symbolize)
{
    const long length = (long)(end - start);
    const bool interned = intern || symbolize;
    VALUE result;

# ifdef HAVE_RB_ENC_INTERNED_STR
    result = interned
        ? rb_enc_interned_str(start, length, enc_utf8)
        : rb_utf8_str_new(start, length);
# else
    result = rb_utf8_str_new(start, length);
    if (interned) {
        result = rb_funcall(rb_str_freeze(result), i_uminus, 0);
    }
# endif

    return symbolize ? rb_str_intern(result) : result;
}
|
|
|
|
/*
 * Tells whether a name is worth caching.
 *
 * We mostly want to cache strings that are likely to be repeated, so:
 * - names longer than JSON_RVALUE_CACHE_MAX_ENTRY_LENGTH are not cached;
 * - names whose first character is not a letter are not cached, as we are
 *   much less likely to see them again.
 */
static inline bool json_string_cacheable_p(const char *string, size_t length)
{
    if (length > JSON_RVALUE_CACHE_MAX_ENTRY_LENGTH) {
        return false;
    }
    return rb_isalpha(string[0]);
}
|
|
|
|
/*
 * Builds the value of a string that contains no escape sequence, from the
 * bytes in [string, stringEnd). Cacheable object names are looked up in the
 * name cache when `state->in_array` is set; everything else goes through
 * build_string().
 */
static inline VALUE json_string_fastpath(JSON_ParserState *state, JSON_ParserConfig *config, const char *string, const char *stringEnd, bool is_name)
{
    const size_t length = stringEnd - string;
    const bool symbolize = is_name && config->symbolize_names;

    if (is_name && state->in_array && RB_LIKELY(json_string_cacheable_p(string, length))) {
        VALUE hit = RB_UNLIKELY(symbolize)
            ? rsymbol_cache_fetch(&state->name_cache, string, length)
            : rstring_cache_fetch(&state->name_cache, string, length);

        if (RB_LIKELY(hit)) {
            return hit;
        }
    }

    const bool intern = is_name || config->freeze;
    return build_string(string, stringEnd, intern, symbolize);
}
|
|
|
|
#define JSON_MAX_UNESCAPE_POSITIONS 16
|
|
/* Backslash positions recorded while scanning a string, consumed while unescaping it. */
typedef struct _json_unescape_positions {
    long size;                            /* number of recorded positions left in `positions` */
    const char **positions;               /* next recorded position */
    unsigned long additional_backslashes; /* backslashes seen after the recording array was full */
} JSON_UnescapePositions;
|
|
|
|
/*
 * Returns the position of the next backslash at or after `pe`, or NULL when
 * there is none left.
 *
 * Recorded positions are consumed first (those before `pe` are discarded);
 * once they are exhausted, each remaining unrecorded backslash is located
 * with memchr().
 */
static inline const char *json_next_backslash(const char *pe, const char *stringEnd, JSON_UnescapePositions *positions)
{
    for (; positions->size; ) {
        const char *candidate = positions->positions[0];
        positions->size--;
        positions->positions++;

        if (candidate >= pe) {
            return candidate;
        }
    }

    if (!positions->additional_backslashes) {
        return NULL;
    }

    positions->additional_backslashes--;
    return memchr(pe, '\\', stringEnd - pe);
}
|
|
|
|
/*
 * Builds the value of a string that contains escape sequences.
 *
 * - [string, stringEnd) is the raw content between the quotes.
 * - `positions` holds the backslash positions recorded during scanning.
 * - `is_name` selects object-name handling (interning / symbolizing).
 *
 * The unescaped text is never longer than the raw text, so it is written
 * directly into a buffer of the raw size. `p` is the start of the raw bytes
 * not copied yet and `pe` the current scan position; every branch of the
 * switch must move `p` past what it has consumed, otherwise the bytes before
 * the backslash would be copied a second time.
 *
 * Raises a parse error on invalid escapes, invalid or incomplete surrogate
 * pairs, and on escaped control characters unless
 * config->allow_control_characters is set.
 */
NOINLINE(static) VALUE json_string_unescape(JSON_ParserState *state, JSON_ParserConfig *config, const char *string, const char *stringEnd, bool is_name, JSON_UnescapePositions *positions)
{
    bool intern = is_name || config->freeze;
    bool symbolize = is_name && config->symbolize_names;
    size_t bufferSize = stringEnd - string;
    const char *p = string, *pe = string, *bufferStart;
    char *buffer;

    VALUE result = rb_str_buf_new(bufferSize);
    rb_enc_associate_index(result, utf8_encindex);
    buffer = RSTRING_PTR(result);
    bufferStart = buffer;

#define APPEND_CHAR(chr) *buffer++ = chr; p = ++pe;

    while (pe < stringEnd && (pe = json_next_backslash(pe, stringEnd, positions))) {
        // Copy the literal bytes that precede the backslash.
        if (pe > p) {
            MEMCPY(buffer, p, char, pe - p);
            buffer += pe - p;
        }
        switch (*++pe) {
          case '"':
          case '/':
            p = pe; // nothing to unescape just need to skip the backslash
            break;
          case '\\':
            APPEND_CHAR('\\');
            break;
          case 'n':
            APPEND_CHAR('\n');
            break;
          case 'r':
            APPEND_CHAR('\r');
            break;
          case 't':
            APPEND_CHAR('\t');
            break;
          case 'b':
            APPEND_CHAR('\b');
            break;
          case 'f':
            APPEND_CHAR('\f');
            break;
          case 'u': {
            uint32_t ch = unescape_unicode(state, ++pe, stringEnd);
            pe += 3;
            /* To handle values above U+FFFF, we take a sequence of
             * \uXXXX escapes in the U+D800..U+DBFF then
             * U+DC00..U+DFFF ranges, take the low 10 bits from each
             * to make a 20-bit number, then add 0x10000 to get the
             * final codepoint.
             *
             * See Unicode 15: 3.8 "Surrogates", 5.3 "Handling
             * Surrogate Pairs in UTF-16", and 23.6 "Surrogates
             * Area".
             */
            if ((ch & 0xFC00) == 0xD800) {
                pe++;
                if (RB_LIKELY((pe <= stringEnd - 6) && memcmp(pe, "\\u", 2) == 0)) {
                    uint32_t sur = unescape_unicode(state, pe + 2, stringEnd);

                    if (RB_UNLIKELY((sur & 0xFC00) != 0xDC00)) {
                        raise_parse_error_at("invalid surrogate pair at %s", state, p);
                    }

                    ch = (((ch & 0x3F) << 10) | ((((ch >> 6) & 0xF) + 1) << 16) | (sur & 0x3FF));
                    pe += 5;
                } else {
                    raise_parse_error_at("incomplete surrogate pair at %s", state, p);
                    break;
                }
            }

            int unescape_len = convert_UTF32_to_UTF8(buffer, ch);
            buffer += unescape_len;
            p = ++pe;
            break;
          }
          default:
            if ((unsigned char)*pe < 0x20) {
                if (!config->allow_control_characters) {
                    if (*pe == '\n') {
                        raise_parse_error_at("Invalid unescaped newline character (\\n) in string: %s", state, pe - 1);
                    }
                    raise_parse_error_at("invalid ASCII control character in string: %s", state, pe - 1);
                }
                // Control characters are allowed: like for '"' and '/',
                // skip the backslash and keep the character itself. Without
                // advancing `p`, the bytes already copied above would be
                // copied again by the next iteration (or by the final copy),
                // producing duplicated output that can exceed bufferSize.
                p = pe;
            } else {
                raise_parse_error_at("invalid escape character in string: %s", state, pe - 1);
            }
            break;
        }
    }
#undef APPEND_CHAR

    // Copy whatever follows the last escape sequence.
    if (stringEnd > p) {
        MEMCPY(buffer, p, char, stringEnd - p);
        buffer += stringEnd - p;
    }
    rb_str_set_len(result, buffer - bufferStart);

    if (symbolize) {
        result = rb_str_intern(result);
    } else if (intern) {
        result = rb_str_to_interned_str(result);
    }

    return result;
}
|
|
|
|
#define MAX_FAST_INTEGER_SIZE 18
|
|
|
|
static VALUE json_decode_large_integer(const char *start, long len)
|
|
{
|
|
VALUE buffer_v;
|
|
char *buffer = RB_ALLOCV_N(char, buffer_v, len + 1);
|
|
MEMCPY(buffer, start, char, len);
|
|
buffer[len] = '\0';
|
|
VALUE number = rb_cstr2inum(buffer, 10);
|
|
RB_ALLOCV_END(buffer_v);
|
|
return number;
|
|
}
|
|
|
|
static inline VALUE
|
|
json_decode_integer(uint64_t mantissa, int mantissa_digits, bool negative, const char *start, const char *end)
|
|
{
|
|
if (RB_LIKELY(mantissa_digits < MAX_FAST_INTEGER_SIZE)) {
|
|
if (negative) {
|
|
return INT64T2NUM(-((int64_t)mantissa));
|
|
}
|
|
return UINT64T2NUM(mantissa);
|
|
}
|
|
|
|
return json_decode_large_integer(start, end - start);
|
|
}
|
|
|
|
static VALUE json_decode_large_float(const char *start, long len)
|
|
{
|
|
if (RB_LIKELY(len < 64)) {
|
|
char buffer[64];
|
|
MEMCPY(buffer, start, char, len);
|
|
buffer[len] = '\0';
|
|
return DBL2NUM(rb_cstr_to_dbl(buffer, 1));
|
|
}
|
|
|
|
VALUE buffer_v;
|
|
char *buffer = RB_ALLOCV_N(char, buffer_v, len + 1);
|
|
MEMCPY(buffer, start, char, len);
|
|
buffer[len] = '\0';
|
|
VALUE number = DBL2NUM(rb_cstr_to_dbl(buffer, 1));
|
|
RB_ALLOCV_END(buffer_v);
|
|
return number;
|
|
}
|
|
|
|
/* Ruby JSON optimized float decoder using vendored Ryu algorithm
|
|
* Accepts pre-extracted mantissa and exponent from first-pass validation
|
|
*/
|
|
/*
 * Builds the value of a number with a fraction and/or exponent.
 *
 * When config->decimal_class is set, its decimal_method_id method is called
 * with the text in [start, end). Otherwise the pre-extracted mantissa and
 * exponent are handed to Ryu, except for inputs Ryu is not used for (see
 * below), which are converted again from their text.
 */
static inline VALUE json_decode_float(JSON_ParserConfig *config, uint64_t mantissa, int mantissa_digits, int32_t exponent, bool negative,
                                      const char *start, const char *end)
{
    const long length = end - start;

    if (RB_UNLIKELY(config->decimal_class)) {
        VALUE text = rb_str_new(start, length);
        return rb_funcallv(config->decimal_class, config->decimal_method_id, 1, &text);
    }

    // Fall back to rb_cstr_to_dbl for potential subnormals (rare edge case)
    // Ryu has rounding issues with subnormals around 1e-310 (< 2.225e-308)
    const bool too_many_digits = mantissa_digits > 17;
    if (RB_UNLIKELY(too_many_digits || mantissa_digits + exponent < -307)) {
        return json_decode_large_float(start, length);
    }

    const double number = ryu_s2d_from_parts(mantissa, mantissa_digits, exponent, negative);
    return DBL2NUM(number);
}
|
|
|
|
/*
 * Builds an Array from the `count` values on top of the value stack and pops
 * them. The array is frozen when config->freeze is set.
 */
static inline VALUE json_decode_array(JSON_ParserState *state, JSON_ParserConfig *config, long count)
{
    const VALUE *items = rvalue_stack_peek(state->stack, count);
    VALUE array = rb_ary_new_from_values(count, items);

    rvalue_stack_pop(state->stack, count);

    if (config->freeze) {
        RB_OBJ_FREEZE(array);
    }
    return array;
}
|
|
|
|
/*
 * Returns the first key that appears twice in `pairs` (laid out as key,
 * value, ...; `count` is the total number of VALUEs), or Qfalse when there
 * is none. Symbol keys are returned as Strings.
 */
static VALUE json_find_duplicated_key(size_t count, const VALUE *pairs)
{
    VALUE seen = rb_hash_new_capa(count / 2);

    for (size_t index = 0; index < count; index += 2) {
        VALUE key = pairs[index];
        const size_t size_before = RHASH_SIZE(seen);

        rb_hash_aset(seen, key, Qtrue);

        // The hash only grows when the key was not already present.
        if (RHASH_SIZE(seen) != size_before) {
            continue;
        }
        return RB_SYMBOL_P(key) ? rb_sym2str(key) : key;
    }
    return Qfalse;
}
|
|
|
|
/* Emits the deprecation warning for a duplicated object key. */
static void emit_duplicate_key_warning(JSON_ParserState *state, VALUE duplicate_key)
{
    VALUE inspected = rb_inspect(duplicate_key);
    VALUE text = rb_sprintf(
        "detected duplicate key %"PRIsVALUE" in JSON object. This will raise an error in json 3.0 unless enabled via `allow_duplicate_key: true`",
        inspected
    );

    emit_parse_warning(RSTRING_PTR(text), state);
    RB_GC_GUARD(text);
}
|
|
|
|
#ifdef RBIMPL_ATTR_NORETURN
|
|
RBIMPL_ATTR_NORETURN()
|
|
#endif
|
|
static void raise_duplicate_key_error(JSON_ParserState *state, VALUE duplicate_key)
|
|
{
|
|
VALUE message = rb_sprintf(
|
|
"duplicate key %"PRIsVALUE,
|
|
rb_inspect(duplicate_key)
|
|
);
|
|
|
|
raise_parse_error(RSTRING_PTR(message), state);
|
|
RB_GC_GUARD(message);
|
|
}
|
|
|
|
/*
 * Builds a Hash from the `count` VALUEs (key, value, ...) on top of the
 * value stack and pops them.
 *
 * When the hash ends up with fewer entries than pairs, a key was duplicated
 * and config->on_duplicate_key decides between ignoring it, warning, or
 * raising. The hash is frozen when config->freeze is set.
 */
static inline VALUE json_decode_object(JSON_ParserState *state, JSON_ParserConfig *config, size_t count)
{
    const size_t pair_count = count / 2;
    VALUE object = rb_hash_new_capa(pair_count);
    const VALUE *pairs = rvalue_stack_peek(state->stack, count);

    rb_hash_bulk_insert(count, pairs, object);

    if (RB_UNLIKELY(RHASH_SIZE(object) < pair_count)) {
        const enum duplicate_key_action action = config->on_duplicate_key;

        if (action == JSON_DEPRECATED) {
            emit_duplicate_key_warning(state, json_find_duplicated_key(count, pairs));
        } else if (action == JSON_RAISE) {
            raise_duplicate_key_error(state, json_find_duplicated_key(count, pairs));
        }
        // JSON_IGNORE: nothing to do.
    }

    rvalue_stack_pop(state->stack, count);

    if (config->freeze) {
        RB_OBJ_FREEZE(object);
    }
    return object;
}
|
|
|
|
/*
 * Pushes `value` on the value stack and returns it. When
 * config->on_load_proc is set, the proc is called with the value first and
 * its result is pushed and returned instead.
 */
static inline VALUE json_push_value(JSON_ParserState *state, JSON_ParserConfig *config, VALUE value)
{
    VALUE pushed = value;

    if (RB_UNLIKELY(config->on_load_proc)) {
        pushed = rb_proc_call_with_block(config->on_load_proc, 1, &pushed, Qnil);
    }

    rvalue_stack_push(state->stack, pushed, &state->stack_handle, &state->stack);
    return pushed;
}
|
|
|
|
/*
 * Bytes at which string_scan() stops: ASCII control characters (0x00-0x1F),
 * the double quote and the backslash. Every other byte is 0.
 */
static const bool string_scan_table[256] = {
    // ASCII Control Characters
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    // ASCII Characters
    ['"'] = 1,
    ['\\'] = 1,
};
|
|
|
|
#ifdef HAVE_SIMD
|
|
static SIMD_Implementation simd_impl = SIMD_NONE;
|
|
#endif /* HAVE_SIMD */
|
|
|
|
/*
 * Advances the cursor to the next byte flagged in string_scan_table.
 * Returns true when such a byte was found (the cursor is on it), false when
 * the end of the document was reached first.
 */
ALWAYS_INLINE(static) bool string_scan(JSON_ParserState *state)
{
#ifdef HAVE_SIMD
#if defined(HAVE_SIMD_NEON)

    uint64_t mask = 0;
    if (string_scan_simd_neon(&state->cursor, state->end, &mask)) {
        state->cursor += trailing_zeros64(mask) >> 2;
        return true;
    }

#elif defined(HAVE_SIMD_SSE2)
    if (simd_impl == SIMD_SSE2) {
        int mask = 0;
        if (string_scan_simd_sse2(&state->cursor, state->end, &mask)) {
            state->cursor += trailing_zeros(mask);
            return true;
        }
    }
#endif /* HAVE_SIMD_NEON or HAVE_SIMD_SSE2 */
#endif /* HAVE_SIMD */

    // Scalar scan of whatever is left.
    for (; !eos(state); state->cursor++) {
        const unsigned char byte = (unsigned char)*state->cursor;
        if (RB_UNLIKELY(string_scan_table[byte])) {
            return true;
        }
    }
    return false;
}
|
|
|
|
/*
 * Slow path of json_parse_string(), entered with the cursor on the first
 * byte of the string that is not plain content. `start` is the first byte
 * after the opening quote.
 *
 * Scans up to the closing quote while recording the position of every
 * backslash (up to JSON_MAX_UNESCAPE_POSITIONS; the rest are only counted),
 * then unescapes and pushes the string. Control characters raise a parse
 * error unless config->allow_control_characters is set, and so does reaching
 * the end of the document before the closing quote.
 */
static VALUE json_parse_escaped_string(JSON_ParserState *state, JSON_ParserConfig *config, bool is_name, const char *start)
{
    const char *backslashes[JSON_MAX_UNESCAPE_POSITIONS];
    JSON_UnescapePositions positions = {
        .size = 0,
        .positions = backslashes,
        .additional_backslashes = 0,
    };

    do {
        const char current = *state->cursor;

        if (current == '"') {
            VALUE string = json_string_unescape(state, config, start, state->cursor, is_name, &positions);
            state->cursor++;
            return json_push_value(state, config, string);
        }

        if (current == '\\') {
            if (RB_LIKELY(positions.size < JSON_MAX_UNESCAPE_POSITIONS)) {
                backslashes[positions.size] = state->cursor;
                positions.size++;
            } else {
                positions.additional_backslashes++;
            }
            // Skip the backslash; the escaped byte is skipped below.
            state->cursor++;
        } else if (!config->allow_control_characters) {
            raise_parse_error("invalid ASCII control character in string: %s", state);
        }

        state->cursor++;
    } while (string_scan(state));

    raise_parse_error("unexpected end of input, expected closing \"", state);
    return Qfalse;
}
|
|
|
|
/*
 * Parses a string; on entry the cursor is on the opening quote.
 *
 * When the first interesting byte found by string_scan() is the closing
 * quote, the string has no escape sequence and is built by the fast path.
 * Otherwise parsing continues in json_parse_escaped_string(). Raises a parse
 * error when the document ends before the closing quote.
 */
ALWAYS_INLINE(static) VALUE json_parse_string(JSON_ParserState *state, JSON_ParserConfig *config, bool is_name)
{
    const char *content = ++state->cursor;

    if (RB_UNLIKELY(!string_scan(state))) {
        raise_parse_error("unexpected end of input, expected closing \"", state);
    }

    if (RB_UNLIKELY(*state->cursor != '"')) {
        return json_parse_escaped_string(state, config, is_name, content);
    }

    VALUE string = json_string_fastpath(state, config, content, state->cursor, is_name);
    state->cursor++;
    return json_push_value(state, config, string);
}
|
|
|
|
#if JSON_CPU_LITTLE_ENDIAN_64BITS
|
|
// From: https://lemire.me/blog/2022/01/21/swar-explained-parsing-eight-digits/
|
|
// Additional References:
|
|
// https://johnnylee-sde.github.io/Fast-numeric-string-to-int/
|
|
// http://0x80.pl/notesen/2014-10-12-parsing-decimal-numbers-part-1-swar.html
|
|
/*
 * Converts eight ASCII digits, loaded as a little-endian 64-bit word (first
 * digit in the lowest byte), to their numeric value using SWAR arithmetic.
 */
static inline uint64_t decode_8digits_unrolled(uint64_t val) {
    const uint64_t low_byte_mask = 0x000000FF000000FF;
    const uint64_t weights_a = 0x000F424000000064; // 100 + (1000000ULL << 32)
    const uint64_t weights_b = 0x0000271000000001; // 1 + (10000ULL << 32)

    // ASCII -> digit value in every byte.
    const uint64_t digits = val - 0x3030303030303030;
    // Combine adjacent digits into two-digit numbers: (digits * 2561) >> 8.
    const uint64_t pairs = (digits * 10) + (digits >> 8);

    const uint64_t part_a = (pairs & low_byte_mask) * weights_a;
    const uint64_t part_b = ((pairs >> 16) & low_byte_mask) * weights_b;
    return (part_a + part_b) >> 32;
}
|
|
|
|
/*
 * Converts four ASCII digits, loaded as a little-endian 32-bit word (first
 * digit in the lowest byte), to their numeric value using SWAR arithmetic.
 */
static inline uint64_t decode_4digits_unrolled(uint32_t val) {
    const uint32_t low_byte_mask = 0x000000FF;
    const uint32_t hundreds = 100;

    // ASCII -> digit value in every byte.
    const uint32_t digits = val - 0x30303030;
    // Combine adjacent digits into two-digit numbers: (digits * 2561) >> 8.
    const uint32_t pairs = (digits * 10) + (digits >> 8);

    const uint32_t high_pair = pairs & low_byte_mask;
    const uint32_t low_pair = (pairs >> 16) & low_byte_mask;
    const uint32_t total = (high_pair * hundreds) + low_pair;
    return total;
}
|
|
#endif
|
|
|
|
/*
 * Consumes the run of ASCII digits under the cursor, folding each digit into
 * `*accumulator` (accumulator * 10 + digit), and returns how many digits
 * were consumed.
 *
 * On little-endian 64-bit CPUs, digits are processed eight (then four) at a
 * time while at least eight bytes remain; the byte-by-byte loop handles the
 * rest.
 */
static inline int json_parse_digits(JSON_ParserState *state, uint64_t *accumulator)
{
    const char *first = state->cursor;

#if JSON_CPU_LITTLE_ENDIAN_64BITS
    while (rest(state) >= sizeof(uint64_t)) {
        uint64_t word;
        memcpy(&word, state->cursor, sizeof(uint64_t));

        // From: https://github.com/simdjson/simdjson/blob/32b301893c13d058095a07d9868edaaa42ee07aa/include/simdjson/generic/numberparsing.h#L333
        // Branchless version of: http://0x80.pl/articles/swar-digits-validate.html
        uint64_t match = (word & 0xF0F0F0F0F0F0F0F0) | (((word + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0) >> 4);

        if (match == 0x3333333333333333) { // 8 consecutive digits
            *accumulator = (*accumulator * 100000000) + decode_8digits_unrolled(word);
            state->cursor += 8;
            continue;
        }

        // Fewer than 8 digits: count how many leading bytes are digits.
        uint32_t remaining = trailing_zeros64(match ^ 0x3333333333333333) / CHAR_BIT;

        if (remaining >= 4) {
            *accumulator = (*accumulator * 10000) + decode_4digits_unrolled((uint32_t)word);
            state->cursor += 4;
            remaining -= 4;
        }

        for (; remaining; remaining--) {
            *accumulator = *accumulator * 10 + (*state->cursor - '0');
            state->cursor++;
        }

        return (int)(state->cursor - first);
    }
#endif

    for (char digit = peek(state); rb_isdigit(digit); digit = peek(state)) {
        *accumulator = *accumulator * 10 + (digit - '0');
        state->cursor++;
    }
    return (int)(state->cursor - first);
}
|
|
|
|
// Parses the JSON number whose digits start at the cursor and returns the value
// produced by json_decode_integer or json_decode_float.
//
//   state    - parser state; the cursor is left just past the number.
//   config   - parser configuration, forwarded to json_decode_float.
//   negative - true when the caller has already consumed a leading '-'.
//   start    - first byte of the literal (the '-' for negative numbers); it is
//              used for error reporting and handed to the decoders together
//              with the final cursor position.
//
// A parse error is reported through raise_parse_error_at for: a leading zero
// followed by further digits, a '-' without digits, a '.' without digits after
// it, and an exponent marker without digits.
static inline VALUE json_parse_number(JSON_ParserState *state, JSON_ParserConfig *config, bool negative, const char *start)
{
    // An exponent with at most nine significant digits always fits in an
    // int32_t and cannot have wrapped the uint64_t accumulator. Anything longer
    // is saturated to the largest nine-digit value, which is already far beyond
    // the exponent range of a double.
    const int max_exponent_digits = 9;
    const uint64_t saturated_exponent = 999999999;

    bool integer = true;

    // Read through peek() like every other look-ahead in this function: after a
    // lone '-' the cursor may already be at the end of the input.
    const char first_digit = peek(state);

    // Variables for Ryu optimization - extract digits during parsing
    int32_t exponent = 0;
    int decimal_point_pos = -1;
    uint64_t mantissa = 0;

    // Parse integer part and extract mantissa digits
    int mantissa_digits = json_parse_digits(state, &mantissa);

    if (RB_UNLIKELY((first_digit == '0' && mantissa_digits > 1) || (negative && mantissa_digits == 0))) {
        raise_parse_error_at("invalid number: %s", state, start);
    }

    // Parse fractional part
    if (peek(state) == '.') {
        integer = false;
        decimal_point_pos = mantissa_digits; // Remember position of decimal point
        state->cursor++;

        // The fractional digits keep extending the same mantissa.
        int fractional_digits = json_parse_digits(state, &mantissa);
        mantissa_digits += fractional_digits;

        if (RB_UNLIKELY(!fractional_digits)) {
            raise_parse_error_at("invalid number: %s", state, start);
        }
    }

    // Parse exponent
    if (rb_tolower(peek(state)) == 'e') {
        integer = false;
        state->cursor++;

        bool negative_exponent = false;
        const char next_char = peek(state);
        if (next_char == '-' || next_char == '+') {
            negative_exponent = next_char == '-';
            state->cursor++;
        }

        const char *exponent_start = state->cursor;
        uint64_t abs_exponent = 0;
        int exponent_digits = json_parse_digits(state, &abs_exponent);

        if (RB_UNLIKELY(!exponent_digits)) {
            raise_parse_error_at("invalid number: %s", state, start);
        }

        // Leading zeros carry no value, so only the digits after them decide
        // whether the exponent is too large to be narrowed to int32_t safely.
        const char *significant = exponent_start;
        while (significant < state->cursor && *significant == '0') {
            significant++;
        }
        if (RB_UNLIKELY(state->cursor - significant > max_exponent_digits)) {
            abs_exponent = saturated_exponent;
        }

        exponent = negative_exponent ? -((int32_t)abs_exponent) : ((int32_t)abs_exponent);
    }

    if (integer) {
        return json_decode_integer(mantissa, mantissa_digits, negative, start, state->cursor);
    }

    // Adjust exponent based on decimal point position
    if (decimal_point_pos >= 0) {
        exponent -= (mantissa_digits - decimal_point_pos);
    }

    return json_decode_float(config, mantissa, mantissa_digits, exponent, negative, start, state->cursor);
}
|
|
|
|
// Parses an unsigned number: the literal starts at the current cursor, which is
// also where its first digit is.
static inline VALUE json_parse_positive_number(JSON_ParserState *state, JSON_ParserConfig *config)
{
    const char *literal_start = state->cursor;
    return json_parse_number(state, config, false, literal_start);
}
|
|
|
|
// Parses a number introduced by the sign character at the cursor: the sign is
// stepped over, and its position is kept as the start of the literal.
static inline VALUE json_parse_negative_number(JSON_ParserState *state, JSON_ParserConfig *config)
{
    const char *sign_position = state->cursor++;
    return json_parse_number(state, config, true, sign_position);
}
|
|
|
|
// Parses a single JSON value located at the cursor (leading whitespace is
// skipped first) and returns the result of handing it to json_push_value.
//
// Arrays and objects call back into this function for each of their members.
// The number of values belonging to a container is the difference between the
// stack head recorded when the container was opened and the stack head when it
// is closed; that count is passed to json_decode_array / json_decode_object.
static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
{
    json_eat_whitespace(state);

    const char lead = peek(state);

    switch (lead) {
        case 'n': {
            const bool is_null = rest(state) >= 4 && (memcmp(state->cursor, "null", 4) == 0);
            if (is_null) {
                state->cursor += 4;
                return json_push_value(state, config, Qnil);
            }

            raise_parse_error("unexpected token %s", state);
            break;
        }

        case 't': {
            const bool is_true = rest(state) >= 4 && (memcmp(state->cursor, "true", 4) == 0);
            if (is_true) {
                state->cursor += 4;
                return json_push_value(state, config, Qtrue);
            }

            raise_parse_error("unexpected token %s", state);
            break;
        }

        case 'f': {
            // Note: memcmp with a small power of two compile to an integer comparison
            const bool is_false = rest(state) >= 5 && (memcmp(state->cursor + 1, "alse", 4) == 0);
            if (is_false) {
                state->cursor += 5;
                return json_push_value(state, config, Qfalse);
            }

            raise_parse_error("unexpected token %s", state);
            break;
        }

        case 'N': {
            // Note: memcmp with a small power of two compile to an integer comparison
            const bool is_nan = config->allow_nan && rest(state) >= 3 && (memcmp(state->cursor + 1, "aN", 2) == 0);
            if (is_nan) {
                state->cursor += 3;
                return json_push_value(state, config, CNaN);
            }

            raise_parse_error("unexpected token %s", state);
            break;
        }

        case 'I': {
            const bool is_infinity = config->allow_nan && rest(state) >= 8 && (memcmp(state->cursor, "Infinity", 8) == 0);
            if (is_infinity) {
                state->cursor += 8;
                return json_push_value(state, config, CInfinity);
            }

            raise_parse_error("unexpected token %s", state);
            break;
        }

        case '-': {
            // Note: memcmp with a small power of two compile to an integer comparison
            const bool is_minus_infinity = rest(state) >= 9 && (memcmp(state->cursor + 1, "Infinity", 8) == 0);
            if (is_minus_infinity) {
                if (!config->allow_nan) {
                    raise_parse_error("unexpected token %s", state);
                } else {
                    state->cursor += 9;
                    return json_push_value(state, config, CMinusInfinity);
                }
            }

            // Anything else that starts with '-' is handled as a negative number.
            return json_push_value(state, config, json_parse_negative_number(state, config));
        }

        case '0': case '1': case '2': case '3': case '4':
        case '5': case '6': case '7': case '8': case '9':
            return json_push_value(state, config, json_parse_positive_number(state, config));

        case '"':
            // %r{\A"[^"\\\t\n\x00]*(?:\\[bfnrtu\\/"][^"\\]*)*"}
            return json_parse_string(state, config, false);

        case '[': {
            state->cursor++;
            json_eat_whitespace(state);
            const long array_base = state->stack->head;

            // Empty array: closed right away, the nesting counters are untouched.
            if (peek(state) == ']') {
                state->cursor++;
                return json_push_value(state, config, json_decode_array(state, config, 0));
            }

            state->current_nesting++;
            if (RB_UNLIKELY(config->max_nesting && (config->max_nesting < state->current_nesting))) {
                rb_raise(eNestingError, "nesting of %d is too deep", state->current_nesting);
            }
            state->in_array++;

            // First element.
            json_parse_any(state, config);

            // Remaining elements, each introduced by a comma.
            for (;;) {
                json_eat_whitespace(state);

                const char separator = peek(state);

                if (RB_LIKELY(separator == ',')) {
                    state->cursor++;

                    // With trailing commas allowed, a ']' right after the comma
                    // is left for the next iteration to close the array.
                    bool closes_array = false;
                    if (config->allow_trailing_comma) {
                        json_eat_whitespace(state);
                        closes_array = (peek(state) == ']');
                    }

                    if (!closes_array) {
                        json_parse_any(state, config);
                    }
                    continue;
                }

                if (separator == ']') {
                    state->cursor++;
                    long element_count = state->stack->head - array_base;
                    state->current_nesting--;
                    state->in_array--;
                    return json_push_value(state, config, json_decode_array(state, config, element_count));
                }

                raise_parse_error("expected ',' or ']' after array value", state);
            }
            break;
        }

        case '{': {
            const char *object_start_cursor = state->cursor;

            state->cursor++;
            json_eat_whitespace(state);
            const long object_base = state->stack->head;

            // Empty object: closed right away, the nesting counter is untouched.
            if (peek(state) == '}') {
                state->cursor++;
                return json_push_value(state, config, json_decode_object(state, config, 0));
            }

            state->current_nesting++;
            if (RB_UNLIKELY(config->max_nesting && (config->max_nesting < state->current_nesting))) {
                rb_raise(eNestingError, "nesting of %d is too deep", state->current_nesting);
            }

            // First member: key, ':', value.
            if (peek(state) != '"') {
                raise_parse_error("expected object key, got %s", state);
            }
            json_parse_string(state, config, true);

            json_eat_whitespace(state);
            if (peek(state) != ':') {
                raise_parse_error("expected ':' after object key", state);
            }
            state->cursor++;

            json_parse_any(state, config);

            // Remaining members, each introduced by a comma.
            for (;;) {
                json_eat_whitespace(state);

                const char separator = peek(state);

                if (separator == '}') {
                    state->cursor++;
                    state->current_nesting--;
                    size_t entry_count = state->stack->head - object_base;

                    // Temporary rewind cursor in case an error is raised
                    const char *final_cursor = state->cursor;
                    state->cursor = object_start_cursor;
                    VALUE object = json_decode_object(state, config, entry_count);
                    state->cursor = final_cursor;

                    return json_push_value(state, config, object);
                }

                if (separator == ',') {
                    state->cursor++;
                    json_eat_whitespace(state);

                    // With trailing commas allowed, a '}' right after the comma
                    // is left for the next iteration to close the object.
                    if (config->allow_trailing_comma && peek(state) == '}') {
                        continue;
                    }

                    if (RB_UNLIKELY(peek(state) != '"')) {
                        raise_parse_error("expected object key, got: %s", state);
                    }
                    json_parse_string(state, config, true);

                    json_eat_whitespace(state);
                    if (RB_UNLIKELY(peek(state) != ':')) {
                        raise_parse_error("expected ':' after object key, got: %s", state);
                    }
                    state->cursor++;

                    json_parse_any(state, config);

                    continue;
                }

                raise_parse_error("expected ',' or '}' after object value, got: %s", state);
            }
            break;
        }

        case 0:
            raise_parse_error("unexpected end of input", state);
            break;

        default:
            raise_parse_error("unexpected character: %s", state);
            break;
    }

    raise_parse_error("unreachable: %s", state);
    return Qundef;
}
|
|
|
|
// Skips trailing whitespace and reports a parse error if anything else is left
// in the input.
static void json_ensure_eof(JSON_ParserState *state)
{
    json_eat_whitespace(state);

    if (eos(state)) {
        return;
    }

    raise_parse_error("unexpected token at end of stream %s", state);
}
|
|
|
|
/*
 * Document-class: JSON::Ext::Parser
 *
 * This is the JSON parser implemented as a C extension. To have JSON use it,
 * assign it with the parser= method of JSON:
 *
 *  JSON.parser = JSON::Ext::Parser
 *
 */
|
|
|
|
// Returns a string to parse as UTF-8:
//   - `source` itself when it is already tagged as UTF-8,
//   - a duplicate re-tagged as UTF-8 when `source` is binary,
//   - otherwise the result of calling `encode(Encoding::UTF_8)` on it.
static VALUE convert_encoding(VALUE source)
{
    const int source_encindex = RB_ENCODING_GET(source);

    if (RB_LIKELY(source_encindex == utf8_encindex)) {
        return source;
    }

    if (source_encindex != binary_encindex) {
        return rb_funcall(source, i_encode, 1, Encoding_UTF_8);
    }

    // For historical reason, we silently reinterpret binary strings as UTF-8
    VALUE retagged = rb_str_dup(source);
    return rb_enc_associate_index(retagged, utf8_encindex);
}
|
|
|
|
static int parser_config_init_i(VALUE key, VALUE val, VALUE data)
|
|
{
|
|
JSON_ParserConfig *config = (JSON_ParserConfig *)data;
|
|
|
|
if (key == sym_max_nesting) { config->max_nesting = RTEST(val) ? FIX2INT(val) : 0; }
|
|
else if (key == sym_allow_nan) { config->allow_nan = RTEST(val); }
|
|
else if (key == sym_allow_trailing_comma) { config->allow_trailing_comma = RTEST(val); }
|
|
else if (key == sym_allow_control_characters) { config->allow_control_characters = RTEST(val); }
|
|
else if (key == sym_symbolize_names) { config->symbolize_names = RTEST(val); }
|
|
else if (key == sym_freeze) { config->freeze = RTEST(val); }
|
|
else if (key == sym_on_load) { config->on_load_proc = RTEST(val) ? val : Qfalse; }
|
|
else if (key == sym_allow_duplicate_key) { config->on_duplicate_key = RTEST(val) ? JSON_IGNORE : JSON_RAISE; }
|
|
else if (key == sym_decimal_class) {
|
|
if (RTEST(val)) {
|
|
if (rb_respond_to(val, i_try_convert)) {
|
|
config->decimal_class = val;
|
|
config->decimal_method_id = i_try_convert;
|
|
} else if (rb_respond_to(val, i_new)) {
|
|
config->decimal_class = val;
|
|
config->decimal_method_id = i_new;
|
|
} else if (RB_TYPE_P(val, T_CLASS)) {
|
|
VALUE name = rb_class_name(val);
|
|
const char *name_cstr = RSTRING_PTR(name);
|
|
const char *last_colon = strrchr(name_cstr, ':');
|
|
if (last_colon) {
|
|
const char *mod_path_end = last_colon - 1;
|
|
VALUE mod_path = rb_str_substr(name, 0, mod_path_end - name_cstr);
|
|
config->decimal_class = rb_path_to_class(mod_path);
|
|
|
|
const char *method_name_beg = last_colon + 1;
|
|
long before_len = method_name_beg - name_cstr;
|
|
long len = RSTRING_LEN(name) - before_len;
|
|
VALUE method_name = rb_str_substr(name, before_len, len);
|
|
config->decimal_method_id = SYM2ID(rb_str_intern(method_name));
|
|
} else {
|
|
config->decimal_class = rb_mKernel;
|
|
config->decimal_method_id = SYM2ID(rb_str_intern(name));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return ST_CONTINUE;
|
|
}
|
|
|
|
// Fills `config` from the `opts` hash. max_nesting starts at 100; every other
// field keeps the value it had on entry unless `opts` sets it.
// `opts` may be nil; any other non-Hash value is rejected by Check_Type.
static void parser_config_init(JSON_ParserConfig *config, VALUE opts)
{
    config->max_nesting = 100;

    if (NIL_P(opts)) {
        return;
    }

    Check_Type(opts, T_HASH);

    // We assume in most cases few keys are set so it's faster to go over
    // the provided keys than to check all possible keys.
    if (RHASH_SIZE(opts) > 0) {
        rb_hash_foreach(opts, parser_config_init_i, (VALUE)config);
    }
}
|
|
|
|
/*
|
|
* call-seq: new(opts => {})
|
|
*
|
|
* Creates a new JSON::Ext::ParserConfig instance.
|
|
*
|
|
* It will be configured by the _opts_ hash. _opts_ can have the following
|
|
* keys:
|
|
*
|
|
* _opts_ can have the following keys:
|
|
* * *max_nesting*: The maximum depth of nesting allowed in the parsed data
|
|
* structures. Disable depth checking with :max_nesting => false|nil|0, it
|
|
* defaults to 100.
|
|
* * *allow_nan*: If set to true, allow NaN, Infinity and -Infinity in
|
|
* defiance of RFC 4627 to be parsed by the Parser. This option defaults to
|
|
* false.
|
|
* * *symbolize_names*: If set to true, returns symbols for the names
|
|
* (keys) in a JSON object. Otherwise strings are returned, which is
|
|
* also the default. It's not possible to use this option in
|
|
* conjunction with the *create_additions* option.
|
|
* * *decimal_class*: Specifies which class to use instead of the default
|
|
* (Float) when parsing decimal numbers. This class must accept a single
|
|
* string argument in its constructor.
|
|
*/
|
|
static VALUE cParserConfig_initialize(VALUE self, VALUE opts)
|
|
{
|
|
rb_check_frozen(self);
|
|
GET_PARSER_CONFIG;
|
|
|
|
parser_config_init(config, opts);
|
|
|
|
RB_OBJ_WRITTEN(self, Qundef, config->decimal_class);
|
|
|
|
return self;
|
|
}
|
|
|
|
// Parses `Vsource` with the given configuration and returns the parsed value.
//
// The source is first converted with convert_encoding. Parsing then runs over
// raw pointers into that string, with a value stack whose initial storage is an
// array on the C stack. After the value has been parsed, anything other than
// whitespace remaining in the input is reported as a parse error.
static VALUE cParser_parse(JSON_ParserConfig *config, VALUE Vsource)
{
    Vsource = convert_encoding(StringValue(Vsource));
    StringValue(Vsource);

    VALUE rvalue_stack_buffer[RVALUE_STACK_INITIAL_CAPA];
    rvalue_stack stack = {
        .type = RVALUE_STACK_STACK_ALLOCATED,
        .ptr = rvalue_stack_buffer,
        .capa = RVALUE_STACK_INITIAL_CAPA,
    };

    long len;
    const char *start;
    RSTRING_GETMEM(Vsource, start, len);

    JSON_ParserState _state = {
        .start = start,
        .cursor = start,
        .end = start + len,
        .stack = &stack,
    };
    JSON_ParserState *state = &_state;

    VALUE result = json_parse_any(state, config);

    // This may be skipped in case of exception, but
    // it won't cause a leak.
    rvalue_stack_eagerly_release(state->stack_handle);

    json_ensure_eof(state);

    // convert_encoding may have returned a new string that only this local
    // refers to, while the parser works on raw pointers into its buffer. Keep
    // the reference visible to the GC until parsing is completely done.
    RB_GC_GUARD(Vsource);

    return result;
}
|
|
|
|
/*
|
|
* call-seq: parse(source)
|
|
*
|
|
* Parses the current JSON text _source_ and returns the complete data
|
|
* structure as a result.
|
|
* It raises JSON::ParserError if fail to parse.
|
|
*/
|
|
static VALUE cParserConfig_parse(VALUE self, VALUE Vsource)
|
|
{
|
|
GET_PARSER_CONFIG;
|
|
return cParser_parse(config, Vsource);
|
|
}
|
|
|
|
// One-shot parse: builds a temporary configuration from `opts` on the C stack
// and parses `Vsource` with it, without allocating a ParserConfig object.
static VALUE cParser_m_parse(VALUE klass, VALUE Vsource, VALUE opts)
{
    Vsource = convert_encoding(StringValue(Vsource));
    StringValue(Vsource);

    JSON_ParserConfig transient_config = {0};
    parser_config_init(&transient_config, opts);

    return cParser_parse(&transient_config, Vsource);
}
|
|
|
|
static void JSON_ParserConfig_mark(void *ptr)
|
|
{
|
|
JSON_ParserConfig *config = ptr;
|
|
rb_gc_mark(config->on_load_proc);
|
|
rb_gc_mark(config->decimal_class);
|
|
}
|
|
|
|
// GC free function: releases the memory of the config struct itself.
static void JSON_ParserConfig_free(void *ptr)
{
    ruby_xfree(ptr);
}
|
|
|
|
static size_t JSON_ParserConfig_memsize(const void *ptr)
|
|
{
|
|
return sizeof(JSON_ParserConfig);
|
|
}
|
|
|
|
static const rb_data_type_t JSON_ParserConfig_type = {
|
|
"JSON::Ext::Parser/ParserConfig",
|
|
{
|
|
JSON_ParserConfig_mark,
|
|
JSON_ParserConfig_free,
|
|
JSON_ParserConfig_memsize,
|
|
},
|
|
0, 0,
|
|
RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_FROZEN_SHAREABLE,
|
|
};
|
|
|
|
// Allocator: creates an instance of `klass` wrapping a new JSON_ParserConfig
// struct described by JSON_ParserConfig_type.
static VALUE cJSON_parser_s_allocate(VALUE klass)
{
    JSON_ParserConfig *config;
    VALUE instance = TypedData_Make_Struct(klass, JSON_ParserConfig, &JSON_ParserConfig_type, config);
    return instance;
}
|
|
|
|
void Init_parser(void)
|
|
{
|
|
#ifdef HAVE_RB_EXT_RACTOR_SAFE
|
|
rb_ext_ractor_safe(true);
|
|
#endif
|
|
|
|
#undef rb_intern
|
|
rb_require("json/common");
|
|
mJSON = rb_define_module("JSON");
|
|
VALUE mExt = rb_define_module_under(mJSON, "Ext");
|
|
VALUE cParserConfig = rb_define_class_under(mExt, "ParserConfig", rb_cObject);
|
|
eNestingError = rb_path2class("JSON::NestingError");
|
|
rb_gc_register_mark_object(eNestingError);
|
|
rb_define_alloc_func(cParserConfig, cJSON_parser_s_allocate);
|
|
rb_define_method(cParserConfig, "initialize", cParserConfig_initialize, 1);
|
|
rb_define_method(cParserConfig, "parse", cParserConfig_parse, 1);
|
|
|
|
VALUE cParser = rb_define_class_under(mExt, "Parser", rb_cObject);
|
|
rb_define_singleton_method(cParser, "parse", cParser_m_parse, 2);
|
|
|
|
CNaN = rb_const_get(mJSON, rb_intern("NaN"));
|
|
rb_gc_register_mark_object(CNaN);
|
|
|
|
CInfinity = rb_const_get(mJSON, rb_intern("Infinity"));
|
|
rb_gc_register_mark_object(CInfinity);
|
|
|
|
CMinusInfinity = rb_const_get(mJSON, rb_intern("MinusInfinity"));
|
|
rb_gc_register_mark_object(CMinusInfinity);
|
|
|
|
rb_global_variable(&Encoding_UTF_8);
|
|
Encoding_UTF_8 = rb_const_get(rb_path2class("Encoding"), rb_intern("UTF_8"));
|
|
|
|
sym_max_nesting = ID2SYM(rb_intern("max_nesting"));
|
|
sym_allow_nan = ID2SYM(rb_intern("allow_nan"));
|
|
sym_allow_trailing_comma = ID2SYM(rb_intern("allow_trailing_comma"));
|
|
sym_allow_control_characters = ID2SYM(rb_intern("allow_control_characters"));
|
|
sym_symbolize_names = ID2SYM(rb_intern("symbolize_names"));
|
|
sym_freeze = ID2SYM(rb_intern("freeze"));
|
|
sym_on_load = ID2SYM(rb_intern("on_load"));
|
|
sym_decimal_class = ID2SYM(rb_intern("decimal_class"));
|
|
sym_allow_duplicate_key = ID2SYM(rb_intern("allow_duplicate_key"));
|
|
|
|
i_new = rb_intern("new");
|
|
i_try_convert = rb_intern("try_convert");
|
|
i_uminus = rb_intern("-@");
|
|
i_encode = rb_intern("encode");
|
|
|
|
binary_encindex = rb_ascii8bit_encindex();
|
|
utf8_encindex = rb_utf8_encindex();
|
|
enc_utf8 = rb_utf8_encoding();
|
|
|
|
#ifdef HAVE_SIMD
|
|
simd_impl = find_simd_implementation();
|
|
#endif
|
|
}
|