--- JSON-XS/XS.xs 2007/08/26 22:27:32 1.62 +++ JSON-XS/XS.xs 2008/03/19 15:17:53 1.74 @@ -29,6 +29,8 @@ #define F_SHRINK 0x00000200UL #define F_ALLOW_BLESSED 0x00000400UL #define F_CONV_BLESSED 0x00000800UL +#define F_RELAXED 0x00001000UL + #define F_MAXDEPTH 0xf8000000UL #define S_MAXDEPTH 27 #define F_MAXSIZE 0x01f00000UL @@ -50,16 +52,20 @@ #define SE } while (0) #if __GNUC__ >= 3 -# define expect(expr,value) __builtin_expect ((expr),(value)) -# define inline inline +# define expect(expr,value) __builtin_expect ((expr), (value)) +# define INLINE static inline #else # define expect(expr,value) (expr) -# define inline static +# define INLINE static #endif #define expect_false(expr) expect ((expr) != 0, 0) #define expect_true(expr) expect ((expr) != 0, 1) +#define IN_RANGE_INC(type,val,beg,end) \ + ((unsigned type)((unsigned type)(val) - (unsigned type)(beg)) \ + <= (unsigned type)((unsigned type)(end) - (unsigned type)(beg))) + #ifdef USE_ITHREADS # define JSON_SLOW 1 # define JSON_STASH (json_stash ? json_stash : gv_stashpv ("JSON::XS", 1)) @@ -80,7 +86,7 @@ ///////////////////////////////////////////////////////////////////////////// // utility functions -inline void +INLINE void shrink (SV *sv) { sv_utf8_downgrade (sv, 1); @@ -99,21 +105,42 @@ // we special-case "safe" characters from U+80 .. U+7FF, // but use the very good perl function to parse anything else. // note that we never call this function for a ascii codepoints -inline UV +INLINE UV decode_utf8 (unsigned char *s, STRLEN len, STRLEN *clen) { - if (expect_false (s[0] > 0xdf || s[0] < 0xc2)) - return utf8n_to_uvuni (s, len, clen, UTF8_CHECK_ONLY); - else if (len > 1 && s[1] >= 0x80 && s[1] <= 0xbf) + if (expect_true (len >= 2 + && IN_RANGE_INC (char, s[0], 0xc2, 0xdf) + && IN_RANGE_INC (char, s[1], 0x80, 0xbf))) { *clen = 2; return ((s[0] & 0x1f) << 6) | (s[1] & 0x3f); } else - { - *clen = (STRLEN)-1; - return (UV)-1; - } + return utf8n_to_uvuni (s, len, clen, UTF8_CHECK_ONLY); +} + +// likewise for encoding, also never called for ascii codepoints +// this function takes advantage of this fact, although current gccs +// seem to optimise the check for >= 0x80 away anyways +INLINE unsigned char * +encode_utf8 (unsigned char *s, UV ch) +{ + if (expect_false (ch < 0x000080)) + *s++ = ch; + else if (expect_true (ch < 0x000800)) + *s++ = 0xc0 | ( ch >> 6), + *s++ = 0x80 | ( ch & 0x3f); + else if ( ch < 0x010000) + *s++ = 0xe0 | ( ch >> 12), + *s++ = 0x80 | ((ch >> 6) & 0x3f), + *s++ = 0x80 | ( ch & 0x3f); + else if ( ch < 0x110000) + *s++ = 0xf0 | ( ch >> 18), + *s++ = 0x80 | ((ch >> 12) & 0x3f), + *s++ = 0x80 | ((ch >> 6) & 0x3f), + *s++ = 0x80 | ( ch & 0x3f); + + return s; } ///////////////////////////////////////////////////////////////////////////// @@ -128,9 +155,10 @@ JSON json; U32 indent; // indentation level U32 maxdepth; // max. indentation/recursion level + UV limit; // escape character values >= this value when encoding } enc_t; -inline void +INLINE void need (enc_t *enc, STRLEN len) { if (expect_false (enc->cur + len >= enc->end)) @@ -142,7 +170,7 @@ } } -inline void +INLINE void encode_ch (enc_t *enc, char ch) { need (enc, 1); @@ -206,13 +234,13 @@ clen = 1; } - if (uch > 0x10FFFFUL) - croak ("out of range codepoint (0x%lx) encountered, unrepresentable in JSON", (unsigned long)uch); - - if (uch < 0x80 || enc->json.flags & F_ASCII || (enc->json.flags & F_LATIN1 && uch > 0xFF)) + if (uch < 0x80/*0x20*/ || uch >= enc->limit) { - if (uch > 0xFFFFUL) + if (uch >= 0x10000UL) { + if (uch >= 0x110000UL) + croak ("out of range codepoint (0x%lx) encountered, unrepresentable in JSON", (unsigned long)uch); + need (enc, len += 11); sprintf (enc->cur, "\\u%04x\\u%04x", (int)((uch - 0x10000) / 0x400 + 0xD800), @@ -250,7 +278,7 @@ else { need (enc, len += UTF8_MAXBYTES - 1); // never more than 11 bytes needed - enc->cur = uvuni_to_utf8_flags (enc->cur, uch, 0); + enc->cur = encode_utf8 (enc->cur, uch); ++str; } } @@ -261,7 +289,7 @@ } } -inline void +INLINE void encode_indent (enc_t *enc) { if (enc->json.flags & F_INDENT) @@ -274,14 +302,14 @@ } } -inline void +INLINE void encode_space (enc_t *enc) { need (enc, 1); encode_ch (enc, ' '); } -inline void +INLINE void encode_nl (enc_t *enc) { if (enc->json.flags & F_INDENT) @@ -291,7 +319,7 @@ } } -inline void +INLINE void encode_comma (enc_t *enc) { encode_ch (enc, ','); @@ -312,28 +340,31 @@ if (enc->indent >= enc->maxdepth) croak ("data structure too deep (hit recursion limit)"); - encode_ch (enc, '['); encode_nl (enc); - ++enc->indent; - - for (i = 0; i <= len; ++i) + encode_ch (enc, '['); + + if (len >= 0) { - SV **svp = av_fetch (av, i, 0); + encode_nl (enc); ++enc->indent; - encode_indent (enc); + for (i = 0; i <= len; ++i) + { + SV **svp = av_fetch (av, i, 0); - if (svp) - encode_sv (enc, *svp); - else - encode_str (enc, "null", 4, 0); + encode_indent (enc); - if (i < len) - encode_comma (enc); - } + if (svp) + encode_sv (enc, *svp); + else + encode_str (enc, "null", 4, 0); - encode_nl (enc); + if (i < len) + encode_comma (enc); + } - --enc->indent; - encode_indent (enc); encode_ch (enc, ']'); + encode_nl (enc); --enc->indent; encode_indent (enc); + } + + encode_ch (enc, ']'); } static void @@ -396,7 +427,7 @@ if (enc->indent >= enc->maxdepth) croak ("data structure too deep (hit recursion limit)"); - encode_ch (enc, '{'); encode_nl (enc); ++enc->indent; + encode_ch (enc, '{'); // for canonical output we have to sort by keys first // actually, this is mostly due to the stupid so-called @@ -459,6 +490,8 @@ LEAVE; } + encode_nl (enc); ++enc->indent; + while (count--) { encode_indent (enc); @@ -469,28 +502,34 @@ if (count) encode_comma (enc); } + + encode_nl (enc); --enc->indent; encode_indent (enc); } } else { if (hv_iterinit (hv) || SvMAGICAL (hv)) if ((he = hv_iternext (hv))) - for (;;) - { - encode_indent (enc); - encode_hk (enc, he); - encode_sv (enc, expect_false (SvMAGICAL (hv)) ? hv_iterval (hv, he) : HeVAL (he)); + { + encode_nl (enc); ++enc->indent; - if (!(he = hv_iternext (hv))) - break; + for (;;) + { + encode_indent (enc); + encode_hk (enc, he); + encode_sv (enc, expect_false (SvMAGICAL (hv)) ? hv_iterval (hv, he) : HeVAL (he)); - encode_comma (enc); - } - } + if (!(he = hv_iternext (hv))) + break; - encode_nl (enc); + encode_comma (enc); + } - --enc->indent; encode_indent (enc); encode_ch (enc, '}'); + encode_nl (enc); --enc->indent; encode_indent (enc); + } + } + + encode_ch (enc, '}'); } // encode objects, arrays and special \0=false and \1=true values. @@ -608,11 +647,14 @@ } else if (SvIOKp (sv)) { - // we assume we can always read an IV as a UV - if (SvUV (sv) & ~(UV)0x7fff) + // we assume we can always read an IV as a UV and vice versa + // we assume two's complement + // we assume no aliasing issues in the union + if (SvIsUV (sv) ? SvUVX (sv) > 59000 + : SvIVX (sv) > 59000 || SvIVX (sv) < -59000) { // large integer, use the (rather slow) snprintf way. - need (enc, sizeof (UV) * 3); + need (enc, sizeof (UV) * 5 / 2 + 1); // CHAR_BIT is at least 8 enc->cur += SvIsUV(sv) ? snprintf (enc->cur, sizeof (UV) * 3, "%"UVuf, (UV)SvUVX (sv)) @@ -622,7 +664,8 @@ { // optimise the "small number case" // code will likely be branchless and use only a single multiplication - I32 i = SvIV (sv); + // works for numbers up to 59074 + I32 i = SvIVX (sv); U32 u; char digit, nz = 0; @@ -669,6 +712,9 @@ enc.end = SvEND (enc.sv); enc.indent = 0; enc.maxdepth = DEC_DEPTH (enc.json.flags); + enc.limit = enc.json.flags & F_ASCII ? 0x000080UL + : enc.json.flags & F_LATIN1 ? 0x000100UL + : 0x110000UL; SvPOK_only (enc.sv); encode_sv (&enc, scalar); @@ -699,16 +745,36 @@ U32 maxdepth; // recursion depth limit } dec_t; -inline void +INLINE void +decode_comment (dec_t *dec) +{ + // only '#'-style comments allowed a.t.m. + + while (*dec->cur && *dec->cur != 0x0a && *dec->cur != 0x0d) + ++dec->cur; +} + +INLINE void decode_ws (dec_t *dec) { for (;;) { char ch = *dec->cur; - if (ch > 0x20 - || (ch != 0x20 && ch != 0x0a && ch != 0x0d && ch != 0x09)) - break; + if (ch > 0x20) + { + if (expect_false (ch == '#')) + { + if (dec->json.flags & F_RELAXED) + decode_comment (dec); + else + break; + } + else + break; + } + else if (ch != 0x20 && ch != 0x0a && ch != 0x0d && ch != 0x09) + break; // parse error, but let higher level handle it, gives better error messages ++dec->cur; } @@ -824,7 +890,7 @@ { utf8 = 1; - cur = (char *)uvuni_to_utf8_flags (cur, hi, 0); + cur = encode_utf8 (cur, hi); } else *cur++ = hi; @@ -836,7 +902,7 @@ ERR ("illegal backslash escape sequence in string"); } } - else if (expect_true (ch >= 0x20 && ch <= 0x7f)) + else if (expect_true (ch >= 0x20 && ch < 0x80)) *cur++ = ch; else if (ch >= 0x80) { @@ -969,22 +1035,24 @@ { int len = dec->cur - start; - // special case the rather common 1..4-digit-int case, assumes 32 bit ints or so + // special case the rather common 1..5-digit-int case if (*start == '-') switch (len) { - case 2: return newSViv (-( start [1] - '0' * 1)); - case 3: return newSViv (-( start [1] * 10 + start [2] - '0' * 11)); - case 4: return newSViv (-( start [1] * 100 + start [2] * 10 + start [3] - '0' * 111)); - case 5: return newSViv (-(start [1] * 1000 + start [2] * 100 + start [3] * 10 + start [4] - '0' * 1111)); + case 2: return newSViv (-( start [1] - '0' * 1)); + case 3: return newSViv (-( start [1] * 10 + start [2] - '0' * 11)); + case 4: return newSViv (-( start [1] * 100 + start [2] * 10 + start [3] - '0' * 111)); + case 5: return newSViv (-( start [1] * 1000 + start [2] * 100 + start [3] * 10 + start [4] - '0' * 1111)); + case 6: return newSViv (-(start [1] * 10000 + start [2] * 1000 + start [3] * 100 + start [4] * 10 + start [5] - '0' * 11111)); } else switch (len) { - case 1: return newSViv ( start [0] - '0' * 1); - case 2: return newSViv ( start [0] * 10 + start [1] - '0' * 11); - case 3: return newSViv ( start [0] * 100 + start [1] * 10 + start [2] - '0' * 111); - case 4: return newSViv ( start [0] * 1000 + start [1] * 100 + start [2] * 10 + start [3] - '0' * 1111); + case 1: return newSViv ( start [0] - '0' * 1); + case 2: return newSViv ( start [0] * 10 + start [1] - '0' * 11); + case 3: return newSViv ( start [0] * 100 + start [1] * 10 + start [2] - '0' * 111); + case 4: return newSViv ( start [0] * 1000 + start [1] * 100 + start [2] * 10 + start [3] - '0' * 1111); + case 5: return newSViv ( start [0] * 10000 + start [1] * 1000 + start [2] * 100 + start [3] * 10 + start [4] - '0' * 11111); } { @@ -1055,6 +1123,14 @@ ERR (", or ] expected while parsing array"); ++dec->cur; + + decode_ws (dec); + + if (*dec->cur == ']' && dec->json.flags & F_RELAXED) + { + ++dec->cur; + break; + } } DEC_DEC_DEPTH; @@ -1080,7 +1156,7 @@ else for (;;) { - decode_ws (dec); EXPECT_CH ('"'); + EXPECT_CH ('"'); // heuristic: assume that // a) decode_str + hv_store_ent are abysmally slow. @@ -1104,6 +1180,7 @@ decode_ws (dec); EXPECT_CH (':'); + decode_ws (dec); value = decode_sv (dec); if (!value) { @@ -1125,6 +1202,7 @@ decode_ws (dec); EXPECT_CH (':'); + decode_ws (dec); value = decode_sv (dec); if (!value) goto fail; @@ -1150,6 +1228,14 @@ ERR (", or } expected while parsing object/hash"); ++dec->cur; + + decode_ws (dec); + + if (*dec->cur == '}' && dec->json.flags & F_RELAXED) + { + ++dec->cur; + break; + } } DEC_DEC_DEPTH; @@ -1224,15 +1310,13 @@ static SV * decode_sv (dec_t *dec) { - decode_ws (dec); - // the beauty of JSON: you need exactly one character lookahead - // to parse anything. + // to parse everything. switch (*dec->cur) { case '"': ++dec->cur; return decode_str (dec); - case '[': ++dec->cur; return decode_av (dec); - case '{': ++dec->cur; return decode_hv (dec); + case '[': ++dec->cur; return decode_av (dec); + case '{': ++dec->cur; return decode_hv (dec); case '-': case '0': case '1': case '2': case '3': case '4': @@ -1319,6 +1403,8 @@ dec.json.flags |= F_HOOK; *dec.end = 0; // this should basically be a nop, too, but make sure it's there + + decode_ws (&dec); sv = decode_sv (&dec); if (!(offset_return || !sv)) @@ -1408,7 +1494,10 @@ SvPOK_only (pv); Zero (SvPVX (pv), 1, JSON); ((JSON *)SvPVX (pv))->flags = F_DEFAULT; - XPUSHs (sv_2mortal (sv_bless (newRV_noinc (pv), JSON_STASH))); + XPUSHs (sv_2mortal (sv_bless ( + newRV_noinc (pv), + strEQ (klass, "JSON::XS") ? JSON_STASH : gv_stashpv (klass, 1) + ))); } void ascii (JSON *self, int enable = 1) @@ -1425,6 +1514,7 @@ shrink = F_SHRINK allow_blessed = F_ALLOW_BLESSED convert_blessed = F_CONV_BLESSED + relaxed = F_RELAXED PPCODE: { if (enable) @@ -1435,6 +1525,23 @@ XPUSHs (ST (0)); } +void get_ascii (JSON *self) + ALIAS: + get_ascii = F_ASCII + get_latin1 = F_LATIN1 + get_utf8 = F_UTF8 + get_indent = F_INDENT + get_canonical = F_CANONICAL + get_space_before = F_SPACE_BEFORE + get_space_after = F_SPACE_AFTER + get_allow_nonref = F_ALLOW_NONREF + get_shrink = F_SHRINK + get_allow_blessed = F_ALLOW_BLESSED + get_convert_blessed = F_CONV_BLESSED + get_relaxed = F_RELAXED + PPCODE: + XPUSHs (boolSV (self->flags & ix)); + void max_depth (JSON *self, UV max_depth = 0x80000000UL) PPCODE: { @@ -1450,6 +1557,12 @@ XPUSHs (ST (0)); } +U32 get_max_depth (JSON *self) + CODE: + RETVAL = DEC_DEPTH (self->flags); + OUTPUT: + RETVAL + void max_size (JSON *self, UV max_size = 0) PPCODE: { @@ -1466,6 +1579,12 @@ XPUSHs (ST (0)); } +int get_max_size (JSON *self) + CODE: + RETVAL = DEC_SIZE (self->flags); + OUTPUT: + RETVAL + void filter_json_object (JSON *self, SV *cb = &PL_sv_undef) PPCODE: { @@ -1521,14 +1640,14 @@ PROTOTYPES: ENABLE -void to_json (SV *scalar) +void encode_json (SV *scalar) PPCODE: { JSON json = { F_DEFAULT | F_UTF8 }; XPUSHs (encode_json (scalar, &json)); } -void from_json (SV *jsonstr) +void decode_json (SV *jsonstr) PPCODE: { JSON json = { F_DEFAULT | F_UTF8 };