--- JSON-XS/XS.xs 2008/03/19 04:08:22 1.72 +++ JSON-XS/XS.xs 2008/03/26 02:35:45 1.82 @@ -6,6 +6,7 @@ #include #include #include +#include #include #if defined(__BORLANDC__) || defined(_MSC_VER) @@ -18,6 +19,8 @@ # define UTF8_MAXBYTES 13 #endif +#define IVUV_MAXCHARS (sizeof (UV) * CHAR_BIT * 28 / 93 + 2) + #define F_ASCII 0x00000001UL #define F_LATIN1 0x00000002UL #define F_UTF8 0x00000004UL @@ -77,10 +80,25 @@ static HV *json_stash, *json_boolean_stash; // JSON::XS:: static SV *json_true, *json_false; +enum { + INCR_M_WS = 0, // initial whitespace skipping, must be 0 + INCR_M_STR, // inside string + INCR_M_BS, // inside backslash + INCR_M_JSON // outside anything, count nesting +}; + +#define INCR_DONE(json) (!(json)->incr_nest && (json)->incr_mode == INCR_M_JSON) + typedef struct { U32 flags; SV *cb_object; HV *cb_sk_object; + + // for the incremental parser + SV *incr_text; // the source text so far + STRLEN incr_pos; // the current offset into the text + int incr_nest; // {[]}-nesting level + int incr_mode; } JSON; ///////////////////////////////////////////////////////////////////////////// @@ -125,13 +143,20 @@ INLINE unsigned char * encode_utf8 (unsigned char *s, UV ch) { - if (ch <= 0x7FF) - { - *s++ = (ch >> 6) | 0xc0; - *s++ = (ch & 0x3f) | 0x80; - } - else - s = uvuni_to_utf8_flags (s, ch, 0); + if (expect_false (ch < 0x000080)) + *s++ = ch; + else if (expect_true (ch < 0x000800)) + *s++ = 0xc0 | ( ch >> 6), + *s++ = 0x80 | ( ch & 0x3f); + else if ( ch < 0x010000) + *s++ = 0xe0 | ( ch >> 12), + *s++ = 0x80 | ((ch >> 6) & 0x3f), + *s++ = 0x80 | ( ch & 0x3f); + else if ( ch < 0x110000) + *s++ = 0xf0 | ( ch >> 18), + *s++ = 0x80 | ((ch >> 12) & 0x3f), + *s++ = 0x80 | ((ch >> 6) & 0x3f), + *s++ = 0x80 | ( ch & 0x3f); return s; } @@ -229,9 +254,9 @@ if (uch < 0x80/*0x20*/ || uch >= enc->limit) { - if (uch > 0xFFFFUL) + if (uch >= 0x10000UL) { - if (uch > 0x10FFFFUL) + if (uch >= 0x110000UL) croak ("out of range codepoint (0x%lx) encountered, unrepresentable 
in JSON", (unsigned long)uch); need (enc, len += 11); @@ -415,7 +440,6 @@ encode_hv (enc_t *enc, HV *hv) { HE *he; - int count; if (enc->indent >= enc->maxdepth) croak ("data structure too deep (hit recursion limit)"); @@ -640,21 +664,16 @@ } else if (SvIOKp (sv)) { - // we assume we can always read an IV as a UV - if (SvUV (sv) & ~(UV)0x7fff) - { - // large integer, use the (rather slow) snprintf way. - need (enc, sizeof (UV) * 3); - enc->cur += - SvIsUV(sv) - ? snprintf (enc->cur, sizeof (UV) * 3, "%"UVuf, (UV)SvUVX (sv)) - : snprintf (enc->cur, sizeof (UV) * 3, "%"IVdf, (IV)SvIVX (sv)); - } - else + // we assume we can always read an IV as a UV and vice versa + // we assume two's complement + // we assume no aliasing issues in the union + if (SvIsUV (sv) ? SvUVX (sv) <= 59000 + : SvIVX (sv) <= 59000 && SvIVX (sv) >= -59000) { // optimise the "small number case" // code will likely be branchless and use only a single multiplication - I32 i = SvIV (sv); + // works for numbers up to 59074 + I32 i = SvIVX (sv); U32 u; char digit, nz = 0; @@ -670,13 +689,22 @@ // and multiplying by 5 while moving the decimal point one to the right, // resulting in a net multiplication by 10. // we always write the digit to memory but conditionally increment - // the pointer, to ease the usage of conditional move instructions. - digit = u >> 28; *enc->cur = digit + '0'; enc->cur += (nz = nz || digit); u = (u & 0xfffffff) * 5; - digit = u >> 27; *enc->cur = digit + '0'; enc->cur += (nz = nz || digit); u = (u & 0x7ffffff) * 5; - digit = u >> 26; *enc->cur = digit + '0'; enc->cur += (nz = nz || digit); u = (u & 0x3ffffff) * 5; - digit = u >> 25; *enc->cur = digit + '0'; enc->cur += (nz = nz || digit); u = (u & 0x1ffffff) * 5; + // the pointer, to enable the use of conditional move instructions. 
+ digit = u >> 28; *enc->cur = digit + '0'; enc->cur += (nz = nz || digit); u = (u & 0xfffffffUL) * 5; + digit = u >> 27; *enc->cur = digit + '0'; enc->cur += (nz = nz || digit); u = (u & 0x7ffffffUL) * 5; + digit = u >> 26; *enc->cur = digit + '0'; enc->cur += (nz = nz || digit); u = (u & 0x3ffffffUL) * 5; + digit = u >> 25; *enc->cur = digit + '0'; enc->cur += (nz = nz || digit); u = (u & 0x1ffffffUL) * 5; digit = u >> 24; *enc->cur = digit + '0'; enc->cur += 1; // correctly generate '0' } + else + { + // large integer, use the (rather slow) snprintf way. + need (enc, IVUV_MAXCHARS); + enc->cur += + SvIsUV(sv) + ? snprintf (enc->cur, IVUV_MAXCHARS, "%"UVuf, (UV)SvUVX (sv)) + : snprintf (enc->cur, IVUV_MAXCHARS, "%"IVdf, (IV)SvIVX (sv)); + } } else if (SvROK (sv)) encode_rv (enc, SvRV (sv)); @@ -703,7 +731,7 @@ enc.maxdepth = DEC_DEPTH (enc.json.flags); enc.limit = enc.json.flags & F_ASCII ? 0x000080UL : enc.json.flags & F_LATIN1 ? 0x000100UL - : 0x10FFFFUL; + : 0x110000UL; SvPOK_only (enc.sv); encode_sv (&enc, scalar); @@ -891,7 +919,7 @@ ERR ("illegal backslash escape sequence in string"); } } - else if (expect_true (ch >= 0x20 && ch <= 0x7f)) + else if (expect_true (ch >= 0x20 && ch < 0x80)) *cur++ = ch; else if (ch >= 0x80) { @@ -1159,7 +1187,7 @@ for (;;) { - // the >= 0x80 is true on most architectures + // the >= 0x80 is false on most architectures if (p == e || *p < 0x20 || *p >= 0x80 || *p == '\\') { // slow path, back up and use decode_str @@ -1300,12 +1328,12 @@ decode_sv (dec_t *dec) { // the beauty of JSON: you need exactly one character lookahead - // to parse anything. + // to parse everything. 
switch (*dec->cur) { case '"': ++dec->cur; return decode_str (dec); - case '[': ++dec->cur; return decode_av (dec); - case '{': ++dec->cur; return decode_hv (dec); + case '[': ++dec->cur; return decode_av (dec); + case '{': ++dec->cur; return decode_hv (dec); case '-': case '0': case '1': case '2': case '3': case '4': @@ -1361,10 +1389,10 @@ } static SV * -decode_json (SV *string, JSON *json, UV *offset_return) +decode_json (SV *string, JSON *json, STRLEN *offset_return) { dec_t dec; - UV offset; + STRLEN offset; SV *sv; SvGETMAGIC (string); @@ -1447,6 +1475,122 @@ } ///////////////////////////////////////////////////////////////////////////// +// incremental parser + +static void +incr_parse (JSON *self) +{ + const char *p = SvPVX (self->incr_text) + self->incr_pos; + + for (;;) + { + //printf ("loop pod %d *p<%c><%s>, mode %d nest %d\n", p - SvPVX (self->incr_text), *p, p, self->incr_mode, self->incr_nest);//D + switch (self->incr_mode) + { + // only used for initial whitespace skipping + case INCR_M_WS: + for (;;) + { + if (*p > 0x20) + { + self->incr_mode = INCR_M_JSON; + goto incr_m_json; + } + else if (!*p) + goto interrupt; + + ++p; + } + + // skip a single char inside a string (for \\-processing) + case INCR_M_BS: + if (!*p) + goto interrupt; + + ++p; + self->incr_mode = INCR_M_STR; + goto incr_m_str; + + // inside a string + case INCR_M_STR: + incr_m_str: + for (;;) + { + if (*p == '"') + { + ++p; + self->incr_mode = INCR_M_JSON; + + if (!self->incr_nest) + goto interrupt; + + goto incr_m_json; + } + else if (*p == '\\') + { + ++p; // "virtually" consumes the character after the backslash + + if (!*p) // if at end of string we have to switch modes + { + self->incr_mode = INCR_M_BS; + goto interrupt; + } + } + else if (!*p) + goto interrupt; + + ++p; + } + + // after initial ws, outside string + case INCR_M_JSON: + incr_m_json: + for (;;) + { + switch (*p++) + { + case 0: + --p; + goto interrupt; + + case 0x09: + case 0x0a: + case 0x0d: + case 0x20: + if (!self->incr_nest) 
+ { + --p; // do not eat the whitespace, let the next round do it + goto interrupt; + } + break; + + case '"': + self->incr_mode = INCR_M_STR; + goto incr_m_str; + + case '[': + case '{': + ++self->incr_nest; + break; + + case ']': + case '}': + if (!--self->incr_nest) + goto interrupt; + } + } + } + + modechange: + ; + } + +interrupt: + self->incr_pos = p - SvPVX (self->incr_text); + //printf ("return pos %d mode %d nest %d\n", self->incr_pos, self->incr_mode, self->incr_nest);//D +} + +///////////////////////////////////////////////////////////////////////////// // XS interface functions MODULE = JSON::XS PACKAGE = JSON::XS @@ -1616,30 +1760,113 @@ void decode_prefix (JSON *self, SV *jsonstr) PPCODE: { - UV offset; + STRLEN offset; EXTEND (SP, 2); PUSHs (decode_json (jsonstr, self, &offset)); PUSHs (sv_2mortal (newSVuv (offset))); } +void incr_parse (JSON *self, SV *jsonstr = 0) + PPCODE: +{ + if (!self->incr_text) + self->incr_text = newSVpvn ("", 0); + + // append data, if any + if (jsonstr) + { + if (SvUTF8 (jsonstr) && !SvUTF8 (self->incr_text)) + { + /* utf-8-ness differs, need to upgrade */ + sv_utf8_upgrade (self->incr_text); + + if (self->incr_pos) + self->incr_pos = utf8_hop ((U8 *)SvPVX (self->incr_text), self->incr_pos) + - (U8 *)SvPVX (self->incr_text); + } + + { + STRLEN len; + const char *str = SvPV (jsonstr, len); + SvGROW (self->incr_text, SvCUR (self->incr_text) + len + 1); + Move (str, SvEND (self->incr_text), len, char); + SvCUR_set (self->incr_text, SvCUR (self->incr_text) + len); + *SvEND (self->incr_text) = 0; // this should basically be a nop, too, but make sure it's there + } + } + + if (GIMME_V != G_VOID) + do + { + STRLEN offset; + + if (!INCR_DONE (self)) + { + incr_parse (self); + if (!INCR_DONE (self)) + break; + } + + XPUSHs (decode_json (self->incr_text, self, &offset)); + + sv_chop (self->incr_text, SvPV_nolen (self->incr_text) + offset); + self->incr_pos -= offset; + self->incr_nest = 0; + self->incr_mode = 0; + } + while (GIMME_V 
== G_ARRAY); +} + +SV *incr_text (JSON *self) + ATTRS: lvalue + CODE: +{ + if (self->incr_pos) + croak ("incr_text can not be called when the incremental parser already started parsing"); + + RETVAL = self->incr_text ? SvREFCNT_inc (self->incr_text) : &PL_sv_undef; +} + OUTPUT: + RETVAL + +void incr_skip (JSON *self) + CODE: +{ + if (self->incr_pos) + { + sv_chop (self->incr_text, SvPV_nolen (self->incr_text) + self->incr_pos); + self->incr_pos = 0; + self->incr_nest = 0; + self->incr_mode = 0; + } +} + void DESTROY (JSON *self) CODE: SvREFCNT_dec (self->cb_sk_object); SvREFCNT_dec (self->cb_object); + SvREFCNT_dec (self->incr_text); PROTOTYPES: ENABLE void encode_json (SV *scalar) + ALIAS: + to_json_ = 0 + encode_json = F_UTF8 PPCODE: { - JSON json = { F_DEFAULT | F_UTF8 }; + JSON json = { F_DEFAULT | ix }; XPUSHs (encode_json (scalar, &json)); } void decode_json (SV *jsonstr) + ALIAS: + from_json_ = 0 + decode_json = F_UTF8 PPCODE: { - JSON json = { F_DEFAULT | F_UTF8 }; + JSON json = { F_DEFAULT | ix }; XPUSHs (decode_json (jsonstr, &json, 0)); } +