--- JSON-XS/XS.xs 2009/07/03 09:38:30 1.97 +++ JSON-XS/XS.xs 2011/08/11 17:06:53 1.112 @@ -14,12 +14,13 @@ #endif // some old perls do not have this, try to make it work, no -// guarentees, though. if it breaks, you get to keep the pieces. +// guarantees, though. if it breaks, you get to keep the pieces. #ifndef UTF8_MAXBYTES # define UTF8_MAXBYTES 13 #endif -#define IVUV_MAXCHARS (sizeof (UV) * CHAR_BIT * 28 / 93 + 2) +// three extra for rounding, sign, and end of string +#define IVUV_MAXCHARS (sizeof (UV) * CHAR_BIT * 28 / 93 + 3) #define F_ASCII 0x00000001UL #define F_LATIN1 0x00000002UL @@ -43,6 +44,8 @@ #define SHORT_STRING_LEN 16384 // special-case strings of up to this size +#define DECODE_WANTS_OCTETS(json) ((json)->flags & F_UTF8) + #define SB do { #define SE } while (0) @@ -78,6 +81,8 @@ INCR_M_WS = 0, // initial whitespace skipping, must be 0 INCR_M_STR, // inside string INCR_M_BS, // inside backslash + INCR_M_C0, // inside comment in initial whitespace sequence + INCR_M_C1, // inside comment in other places INCR_M_JSON // outside anything, count nesting }; @@ -187,6 +192,101 @@ } ///////////////////////////////////////////////////////////////////////////// +// fp hell + +// scan a group of digits, and a trailing exponent +static void +json_atof_scan1 (const char *s, NV *accum, int *expo, int postdp, int maxdepth) +{ + UV uaccum = 0; + int eaccum = 0; + + // if we recurse too deep, skip all remaining digits + // to avoid a stack overflow attack + if (expect_false (--maxdepth <= 0)) + while (((U8)*s - '0') < 10) + ++s; + + for (;;) + { + U8 dig = (U8)*s - '0'; + + if (expect_false (dig >= 10)) + { + if (dig == (U8)((U8)'.' - (U8)'0')) + { + ++s; + json_atof_scan1 (s, accum, expo, 1, maxdepth); + } + else if ((dig | ' ') == 'e' - '0') + { + int exp2 = 0; + int neg = 0; + + ++s; + + if (*s == '-') + { + ++s; + neg = 1; + } + else if (*s == '+') + ++s; + + while ((dig = (U8)*s - '0') < 10) + exp2 = exp2 * 10 + *s++ - '0'; + + *expo += neg ? -exp2 : exp2; + } + + break; + } + + ++s; + + uaccum = uaccum * 10 + dig; + ++eaccum; + + // if we have too many digits, then recurse for more + // we actually do this for rather few digits + if (uaccum >= (UV_MAX - 9) / 10) + { + if (postdp) *expo -= eaccum; + json_atof_scan1 (s, accum, expo, postdp, maxdepth); + if (postdp) *expo += eaccum; + + break; + } + } + + // this relies greatly on the quality of the pow () + // implementation of the platform, but a good + // implementation is hard to beat. + // (IEEE 754 conformant ones are required to be exact) + if (postdp) *expo -= eaccum; + *accum += uaccum * Perl_pow (10., *expo); + *expo += eaccum; +} + +static NV +json_atof (const char *s) +{ + NV accum = 0.; + int expo = 0; + int neg = 0; + + if (*s == '-') + { + ++s; + neg = 1; + } + + // a recursion depth of ten gives us >>500 bits + json_atof_scan1 (s, &accum, &expo, 0, 10); + + return neg ? -accum : accum; +} +///////////////////////////////////////////////////////////////////////////// // encoder // structure used for encoding JSON @@ -473,7 +573,7 @@ // actually, this is mostly due to the stupid so-called // security workaround added somewhere in 5.8.x // that randomises hash orderings - if (enc->json.flags & F_CANONICAL) + if (enc->json.flags & F_CANONICAL && !SvRMAGICAL (hv)) { int count = hv_iterinit (hv); @@ -761,6 +861,7 @@ SvPOK_only (enc.sv); encode_sv (&enc, scalar); + encode_nl (&enc); SvCUR_set (enc.sv, enc.cur - SvPVX (enc.sv)); *SvEND (enc.sv) = 0; // many xs functions expect a trailing 0 for text strings @@ -950,11 +1051,10 @@ else if (ch >= 0x80) { STRLEN clen; - UV uch; --dec_cur; - uch = decode_utf8 (dec_cur, dec->end - dec_cur, &clen); + decode_utf8 (dec_cur, dec->end - dec_cur, &clen); if (clen == (STRLEN)-1) ERR ("malformed UTF-8 character in JSON string"); @@ -1118,20 +1218,16 @@ len -= *start == '-' ? 1 : 0; // does not fit into IV or UV, try NV - if ((sizeof (NV) == sizeof (double) && DBL_DIG >= len) - #if defined (LDBL_DIG) - || (sizeof (NV) == sizeof (long double) && LDBL_DIG >= len) - #endif - ) + if (len <= NV_DIG) // fits into NV without loss of precision - return newSVnv (Atof (start)); + return newSVnv (json_atof (start)); // everything else fails, convert it to a string return newSVpvn (start, dec->cur - start); } // loss of precision here - return newSVnv (Atof (start)); + return newSVnv (json_atof (start)); fail: return 0; @@ -1310,6 +1406,7 @@ ENTER; SAVETMPS; PUSHMARK (SP); XPUSHs (HeVAL (he)); + sv_2mortal (sv); PUTBACK; count = call_sv (HeVAL (cb), G_ARRAY); SPAGAIN; @@ -1320,6 +1417,7 @@ return sv; } + SvREFCNT_inc (sv); FREETMPS; LEAVE; } } @@ -1425,10 +1523,12 @@ SV *sv; /* work around bugs in 5.10 where manipulating magic values - * will perl ignore the magic in subsequent accesses + * will perl ignore the magic in subsequent accesses. + * also make a copy of non-PV values, to get them into a clean + * state (SvPV should do that, but it's buggy, see below). */ /*SvGETMAGIC (string);*/ - if (SvMAGICAL (string)) + if (SvMAGICAL (string) || !SvPOK (string)) string = sv_2mortal (newSVsv (string)); SvUPGRADE (string, SVt_PV); @@ -1455,7 +1555,7 @@ (unsigned long)SvCUR (string), (unsigned long)json->max_size); } - if (json->flags & F_UTF8) + if (DECODE_WANTS_OCTETS (json)) sv_utf8_downgrade (string, 0); else sv_utf8_upgrade (string); @@ -1527,19 +1627,30 @@ { const char *p = SvPVX (self->incr_text) + self->incr_pos; + // the state machine here is a bit convoluted and could be simplified a lot + // but this would make it slower, so... + for (;;) { //printf ("loop pod %d *p<%c><%s>, mode %d nest %d\n", p - SvPVX (self->incr_text), *p, p, self->incr_mode, self->incr_nest);//D switch (self->incr_mode) { - // only used for intiial whitespace skipping + // only used for initial whitespace skipping case INCR_M_WS: for (;;) { if (*p > 0x20) { - self->incr_mode = INCR_M_JSON; - goto incr_m_json; + if (*p == '#') + { + self->incr_mode = INCR_M_C0; + goto incr_m_c; + } + else + { + self->incr_mode = INCR_M_JSON; + goto incr_m_json; + } } else if (!*p) goto interrupt; @@ -1556,6 +1667,25 @@ self->incr_mode = INCR_M_STR; goto incr_m_str; + // inside #-style comments + case INCR_M_C0: + case INCR_M_C1: + incr_m_c: + for (;;) + { + if (*p == '\n') + { + self->incr_mode = self->incr_mode == INCR_M_C0 ? INCR_M_WS : INCR_M_JSON; + break; + } + else if (!*p) + goto interrupt; + + ++p; + } + + break; + // inside a string case INCR_M_STR: incr_m_str: @@ -1623,6 +1753,11 @@ case '}': if (--self->incr_nest <= 0) goto interrupt; + break; + + case '#': + self->incr_mode = INCR_M_C1; + goto incr_m_c; } } } @@ -1633,6 +1768,7 @@ interrupt: self->incr_pos = p - SvPVX (self->incr_text); + //printf ("interrupt<%.*s>\n", self->incr_pos, SvPVX(self->incr_text));//D //printf ("return pos %d mode %d nest %d\n", self->incr_pos, self->incr_mode, self->incr_nest);//D } @@ -1657,6 +1793,8 @@ json_true = get_bool ("JSON::XS::true"); json_false = get_bool ("JSON::XS::false"); + + CvNODEBUG_on (get_cv ("JSON::XS::incr_text", 0)); /* the debugger completely breaks lvalue subs */ } PROTOTYPES: DISABLE @@ -1798,24 +1936,36 @@ if (!self->incr_text) self->incr_text = newSVpvn ("", 0); + /* if utf8-ness doesn't match the decoder, need to upgrade/downgrade */ + if (!DECODE_WANTS_OCTETS (self) == !SvUTF8 (self->incr_text)) + if (DECODE_WANTS_OCTETS (self)) + { + if (self->incr_pos) + self->incr_pos = utf8_length ((U8 *)SvPVX (self->incr_text), + (U8 *)SvPVX (self->incr_text) + self->incr_pos); + + sv_utf8_downgrade (self->incr_text, 0); + } + else + { + sv_utf8_upgrade (self->incr_text); + + if (self->incr_pos) + self->incr_pos = utf8_hop ((U8 *)SvPVX (self->incr_text), self->incr_pos) + - (U8 *)SvPVX (self->incr_text); + } + // append data, if any if (jsonstr) { - if (SvUTF8 (jsonstr)) - { - if (!SvUTF8 (self->incr_text)) - { - /* utf-8-ness differs, need to upgrade */ - sv_utf8_upgrade (self->incr_text); - - if (self->incr_pos) - self->incr_pos = utf8_hop ((U8 *)SvPVX (self->incr_text), self->incr_pos) - - (U8 *)SvPVX (self->incr_text); - } - } - else if (SvUTF8 (self->incr_text)) - sv_utf8_upgrade (jsonstr); + /* make sure both strings have same encoding */ + if (SvUTF8 (jsonstr) != SvUTF8 (self->incr_text)) + if (SvUTF8 (jsonstr)) + sv_utf8_downgrade (jsonstr, 0); + else + sv_utf8_upgrade (jsonstr); + /* and then just blindly append */ { STRLEN len; const char *str = SvPV (jsonstr, len); @@ -1844,7 +1994,16 @@ (unsigned long)self->incr_pos, (unsigned long)self->max_size); if (!INCR_DONE (self)) - break; + { + // as an optimisation, do not accumulate white space in the incr buffer + if (self->incr_mode == INCR_M_WS && self->incr_pos) + { + self->incr_pos = 0; + SvCUR_set (self->incr_text, 0); + } + + break; + } } XPUSHs (decode_json (self->incr_text, self, &offset)); @@ -1924,4 +2083,3 @@ XPUSHs (decode_json (jsonstr, &json, 0)); } -