/*
 * Patch summary: JSON-XS/XS.xs, revision 1.4 to 1.8 (unified diff).
 *
 * The line structure of this diff has been collapsed: hunk lines are run
 * together and separated by single spaces, with "-" and "+" tokens marking
 * removed and added source lines.  It will presumably need its newlines
 * restored before patch(1) can apply it.
 *
 * Changes, in hunk order:
 *
 *  - Adds the F_SHRINK flag (0x00000100) and a static shrink() helper that
 *    calls sv_utf8_downgrade (sv, 1) and, when the SvPV_shrink_to_cur macro
 *    is defined, SvPV_shrink_to_cur (sv).  shrink() is called on enc.sv
 *    before it is returned and on decoded strings when F_SHRINK is set, and
 *    "shrink" is added to the ALIAS list of the flag-setter XSUB.
 *
 *  - String encoding: the printable range (ch >= 0x20 && ch < 0x80) is now
 *    tested first, with the double quote and backslash handled inside it.
 *    Bytes 010, 011, 012, 014 and 015 (octal) get the two-character escapes
 *    b, t, n, f and r; before, only 015 and 012 had short escapes.
 *    Everything else takes the UTF-8 / u-escape path.  The malformed-UTF-8
 *    test changes from "clen < 0" to "clen == (STRLEN)-1" (STRLEN is
 *    presumably unsigned, which would make the old test always false), and
 *    the last fallback branch now uses need (enc, len += 10) instead of
 *    need (enc, 10).
 *
 *  - Hash entry sorting with he_cmp_slow: the temporary COP is now
 *    initialised as a copy of *PL_curcop, and PL_curcop is saved with
 *    SAVEVPTR between ENTER and LEAVE instead of being restored by hand.
 *
 *  - String decoding: separate error messages for a missing low and a
 *    missing high surrogate, and the second test now checks hi instead of
 *    lo.  Unknown backslash escapes step dec->cur back by one and raise
 *    "illegal backslash escape sequence in string".  Copying a multi-byte
 *    character sets utf8 = 1.  Reaching dec->end inside a string gets its
 *    own error message.
 *
 *  - Number decoding: a digit is now required in the integer part, after
 *    the decimal point and after the optional exponent sign; is_nv is set
 *    only after the corresponding digits have been consumed.
 *
 *  - Array and object decoding: after WS, an immediately following closing
 *    bracket or brace is consumed and the element loop is skipped.
 *
 *  - The "null" atom now returns newSVsv (&PL_sv_undef) instead of
 *    newSViv (1).
 *
 *  - Top-level decode: with F_UTF8 the input is passed to
 *    sv_utf8_downgrade (string, 0), otherwise to sv_utf8_upgrade; the
 *    "*dec.end = 1" / "*dec.end = 0" sentinel writes are removed; the error
 *    offset is a plain pointer difference when F_UTF8 is set and
 *    utf8_distance otherwise; pv_uni_display runs with PL_curcop pointing at
 *    a COP copy whose cop_warnings is pWARN_NONE.
 *
 *  - The flag-setter XSUB parameter becomes "int enable = 1".
 *
 * NOTE(review): in the uch > 0xFFFF branch, need (enc, len += 11) is followed
 * by a sprintf that writes 12 characters plus a terminating NUL -- confirm
 * that need() leaves room for the extra byte.  The %04x conversions receive
 * UV-typed expressions; confirm UV matches unsigned int on all targets or
 * add a cast.
 *
 * NOTE(review): in the object loop, key is released with SvREFCNT_dec only on
 * the value-failure path; no release is visible after hv_store_ent.  This
 * looks like a leak unless hv_store_ent takes ownership of the key -- verify
 * against perlapi.
 *
 * NOTE(review): the F_PRETTY macro body is an unparenthesized chain of "|"
 * operators (context line, not changed by this patch).
 */
--- JSON-XS/XS.xs 2007/03/22 21:13:58 1.4 +++ JSON-XS/XS.xs 2007/03/23 16:13:59 1.8 @@ -14,6 +14,7 @@ #define F_SPACE_AFTER 0x00000020 #define F_JSON_RPC 0x00000040 #define F_ALLOW_NONREF 0x00000080 +#define F_SHRINK 0x00000100 #define F_PRETTY F_INDENT | F_SPACE_BEFORE | F_SPACE_AFTER #define F_DEFAULT 0 @@ -55,6 +56,15 @@ return &SvUVX (SvRV (sv)); } +static void +shrink (SV *sv) +{ + sv_utf8_downgrade (sv, 1); +#ifdef SvPV_shrink_to_cur + SvPV_shrink_to_cur (sv); +#endif +} + ///////////////////////////////////////////////////////////////////////////// static void @@ -87,91 +97,92 @@ { unsigned char ch = *(unsigned char *)str; - if (ch == '"') - { - need (enc, len += 1); - *enc->cur++ = '\\'; - *enc->cur++ = '"'; - ++str; - } - else if (ch == '\\') - { - need (enc, len += 1); - *enc->cur++ = '\\'; - *enc->cur++ = '\\'; - ++str; - } - else if (ch >= 0x20 && ch < 0x80) // most common case - { - *enc->cur++ = ch; - ++str; - } - else if (ch == '\015') - { - need (enc, len += 1); - *enc->cur++ = '\\'; - *enc->cur++ = 'r'; - ++str; - } - else if (ch == '\012') - { - need (enc, len += 1); - *enc->cur++ = '\\'; - *enc->cur++ = 'n'; - ++str; - } - else + if (ch >= 0x20 && ch < 0x80) // most common case { - STRLEN clen; - UV uch; - - if (is_utf8) + if (ch == '"') // but with slow exceptions { - uch = utf8n_to_uvuni (str, end - str, &clen, UTF8_CHECK_ONLY); - if (clen < 0) - croak ("malformed UTF-8 character in string, cannot convert to JSON"); + need (enc, len += 1); + *enc->cur++ = '\\'; + *enc->cur++ = '"'; } - else + else if (ch == '\\') { - uch = ch; - clen = 1; + need (enc, len += 1); + *enc->cur++ = '\\'; + *enc->cur++ = '\\'; } + else + *enc->cur++ = ch; - if (uch < 0x80 || enc->flags & F_ASCII) + ++str; + } + else + { + switch (ch) { - if (uch > 0xFFFFUL) - { - need (enc, len += 11); - sprintf (enc->cur, "\\u%04x\\u%04x", - (uch - 0x10000) / 0x400 + 0xD800, - (uch - 0x10000) % 0x400 + 0xDC00); - enc->cur += 12; - } - else + case '\010': need (enc, len += 1); 
*enc->cur++ = '\\'; *enc->cur++ = 'b'; ++str; break; + case '\011': need (enc, len += 1); *enc->cur++ = '\\'; *enc->cur++ = 't'; ++str; break; + case '\012': need (enc, len += 1); *enc->cur++ = '\\'; *enc->cur++ = 'n'; ++str; break; + case '\014': need (enc, len += 1); *enc->cur++ = '\\'; *enc->cur++ = 'f'; ++str; break; + case '\015': need (enc, len += 1); *enc->cur++ = '\\'; *enc->cur++ = 'r'; ++str; break; + + default: { - static char hexdigit [16] = "0123456789abcdef"; - need (enc, len += 5); - *enc->cur++ = '\\'; - *enc->cur++ = 'u'; - *enc->cur++ = hexdigit [ uch >> 12 ]; - *enc->cur++ = hexdigit [(uch >> 8) & 15]; - *enc->cur++ = hexdigit [(uch >> 4) & 15]; - *enc->cur++ = hexdigit [(uch >> 0) & 15]; - } + STRLEN clen; + UV uch; - str += clen; - } - else if (is_utf8) - { - need (enc, len += clen); - while (clen--) - *enc->cur++ = *str++; - } - else - { - need (enc, 10); // never more than 11 bytes needed - enc->cur = uvuni_to_utf8_flags (enc->cur, uch, 0); - ++str; + if (is_utf8) + { + uch = utf8n_to_uvuni (str, end - str, &clen, UTF8_CHECK_ONLY); + if (clen == (STRLEN)-1) + croak ("malformed UTF-8 character in string, cannot convert to JSON"); + } + else + { + uch = ch; + clen = 1; + } + + if (uch < 0x80 || enc->flags & F_ASCII) + { + if (uch > 0xFFFFUL) + { + need (enc, len += 11); + sprintf (enc->cur, "\\u%04x\\u%04x", + (uch - 0x10000) / 0x400 + 0xD800, + (uch - 0x10000) % 0x400 + 0xDC00); + enc->cur += 12; + } + else + { + static char hexdigit [16] = "0123456789abcdef"; + need (enc, len += 5); + *enc->cur++ = '\\'; + *enc->cur++ = 'u'; + *enc->cur++ = hexdigit [ uch >> 12 ]; + *enc->cur++ = hexdigit [(uch >> 8) & 15]; + *enc->cur++ = hexdigit [(uch >> 4) & 15]; + *enc->cur++ = hexdigit [(uch >> 0) & 15]; + } + + str += clen; + } + else if (is_utf8) + { + need (enc, len += clen); + do + { + *enc->cur++ = *str++; + } + while (--clen); + } + else + { + need (enc, len += 10); // never more than 11 bytes needed + enc->cur = uvuni_to_utf8_flags (enc->cur, 
uch, 0); + ++str; + } + } } } @@ -308,16 +319,20 @@ qsort (hes, count, sizeof (HE *), he_cmp_fast); else { - // hack to disable "use bytes" - COP *oldcop = PL_curcop, cop; + // hack to forcefully disable "use bytes" + COP cop = *PL_curcop; cop.op_private = 0; - PL_curcop = &cop; + ENTER; SAVETMPS; + + SAVEVPTR (PL_curcop); + PL_curcop = &cop; + qsort (hes, count, sizeof (HE *), he_cmp_slow); - FREETMPS; - PL_curcop = oldcop; + FREETMPS; + LEAVE; } for (i = 0; i < count; ++i) @@ -424,6 +439,10 @@ SvUTF8_on (enc.sv); SvCUR_set (enc.sv, enc.cur - SvPVX (enc.sv)); + + if (enc.flags & F_SHRINK) + shrink (enc.sv); + return enc.sv; } @@ -531,7 +550,7 @@ if (hi >= 0xd800 && hi < 0xdc00) { if (dec->cur [0] != '\\' || dec->cur [1] != 'u') - ERR ("illegal surrogate character"); + ERR ("missing low surrogate character in surrogate pair"); dec->cur += 2; @@ -544,8 +563,8 @@ hi = (hi - 0xD800) * 0x400 + (lo - 0xDC00) + 0x10000; } - else if (lo >= 0xdc00 && lo < 0xe000) - ERR ("illegal surrogate character"); + else if (hi >= 0xdc00 && hi < 0xe000) + ERR ("missing high surrogate character in surrogate pair"); if (hi >= 0x80) { @@ -558,6 +577,10 @@ APPEND_CH (hi); } break; + + default: + --dec->cur; + ERR ("illegal backslash escape sequence in string"); } } else if (ch >= 0x20 && ch <= 0x7f) @@ -566,14 +589,20 @@ { STRLEN clen; UV uch = utf8n_to_uvuni (dec->cur, dec->end - dec->cur, &clen, UTF8_CHECK_ONLY); - if (clen < 0) - ERR ("malformed UTF-8 character in string, cannot convert to JSON"); + if (clen == (STRLEN)-1) + ERR ("malformed UTF-8 character in JSON string"); APPEND_GROW (clen); - memcpy (cur, dec->cur, clen); - cur += clen; - dec->cur += clen; + do + { + *cur++ = *dec->cur++; + } + while (--clen); + + utf8 = 1; } + else if (dec->cur == dec->end) + ERR ("unexpected end of string while parsing json string"); else ERR ("invalid character encountered"); } @@ -588,6 +617,9 @@ if (utf8) SvUTF8_on (sv); + if (dec->flags & F_SHRINK) + shrink (sv); + return sv; fail: @@ -611,34 
+643,50 @@ if (*dec->cur >= '0' && *dec->cur <= '9') ERR ("malformed number (leading zero must not be followed by another digit)"); } - - // int - while (*dec->cur >= '0' && *dec->cur <= '9') - ++dec->cur; + else if (*dec->cur < '0' || *dec->cur > '9') + ERR ("malformed number (no digits after initial minus)"); + else + do + { + ++dec->cur; + } + while (*dec->cur >= '0' && *dec->cur <= '9'); // [frac] if (*dec->cur == '.') { - is_nv = 1; + ++dec->cur; + + if (*dec->cur < '0' || *dec->cur > '9') + ERR ("malformed number (no digits after decimal point)"); do { ++dec->cur; } while (*dec->cur >= '0' && *dec->cur <= '9'); + + is_nv = 1; } // [exp] if (*dec->cur == 'e' || *dec->cur == 'E') { - is_nv = 1; - ++dec->cur; + if (*dec->cur == '-' || *dec->cur == '+') ++dec->cur; - while (*dec->cur >= '0' && *dec->cur <= '9') - ++dec->cur; + if (*dec->cur < '0' || *dec->cur > '9') + ERR ("malformed number (no digits after exp sign)"); + + do + { + ++dec->cur; + } + while (*dec->cur >= '0' && *dec->cur <= '9'); + + is_nv = 1; } if (!is_nv) @@ -666,29 +714,33 @@ { AV *av = newAV (); - for (;;) - { - SV *value; + WS; + if (*dec->cur == ']') + ++dec->cur; + else + for (;;) + { + SV *value; - value = decode_sv (dec); - if (!value) - goto fail; + value = decode_sv (dec); + if (!value) + goto fail; - av_push (av, value); + av_push (av, value); - WS; + WS; - if (*dec->cur == ']') - { - ++dec->cur; - break; - } - - if (*dec->cur != ',') - ERR (", or ] expected while parsing array"); + if (*dec->cur == ']') + { + ++dec->cur; + break; + } + + if (*dec->cur != ',') + ERR (", or ] expected while parsing array"); - ++dec->cur; - } + ++dec->cur; + } return newRV_noinc ((SV *)av); @@ -702,41 +754,45 @@ { HV *hv = newHV (); - for (;;) - { - SV *key, *value; + WS; + if (*dec->cur == '}') + ++dec->cur; + else + for (;;) + { + SV *key, *value; - WS; EXPECT_CH ('"'); + WS; EXPECT_CH ('"'); - key = decode_str (dec); - if (!key) - goto fail; + key = decode_str (dec); + if (!key) + goto fail; - WS; 
EXPECT_CH (':'); + WS; EXPECT_CH (':'); - value = decode_sv (dec); - if (!value) - { - SvREFCNT_dec (key); - goto fail; - } + value = decode_sv (dec); + if (!value) + { + SvREFCNT_dec (key); + goto fail; + } - //TODO: optimise - hv_store_ent (hv, key, value, 0); + //TODO: optimise + hv_store_ent (hv, key, value, 0); - WS; + WS; - if (*dec->cur == '}') - { - ++dec->cur; - break; - } + if (*dec->cur == '}') + { + ++dec->cur; + break; + } - if (*dec->cur != ',') - ERR (", or } expected while parsing object/hash"); + if (*dec->cur != ',') + ERR (", or } expected while parsing object/hash"); - ++dec->cur; - } + ++dec->cur; + } return newRV_noinc ((SV *)hv); @@ -786,7 +842,7 @@ if (dec->end - dec->cur >= 4 && !memcmp (dec->cur, "null", 4)) { dec->cur += 4; - return newSViv (1); + return newSVsv (&PL_sv_undef); } else ERR ("'null' expected"); @@ -794,7 +850,7 @@ break; default: - ERR ("malformed json string"); + ERR ("malformed json string, neither array, object, number, string or atom"); break; } @@ -807,7 +863,9 @@ { SV *sv; - if (!(flags & F_UTF8)) + if (flags & F_UTF8) + sv_utf8_downgrade (string, 0); + else sv_utf8_upgrade (string); SvGROW (string, SvCUR (string) + 1); // should basically be a NOP @@ -818,17 +876,25 @@ dec.end = SvEND (string); dec.err = 0; - *dec.end = 1; // invalid anywhere sv = decode_sv (&dec); - *dec.end = 0; if (!sv) { - IV offset = utf8_distance (dec.cur, SvPVX (string)); + IV offset = dec.flags & F_UTF8 + ? dec.cur - SvPVX (string) + : utf8_distance (dec.cur, SvPVX (string)); SV *uni = sv_newmortal (); + // horrible hack to silence warning inside pv_uni_display + COP cop = *PL_curcop; + cop.cop_warnings = pWARN_NONE; + ENTER; + SAVEVPTR (PL_curcop); + PL_curcop = &cop; pv_uni_display (uni, dec.cur, dec.end - dec.cur, 20, UNI_DISPLAY_QQ); - croak ("%s, at character %d (%s)", + LEAVE; + + croak ("%s, at character offset %d (%s)", dec.err, (int)offset, dec.cur != dec.end ? 
SvPV_nolen (uni) : "(end of string)"); @@ -869,7 +935,7 @@ OUTPUT: RETVAL -SV *ascii (SV *self, int enable) +SV *ascii (SV *self, int enable = 1) ALIAS: ascii = F_ASCII utf8 = F_UTF8 @@ -880,6 +946,7 @@ json_rpc = F_JSON_RPC pretty = F_PRETTY allow_nonref = F_ALLOW_NONREF + shrink = F_SHRINK CODE: { UV *uv = SvJSON (self);