--- CBOR-XS/XS.xs 2013/11/30 17:19:34 1.35 +++ CBOR-XS/XS.xs 2014/01/05 14:24:54 1.43 @@ -99,7 +99,9 @@ #define F_SHRINK 0x00000001UL #define F_ALLOW_UNKNOWN 0x00000002UL #define F_ALLOW_SHARING 0x00000004UL -#define F_PACK_STRINGS 0x00000008UL +#define F_ALLOW_CYCLES 0x00000008UL +#define F_PACK_STRINGS 0x00000010UL +#define F_VALIDATE_UTF8 0x00000020UL #define INIT_SIZE 32 // initial scalar size to be allocated @@ -128,6 +130,11 @@ U32 max_depth; STRLEN max_size; SV *filter; + + // for the incremental parser + STRLEN incr_pos; // the current offset into the text + STRLEN incr_need; // minimum bytes needed to decode + AV *incr_count; // for every nesting level, the number of outstanding values, or -1 for indef. } CBOR; ecb_inline void @@ -141,6 +148,7 @@ cbor_free (CBOR *cbor) { SvREFCNT_dec (cbor->filter); + SvREFCNT_dec (cbor->incr_count); } ///////////////////////////////////////////////////////////////////////////// @@ -230,18 +238,18 @@ if (ecb_expect_true (len < LENGTH_EXT1)) *enc->cur++ = major | len; - else if (ecb_expect_true (len <= 0xff)) + else if (ecb_expect_true (len <= 0xffU)) { *enc->cur++ = major | LENGTH_EXT1; *enc->cur++ = len; } - else if (len <= 0xffff) + else if (len <= 0xffffU) { *enc->cur++ = major | LENGTH_EXT2; *enc->cur++ = len >> 8; *enc->cur++ = len; } - else if (len <= 0xffffffff) + else if (len <= 0xffffffffU) { *enc->cur++ = major | LENGTH_EXT4; *enc->cur++ = len >> 24; @@ -629,49 +637,49 @@ if (ecb_expect_true (m < LENGTH_EXT1)) return m; - - switch (m) + else if (ecb_expect_true (m == LENGTH_EXT1)) + { + WANT (1); + dec->cur += 1; + return dec->cur[-1]; + } + else if (ecb_expect_true (m == LENGTH_EXT2)) + { + WANT (2); + dec->cur += 2; + return (((UV)dec->cur[-2]) << 8) + | ((UV)dec->cur[-1]); + } + else if (ecb_expect_true (m == LENGTH_EXT4)) { - case LENGTH_EXT1: - WANT (1); - dec->cur += 1; - return dec->cur[-1]; - - case LENGTH_EXT2: - WANT (2); - dec->cur += 2; - return (((UV)dec->cur[-2]) << 8) - | ((UV)dec->cur[-1]); - - 
case LENGTH_EXT4: - WANT (4); - dec->cur += 4; - return (((UV)dec->cur[-4]) << 24) - | (((UV)dec->cur[-3]) << 16) - | (((UV)dec->cur[-2]) << 8) - | ((UV)dec->cur[-1]); - - case LENGTH_EXT8: - WANT (8); - dec->cur += 8; + WANT (4); + dec->cur += 4; + return (((UV)dec->cur[-4]) << 24) + | (((UV)dec->cur[-3]) << 16) + | (((UV)dec->cur[-2]) << 8) + | ((UV)dec->cur[-1]); + } + else if (ecb_expect_true (m == LENGTH_EXT8)) + { + WANT (8); + dec->cur += 8; - return + return #if UVSIZE < 8 - 0 + 0 #else - (((UV)dec->cur[-8]) << 56) - | (((UV)dec->cur[-7]) << 48) - | (((UV)dec->cur[-6]) << 40) - | (((UV)dec->cur[-5]) << 32) + (((UV)dec->cur[-8]) << 56) + | (((UV)dec->cur[-7]) << 48) + | (((UV)dec->cur[-6]) << 40) + | (((UV)dec->cur[-5]) << 32) #endif - | (((UV)dec->cur[-4]) << 24) - | (((UV)dec->cur[-3]) << 16) - | (((UV)dec->cur[-2]) << 8) - | ((UV)dec->cur[-1]); - - default: - ERR ("corrupted CBOR data (unsupported integer minor encoding)"); + | (((UV)dec->cur[-4]) << 24) + | (((UV)dec->cur[-3]) << 16) + | (((UV)dec->cur[-2]) << 8) + | ((UV)dec->cur[-1]); } + else + ERR ("corrupted CBOR data (unsupported integer minor encoding)"); fail: return 0; @@ -707,6 +715,7 @@ { int i, len = decode_uint (dec); + WANT (len); // complexity check for av_fill - need at least one byte per value, do not allow supersize arrays av_fill (av, len - 1); for (i = 0; i < len; ++i) @@ -729,29 +738,27 @@ // byte or utf-8 strings as keys, but only when !stringref if (ecb_expect_true (!dec->stringref)) - if ((*dec->cur - MAJOR_BYTES) <= 27) + if (ecb_expect_true ((U8)(*dec->cur - MAJOR_BYTES) <= LENGTH_EXT8)) { I32 len = decode_uint (dec); char *key = (char *)dec->cur; dec->cur += len; - if (ecb_expect_false (dec->stringref)) - av_push (dec->stringref, newSVpvn (key, len)); - hv_store (hv, key, len, decode_sv (dec), 0); return; } - else if ((*dec->cur - MAJOR_TEXT) <= 27) + else if (ecb_expect_true ((U8)(*dec->cur - MAJOR_TEXT) <= LENGTH_EXT8)) { I32 len = decode_uint (dec); char *key = (char 
*)dec->cur; dec->cur += len; - if (ecb_expect_false (dec->stringref)) - av_push (dec->stringref, newSVpvn_utf8 (key, len, 1)); + if (ecb_expect_false (dec->cbor.flags & F_VALIDATE_UTF8)) + if (!is_utf8_string (key, len)) + ERR ("corrupted CBOR data (invalid UTF-8 in map key)"); hv_store (hv, key, -len, decode_sv (dec), 0); @@ -763,6 +770,9 @@ hv_store_ent (hv, k, v, 0); SvREFCNT_dec (k); + +fail: + ; } static SV * @@ -854,7 +864,13 @@ } if (utf8) - SvUTF8_on (sv); + { + if (ecb_expect_false (dec->cbor.flags & F_VALIDATE_UTF8)) + if (!is_utf8_string (SvPVX (sv), SvCUR (sv))) + ERR ("corrupted CBOR data (invalid UTF-8 in text string)"); + + SvUTF8_on (sv); + } return sv; @@ -913,12 +929,22 @@ if (ecb_expect_false (!dec->shareable)) dec->shareable = (AV *)sv_2mortal ((SV *)newAV ()); - sv = newSV (0); - av_push (dec->shareable, SvREFCNT_inc_NN (sv)); + if (dec->cbor.flags & F_ALLOW_CYCLES) + { + sv = newSV (0); + av_push (dec->shareable, SvREFCNT_inc_NN (sv)); - SV *osv = decode_sv (dec); - sv_setsv (sv, osv); - SvREFCNT_dec_NN (osv); + SV *osv = decode_sv (dec); + sv_setsv (sv, osv); + SvREFCNT_dec_NN (osv); + } + else + { + av_push (dec->shareable, &PL_sv_undef); + int idx = AvFILLp (dec->shareable); + sv = decode_sv (dec); + av_store (dec->shareable, idx, SvREFCNT_inc_NN (sv)); + } } break; @@ -933,6 +959,9 @@ ERR ("corrupted CBOR data (sharedref index out of bounds)"); sv = SvREFCNT_inc_NN (AvARRAY (dec->shareable)[idx]); + + if (sv == &PL_sv_undef) + ERR ("cyclic CBOR data structure found, but allow_cycles is not enabled"); } break; @@ -1111,9 +1140,11 @@ } // 0..19 unassigned simple - // 24 reserved + unassigned (reserved values are not encodable) + // 24 reserved + unassigned simple (reserved values are not encodable) + // 28-30 unassigned misc + // 31 break code default: - ERR ("corrupted CBOR data (reserved/unassigned major 7 value)"); + ERR ("corrupted CBOR data (reserved/unassigned/unexpected major 7 value)"); } break; @@ -1150,6 +1181,17 @@ if (dec.err) { 
+ if (dec.shareable) + { + // need to break cyclic links, which would all be in shareable + int i; + SV **svp; + + for (i = av_len (dec.shareable) + 1; i--; ) + if ((svp = av_fetch (dec.shareable, i, 0))) + sv_setsv (*svp, &PL_sv_undef); + } + SvREFCNT_dec (sv); croak ("%s, at offset %d (octet 0x%02x)", dec.err, dec.cur - (U8 *)data, (int)(uint8_t)*dec.cur); } @@ -1160,6 +1202,126 @@ } ///////////////////////////////////////////////////////////////////////////// +// incremental parser + +#define INCR_DONE(cbor) (AvFILLp (cbor->incr_count) < 0) + +// returns 0 for notyet, 1 for success or error +static int +incr_parse (CBOR *self, SV *cborstr) +{ + STRLEN cur; + SvPV (cborstr, cur); + + while (ecb_expect_true (self->incr_need <= cur)) + { + // table of integer count bytes + static I8 incr_len[MINOR_MASK + 1] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 1, 2, 4, 8,-1,-1,-1,-2 + }; + + const U8 *p = SvPVX (cborstr) + self->incr_pos; + U8 m = *p & MINOR_MASK; + IV count = SvIVX (AvARRAY (self->incr_count)[AvFILLp (self->incr_count)]); + I8 ilen = incr_len[m]; + + self->incr_need = self->incr_pos + 1; + + if (ecb_expect_false (ilen < 0)) + { + if (m != MINOR_INDEF) + return 1; // error + + if (*p == (MAJOR_MISC | MINOR_INDEF)) + { + if (count >= 0) + return 1; // error + + count = 1; + } + else + { + av_push (self->incr_count, newSViv (-1)); //TODO: nest + count = -1; + } + } + else + { + self->incr_need += ilen; + if (ecb_expect_false (self->incr_need > cur)) + return 0; + + int major = *p >> MAJOR_SHIFT; + + switch (major) + { + case MAJOR_BYTES >> MAJOR_SHIFT: + case MAJOR_TEXT >> MAJOR_SHIFT: + case MAJOR_ARRAY >> MAJOR_SHIFT: + case MAJOR_MAP >> MAJOR_SHIFT: + { + UV len; + + if (ecb_expect_false (ilen)) + { + len = 0; + + do { + len = (len << 8) | *++p; + } while (--ilen); + } + else + len = m; + + switch (major) + { + case MAJOR_BYTES >> MAJOR_SHIFT: + case MAJOR_TEXT >> MAJOR_SHIFT: + self->incr_need += len; + if 
(ecb_expect_false (self->incr_need > cur)) + return 0; + + break; + + case MAJOR_MAP >> MAJOR_SHIFT: + len <<= 1; + case MAJOR_ARRAY >> MAJOR_SHIFT: + if (len) + { + av_push (self->incr_count, newSViv (len + 1)); //TODO: nest + count = len + 1; + } + break; + } + } + } + } + + self->incr_pos = self->incr_need; + + if (count > 0) + { + while (!--count) + { + if (!AvFILLp (self->incr_count)) + return 1; // done + + SvREFCNT_dec_NN (av_pop (self->incr_count)); + count = SvIVX (AvARRAY (self->incr_count)[AvFILLp (self->incr_count)]); + } + + SvIVX (AvARRAY (self->incr_count)[AvFILLp (self->incr_count)]) = count; + } + } + + return 0; +} + + +///////////////////////////////////////////////////////////////////////////// // XS interface functions MODULE = CBOR::XS PACKAGE = CBOR::XS @@ -1208,7 +1370,9 @@ shrink = F_SHRINK allow_unknown = F_ALLOW_UNKNOWN allow_sharing = F_ALLOW_SHARING + allow_cycles = F_ALLOW_CYCLES pack_strings = F_PACK_STRINGS + validate_utf8 = F_VALIDATE_UTF8 PPCODE: { if (enable) @@ -1224,7 +1388,9 @@ get_shrink = F_SHRINK get_allow_unknown = F_ALLOW_UNKNOWN get_allow_sharing = F_ALLOW_SHARING + get_allow_cycles = F_ALLOW_CYCLES get_pack_strings = F_PACK_STRINGS + get_validate_utf8 = F_VALIDATE_UTF8 PPCODE: XPUSHs (boolSV (self->flags & ix)); @@ -1283,6 +1449,58 @@ PUSHs (sv_2mortal (newSVuv (offset - SvPVX (cborstr)))); } +void incr_parse (CBOR *self, SV *cborstr) + ALIAS: + incr_parse_multiple = 1 + PPCODE: +{ + if (SvUTF8 (cborstr)) + sv_utf8_downgrade (cborstr, 0); + + if (!self->incr_count) + { + self->incr_count = newAV (); + self->incr_pos = 0; + self->incr_need = 1; + + av_push (self->incr_count, newSViv (1)); + } + + do + { + if (!incr_parse (self, cborstr)) + { + if (self->incr_need > self->max_size && self->max_size) + croak ("attempted decode of CBOR text of %lu bytes size, but max_size is set to %lu", + (unsigned long)self->incr_need, (unsigned long)self->max_size); + + break; + } + + SV *sv; + char *offset; + + PUTBACK; sv = decode_cbor 
(cborstr, self, &offset); SPAGAIN; + XPUSHs (sv); + + sv_chop (cborstr, offset); + + av_clear (self->incr_count); + av_push (self->incr_count, newSViv (1)); + + self->incr_pos = 0; + self->incr_need = self->incr_pos + 1; + } + while (ix); +} + +void incr_reset (CBOR *self) + CODE: +{ + SvREFCNT_dec (self->incr_count); + self->incr_count = 0; +} + void DESTROY (CBOR *self) PPCODE: cbor_free (self); @@ -1290,10 +1508,14 @@ PROTOTYPES: ENABLE void encode_cbor (SV *scalar) + ALIAS: + encode_cbor = 0 + encode_cbor_sharing = F_ALLOW_SHARING PPCODE: { CBOR cbor; cbor_init (&cbor); + cbor.flags |= ix; PUTBACK; scalar = encode_cbor (scalar, &cbor); SPAGAIN; XPUSHs (scalar); }