--- CBOR-XS/XS.xs 2013/10/29 18:37:31 1.15 +++ CBOR-XS/XS.xs 2014/01/05 14:24:54 1.43 @@ -21,38 +21,87 @@ #ifndef HvNAMEUTF8 # define HvNAMEUTF8(hv) 0 #endif +#ifndef SvREFCNT_dec_NN +# define SvREFCNT_dec_NN(sv) SvREFCNT_dec (sv) +#endif + +// known major and minor types +enum cbor_type +{ + MAJOR_SHIFT = 5, + MINOR_MASK = 0x1f, + + MAJOR_POS_INT = 0 << MAJOR_SHIFT, + MAJOR_NEG_INT = 1 << MAJOR_SHIFT, + MAJOR_BYTES = 2 << MAJOR_SHIFT, + MAJOR_TEXT = 3 << MAJOR_SHIFT, + MAJOR_ARRAY = 4 << MAJOR_SHIFT, + MAJOR_MAP = 5 << MAJOR_SHIFT, + MAJOR_TAG = 6 << MAJOR_SHIFT, + MAJOR_MISC = 7 << MAJOR_SHIFT, + + // INT/STRING/ARRAY/MAP subtypes + LENGTH_EXT1 = 24, + LENGTH_EXT2 = 25, + LENGTH_EXT4 = 26, + LENGTH_EXT8 = 27, + + // SIMPLE types (effectively MISC subtypes) + SIMPLE_FALSE = 20, + SIMPLE_TRUE = 21, + SIMPLE_NULL = 22, + SIMPLE_UNDEF = 23, + + // MISC subtype (unused) + MISC_EXT1 = 24, + MISC_FLOAT16 = 25, + MISC_FLOAT32 = 26, + MISC_FLOAT64 = 27, + + // BYTES/TEXT/ARRAY/MAP + MINOR_INDEF = 31, +}; // known tags enum cbor_tag { - // inofficial extensions (pending iana registration) - CBOR_TAG_PERL_OBJECT = 256, - CBOR_TAG_GENERIC_OBJECT = 257, - - // rfc7049 - CBOR_TAG_DATETIME = 0, // rfc4287, utf-8 - CBOR_TAG_TIMESTAMP = 1, // unix timestamp, any - CBOR_TAG_POS_BIGNUM = 2, // byte string - CBOR_TAG_NEG_BIGNUM = 3, // byte string - CBOR_TAG_DECIMAL = 4, // decimal fraction, array - CBOR_TAG_BIGFLOAT = 5, // array - - CBOR_TAG_CONV_B64U = 21, // base64url, any - CBOR_TAG_CONV_B64 = 22, // base64, any - CBOR_TAG_CONV_HEX = 23, // base16, any - CBOR_TAG_CBOR = 24, // embedded cbor, byte string - - CBOR_TAG_URI = 32, // URI rfc3986, utf-8 - CBOR_TAG_B64U = 33, // base64url rfc4648, utf-8 - CBOR_TAG_B64 = 34, // base6 rfc46484, utf-8 - CBOR_TAG_REGEX = 35, // regex pcre/ecma262, utf-8 - CBOR_TAG_MIME = 36, // mime message rfc2045, utf-8 + // extensions + CBOR_TAG_STRINGREF = 25, // http://cbor.schmorp.de/stringref + CBOR_TAG_PERL_OBJECT = 26, // http://cbor.schmorp.de/perl-object + CBOR_TAG_GENERIC_OBJECT = 27, // http://cbor.schmorp.de/generic-object + CBOR_TAG_VALUE_SHAREABLE = 28, // http://cbor.schmorp.de/value-sharing + CBOR_TAG_VALUE_SHAREDREF = 29, // http://cbor.schmorp.de/value-sharing + CBOR_TAG_STRINGREF_NAMESPACE = 256, // http://cbor.schmorp.de/stringref + CBOR_TAG_INDIRECTION = 22098, // http://cbor.schmorp.de/indirection + + // rfc7049 + CBOR_TAG_DATETIME = 0, // rfc4287, utf-8 + CBOR_TAG_TIMESTAMP = 1, // unix timestamp, any + CBOR_TAG_POS_BIGNUM = 2, // byte string + CBOR_TAG_NEG_BIGNUM = 3, // byte string + CBOR_TAG_DECIMAL = 4, // decimal fraction, array + CBOR_TAG_BIGFLOAT = 5, // array + + CBOR_TAG_CONV_B64U = 21, // base64url, any + CBOR_TAG_CONV_B64 = 22, // base64, any + CBOR_TAG_CONV_HEX = 23, // base16, any + CBOR_TAG_CBOR = 24, // embedded cbor, byte string + + CBOR_TAG_URI = 32, // URI rfc3986, utf-8 + CBOR_TAG_B64U = 33, // base64url rfc4648, utf-8 + CBOR_TAG_B64 = 34, // base6 rfc46484, utf-8 + CBOR_TAG_REGEX = 35, // regex pcre/ecma262, utf-8 + CBOR_TAG_MIME = 36, // mime message rfc2045, utf-8 - CBOR_TAG_MAGIC = 55799 // self-describe cbor + CBOR_TAG_MAGIC = 55799, // self-describe cbor }; -#define F_SHRINK 0x00000200UL -#define F_ALLOW_UNKNOWN 0x00002000UL +#define F_SHRINK 0x00000001UL +#define F_ALLOW_UNKNOWN 0x00000002UL +#define F_ALLOW_SHARING 0x00000004UL +#define F_ALLOW_CYCLES 0x00000008UL +#define F_PACK_STRINGS 0x00000010UL +#define F_VALIDATE_UTF8 0x00000020UL #define INIT_SIZE 32 // initial scalar size to be allocated @@ -74,12 +123,18 @@ #endif static HV *cbor_stash, *types_boolean_stash, *types_error_stash, *cbor_tagged_stash; // CBOR::XS:: -static SV *types_true, *types_false, *types_error, *sv_cbor; +static SV *types_true, *types_false, *types_error, *sv_cbor, *default_filter; typedef struct { U32 flags; U32 max_depth; STRLEN max_size; + SV *filter; + + // for the incremental parser + STRLEN incr_pos; // the current offset into the text + STRLEN incr_need; // minimum bytes needed to decode + AV *incr_count; // for every nesting level, the number of outstanding values, or -1 for indef. } CBOR; ecb_inline void @@ -89,6 +144,13 @@ cbor->max_depth = 512; } +ecb_inline void +cbor_free (CBOR *cbor) +{ + SvREFCNT_dec (cbor->filter); + SvREFCNT_dec (cbor->incr_count); +} + ///////////////////////////////////////////////////////////////////////////// // utility functions @@ -118,10 +180,20 @@ } } -///////////////////////////////////////////////////////////////////////////// -// fp hell - -//TODO +// minimum length of a string to be registered for stringref +ecb_inline int +minimum_string_length (UV idx) +{ + return idx > 23 + ? idx > 0xffU + ? idx > 0xffffU + ? idx > 0xffffffffU + ? 11 + : 7 + : 5 + : 4 + : 3; +} ///////////////////////////////////////////////////////////////////////////// // encoder @@ -134,6 +206,10 @@ SV *sv; // result scalar CBOR cbor; U32 depth; // recursion level + HV *stringref[2]; // string => index, or 0 ([0] = bytes, [1] = utf-8) + UV stringref_idx; + HV *shareable; // ptr => index, or 0 + UV shareable_idx; } enc_t; ecb_inline void @@ -160,22 +236,22 @@ { need (enc, 9); - if (len < 24) + if (ecb_expect_true (len < LENGTH_EXT1)) *enc->cur++ = major | len; - else if (len <= 0xff) + else if (ecb_expect_true (len <= 0xffU)) { - *enc->cur++ = major | 24; + *enc->cur++ = major | LENGTH_EXT1; *enc->cur++ = len; } - else if (len <= 0xffff) + else if (len <= 0xffffU) { - *enc->cur++ = major | 25; + *enc->cur++ = major | LENGTH_EXT2; *enc->cur++ = len >> 8; *enc->cur++ = len; } - else if (len <= 0xffffffff) + else if (len <= 0xffffffffU) { - *enc->cur++ = major | 26; + *enc->cur++ = major | LENGTH_EXT4; *enc->cur++ = len >> 24; *enc->cur++ = len >> 16; *enc->cur++ = len >> 8; @@ -183,7 +259,7 @@ } else { - *enc->cur++ = major | 27; + *enc->cur++ = major | LENGTH_EXT8; *enc->cur++ = len >> 56; *enc->cur++ = len >> 48; *enc->cur++ = len >> 40; @@ -195,15 +271,46 @@ } } -static void +ecb_inline void +encode_tag (enc_t *enc, UV tag) +{ + encode_uint (enc, MAJOR_TAG, tag); +} + +ecb_inline void encode_str (enc_t *enc, int utf8, char *str, STRLEN len) { - encode_uint (enc, utf8 ? 0x60 : 0x40, len); + encode_uint (enc, utf8 ? MAJOR_TEXT : MAJOR_BYTES, len); need (enc, len); memcpy (enc->cur, str, len); enc->cur += len; } +static void +encode_strref (enc_t *enc, int utf8, char *str, STRLEN len) +{ + if (ecb_expect_false (enc->cbor.flags & F_PACK_STRINGS)) + { + SV **svp = hv_fetch (enc->stringref[!!utf8], str, len, 1); + + if (SvOK (*svp)) + { + // already registered, use stringref + encode_tag (enc, CBOR_TAG_STRINGREF); + encode_uint (enc, MAJOR_POS_INT, SvUV (*svp)); + return; + } + else if (len >= minimum_string_length (enc->stringref_idx)) + { + // register only + sv_setuv (*svp, enc->stringref_idx); + ++enc->stringref_idx; + } + } + + encode_str (enc, utf8, str, len); +} + static void encode_sv (enc_t *enc, SV *sv); static void @@ -216,7 +323,7 @@ ++enc->depth; - encode_uint (enc, 0x80, len + 1); + encode_uint (enc, MAJOR_ARRAY, len + 1); for (i = 0; i <= len; ++i) { @@ -241,22 +348,22 @@ int mg = SvMAGICAL (hv); if (mg) - encode_ch (enc, 0xa0 | 31); + encode_ch (enc, MAJOR_MAP | MINOR_INDEF); else - encode_uint (enc, 0xa0, pairs); + encode_uint (enc, MAJOR_MAP, pairs); while ((he = hv_iternext (hv))) { if (HeKLEN (he) == HEf_SVKEY) encode_sv (enc, HeSVKEY (he)); else - encode_str (enc, HeKUTF8 (he), HeKEY (he), HeKLEN (he)); + encode_strref (enc, HeKUTF8 (he), HeKEY (he), HeKLEN (he)); encode_sv (enc, ecb_expect_false (mg) ? hv_iterval (hv, he) : HeVAL (he)); } if (mg) - encode_ch (enc, 0xe0 | 31); + encode_ch (enc, MAJOR_MISC | MINOR_INDEF); --enc->depth; } @@ -265,10 +372,9 @@ static void encode_rv (enc_t *enc, SV *sv) { - svtype svt; - SvGETMAGIC (sv); - svt = SvTYPE (sv); + + svtype svt = SvTYPE (sv); if (ecb_expect_false (SvOBJECT (sv))) { @@ -283,21 +389,57 @@ : gv_stashpv ("CBOR::XS::Tagged" , 1); HV *stash = SvSTASH (sv); - GV *method; if (stash == boolean_stash) - encode_ch (enc, SvIV (sv) ? 0xe0 | 21 : 0xe0 | 20); + { + encode_ch (enc, SvIV (sv) ? MAJOR_MISC | SIMPLE_TRUE : MAJOR_MISC | SIMPLE_FALSE); + return; + } else if (stash == error_stash) - encode_ch (enc, 0xe0 | 23); + { + encode_ch (enc, MAJOR_MISC | SIMPLE_UNDEF); + return; + } else if (stash == tagged_stash) { if (svt != SVt_PVAV) croak ("encountered CBOR::XS::Tagged object that isn't an array"); - encode_uint (enc, 0xc0, SvUV (*av_fetch ((AV *)sv, 0, 1))); + encode_uint (enc, MAJOR_TAG, SvUV (*av_fetch ((AV *)sv, 0, 1))); encode_sv (enc, *av_fetch ((AV *)sv, 1, 1)); + + return; + } + } + + if (ecb_expect_false (SvREFCNT (sv) > 1) + && ecb_expect_false (enc->cbor.flags & F_ALLOW_SHARING)) + { + if (!enc->shareable) + enc->shareable = (HV *)sv_2mortal ((SV *)newHV ()); + + SV **svp = hv_fetch (enc->shareable, (char *)&sv, sizeof (sv), 1); + + if (SvOK (*svp)) + { + encode_tag (enc, CBOR_TAG_VALUE_SHAREDREF); + encode_uint (enc, MAJOR_POS_INT, SvUV (*svp)); + return; } - else if ((method = gv_fetchmethod_autoload (stash, "TO_CBOR", 0))) + else + { + sv_setuv (*svp, enc->shareable_idx); + ++enc->shareable_idx; + encode_tag (enc, CBOR_TAG_VALUE_SHAREABLE); + } + } + + if (ecb_expect_false (SvOBJECT (sv))) + { + HV *stash = SvSTASH (sv); + GV *method; + + if ((method = gv_fetchmethod_autoload (stash, "TO_CBOR", 0))) { dSP; @@ -338,9 +480,9 @@ if (count == 1 && SvROK (TOPs) && SvRV (TOPs) == sv) croak ("%s::FREEZE(CBOR) method returned same object as was passed instead of a new one", HvNAME (stash)); - encode_uint (enc, 0xc0, CBOR_TAG_PERL_OBJECT); - encode_uint (enc, 0x80, count + 1); - encode_str (enc, HvNAMEUTF8 (stash), HvNAME (stash), HvNAMELEN (stash)); + encode_tag (enc, CBOR_TAG_PERL_OBJECT); + encode_uint (enc, MAJOR_ARRAY, count + 1); + encode_strref (enc, HvNAMEUTF8 (stash), HvNAME (stash), HvNAMELEN (stash)); while (count) encode_sv (enc, SP[1 - count--]); @@ -357,26 +499,11 @@ encode_hv (enc, (HV *)sv); else if (svt == SVt_PVAV) encode_av (enc, (AV *)sv); - else if (svt < SVt_PVAV) + else { - STRLEN len = 0; - char *pv = svt ? SvPV (sv, len) : 0; - - if (len == 1 && *pv == '1') - encode_ch (enc, 0xe0 | 21); - else if (len == 1 && *pv == '0') - encode_ch (enc, 0xe0 | 20); - else if (enc->cbor.flags & F_ALLOW_UNKNOWN) - encode_ch (enc, 0xe0 | 23); - else - croak ("cannot encode reference to scalar '%s' unless the scalar is 0 or 1", - SvPV_nolen (sv_2mortal (newRV_inc (sv)))); + encode_tag (enc, CBOR_TAG_INDIRECTION); + encode_sv (enc, sv); } - else if (enc->cbor.flags & F_ALLOW_UNKNOWN) - encode_ch (enc, 0xe0 | 23); - else - croak ("encountered %s, but CBOR can only represent references to arrays or hashes", - SvPV_nolen (sv_2mortal (newRV_inc (sv)))); } static void @@ -386,14 +513,14 @@ need (enc, 9); - if (ecb_expect_false (nv == (U32)nv)) - encode_uint (enc, 0x00, (U32)nv); + if (ecb_expect_false (nv == (NV)(U32)nv)) + encode_uint (enc, MAJOR_POS_INT, (U32)nv); //TODO: maybe I32? else if (ecb_expect_false (nv == (float)nv)) { uint32_t fp = ecb_float_to_binary32 (nv); - *enc->cur++ = 0xe0 | 26; + *enc->cur++ = MAJOR_MISC | MISC_FLOAT32; if (!ecb_big_endian ()) fp = ecb_bswap32 (fp); @@ -405,7 +532,7 @@ { uint64_t fp = ecb_double_to_binary64 (nv); - *enc->cur++ = 0xe0 | 27; + *enc->cur++ = MAJOR_MISC | MISC_FLOAT64; if (!ecb_big_endian ()) fp = ecb_bswap64 (fp); @@ -424,25 +551,25 @@ { STRLEN len; char *str = SvPV (sv, len); - encode_str (enc, SvUTF8 (sv), str, len); + encode_strref (enc, SvUTF8 (sv), str, len); } else if (SvNOKp (sv)) encode_nv (enc, sv); else if (SvIOKp (sv)) { if (SvIsUV (sv)) - encode_uint (enc, 0x00, SvUVX (sv)); + encode_uint (enc, MAJOR_POS_INT, SvUVX (sv)); else if (SvIVX (sv) >= 0) - encode_uint (enc, 0x00, SvIVX (sv)); + encode_uint (enc, MAJOR_POS_INT, SvIVX (sv)); else - encode_uint (enc, 0x20, -(SvIVX (sv) + 1)); + encode_uint (enc, MAJOR_NEG_INT, -(SvIVX (sv) + 1)); } else if (SvROK (sv)) encode_rv (enc, SvRV (sv)); else if (!SvOK (sv)) - encode_ch (enc, 0xe0 | 22); + encode_ch (enc, MAJOR_MISC | SIMPLE_NULL); else if (enc->cbor.flags & F_ALLOW_UNKNOWN) - encode_ch (enc, 0xe0 | 23); + encode_ch (enc, MAJOR_MISC | SIMPLE_UNDEF); else croak ("encountered perl type (%s,0x%x) that CBOR cannot handle, check your input data", SvPV_nolen (sv), (unsigned int)SvFLAGS (sv)); @@ -451,15 +578,22 @@ static SV * encode_cbor (SV *scalar, CBOR *cbor) { - enc_t enc; + enc_t enc = { }; enc.cbor = *cbor; enc.sv = sv_2mortal (NEWSV (0, INIT_SIZE)); enc.cur = SvPVX (enc.sv); enc.end = SvEND (enc.sv); - enc.depth = 0; SvPOK_only (enc.sv); + + if (cbor->flags & F_PACK_STRINGS) + { + encode_tag (&enc, CBOR_TAG_STRINGREF_NAMESPACE); + enc.stringref[0]= (HV *)sv_2mortal ((SV *)newHV ()); + enc.stringref[1]= (HV *)sv_2mortal ((SV *)newHV ()); + } + encode_sv (&enc, scalar); SvCUR_set (enc.sv, enc.cur - SvPVX (enc.sv)); @@ -483,6 +617,9 @@ CBOR cbor; U32 depth; // recursion depth U32 maxdepth; // recursion depth limit + AV *shareable; + AV *stringref; + SV *decode_tagged; } dec_t; #define ERR(reason) SB if (!dec->err) dec->err = reason; goto fail; SE @@ -495,47 +632,54 @@ static UV decode_uint (dec_t *dec) { - switch (*dec->cur & 31) + U8 m = *dec->cur & MINOR_MASK; + ++dec->cur; + + if (ecb_expect_true (m < LENGTH_EXT1)) + return m; + else if (ecb_expect_true (m == LENGTH_EXT1)) + { + WANT (1); + dec->cur += 1; + return dec->cur[-1]; + } + else if (ecb_expect_true (m == LENGTH_EXT2)) + { + WANT (2); + dec->cur += 2; + return (((UV)dec->cur[-2]) << 8) + | ((UV)dec->cur[-1]); + } + else if (ecb_expect_true (m == LENGTH_EXT4)) { - case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7: - case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: - case 16: case 17: case 18: case 19: case 20: case 21: case 22: case 23: - return *dec->cur++ & 31; - - case 24: - WANT (2); - dec->cur += 2; - return dec->cur[-1]; - - case 25: - WANT (3); - dec->cur += 3; - return (((UV)dec->cur[-2]) << 8) - | ((UV)dec->cur[-1]); - - case 26: - WANT (5); - dec->cur += 5; - return (((UV)dec->cur[-4]) << 24) - | (((UV)dec->cur[-3]) << 16) - | (((UV)dec->cur[-2]) << 8) - | ((UV)dec->cur[-1]); - - case 27: - WANT (9); - dec->cur += 9; - return (((UV)dec->cur[-8]) << 56) - | (((UV)dec->cur[-7]) << 48) - | (((UV)dec->cur[-6]) << 40) - | (((UV)dec->cur[-5]) << 32) - | (((UV)dec->cur[-4]) << 24) - | (((UV)dec->cur[-3]) << 16) - | (((UV)dec->cur[-2]) << 8) - | ((UV)dec->cur[-1]); + WANT (4); + dec->cur += 4; + return (((UV)dec->cur[-4]) << 24) + | (((UV)dec->cur[-3]) << 16) + | (((UV)dec->cur[-2]) << 8) + | ((UV)dec->cur[-1]); + } + else if (ecb_expect_true (m == LENGTH_EXT8)) + { + WANT (8); + dec->cur += 8; - default: - ERR ("corrupted CBOR data (unsupported integer minor encoding)"); + return +#if UVSIZE < 8 + 0 +#else + (((UV)dec->cur[-8]) << 56) + | (((UV)dec->cur[-7]) << 48) + | (((UV)dec->cur[-6]) << 40) + | (((UV)dec->cur[-5]) << 32) +#endif + | (((UV)dec->cur[-4]) << 24) + | (((UV)dec->cur[-3]) << 16) + | (((UV)dec->cur[-2]) << 8) + | ((UV)dec->cur[-1]); } + else + ERR ("corrupted CBOR data (unsupported integer minor encoding)"); fail: return 0; @@ -550,7 +694,7 @@ DEC_INC_DEPTH; - if ((*dec->cur & 31) == 31) + if (*dec->cur == (MAJOR_ARRAY | MINOR_INDEF)) { ++dec->cur; @@ -558,7 +702,7 @@ { WANT (1); - if (*dec->cur == (0xe0 | 31)) + if (*dec->cur == (MAJOR_MISC | MINOR_INDEF)) { ++dec->cur; break; @@ -571,6 +715,7 @@ { int i, len = decode_uint (dec); + WANT (len); // complexity check for av_fill - need at least one byte per value, do not allow supersize arrays av_fill (av, len - 1); for (i = 0; i < len; ++i) @@ -586,6 +731,50 @@ return &PL_sv_undef; } +static void +decode_he (dec_t *dec, HV *hv) +{ + // for speed reasons, we specialcase single-string + // byte or utf-8 strings as keys, but only when !stringref + + if (ecb_expect_true (!dec->stringref)) + if (ecb_expect_true ((U8)(*dec->cur - MAJOR_BYTES) <= LENGTH_EXT8)) + { + I32 len = decode_uint (dec); + char *key = (char *)dec->cur; + + dec->cur += len; + + hv_store (hv, key, len, decode_sv (dec), 0); + + return; + } + else if (ecb_expect_true ((U8)(*dec->cur - MAJOR_TEXT) <= LENGTH_EXT8)) + { + I32 len = decode_uint (dec); + char *key = (char *)dec->cur; + + dec->cur += len; + + if (ecb_expect_false (dec->cbor.flags & F_VALIDATE_UTF8)) + if (!is_utf8_string (key, len)) + ERR ("corrupted CBOR data (invalid UTF-8 in map key)"); + + hv_store (hv, key, -len, decode_sv (dec), 0); + + return; + } + + SV *k = decode_sv (dec); + SV *v = decode_sv (dec); + + hv_store_ent (hv, k, v, 0); + SvREFCNT_dec (k); + +fail: + ; +} + static SV * decode_hv (dec_t *dec) { @@ -593,7 +782,7 @@ DEC_INC_DEPTH; - if ((*dec->cur & 31) == 31) + if (*dec->cur == (MAJOR_MAP | MINOR_INDEF)) { ++dec->cur; @@ -601,31 +790,21 @@ { WANT (1); - if (*dec->cur == (0xe0 | 31)) + if (*dec->cur == (MAJOR_MISC | MINOR_INDEF)) { ++dec->cur; break; } - SV *k = decode_sv (dec); - SV *v = decode_sv (dec); - - hv_store_ent (hv, k, v, 0); - SvREFCNT_dec (k); + decode_he (dec, hv); } } else { - int len = decode_uint (dec); + int pairs = decode_uint (dec); - while (len--) - { - SV *k = decode_sv (dec); - SV *v = decode_sv (dec); - - hv_store_ent (hv, k, v, 0); - SvREFCNT_dec (k); - } + while (pairs--) + decode_he (dec, hv); } DEC_DEC_DEPTH; @@ -642,24 +821,33 @@ { SV *sv = 0; - if ((*dec->cur & 31) == 31) + if ((*dec->cur & MINOR_MASK) == MINOR_INDEF) { + // indefinite length strings ++dec->cur; + U8 major = *dec->cur & MAJOR_MISC; + sv = newSVpvn ("", 0); - // not very fast, and certainly not robust against illegal input for (;;) { WANT (1); - if (*dec->cur == (0xe0 | 31)) - { - ++dec->cur; - break; - } + if ((*dec->cur - major) > LENGTH_EXT8) + if (*dec->cur == (MAJOR_MISC | MINOR_INDEF)) + { + ++dec->cur; + break; + } + else + ERR ("corrupted CBOR data (invalid chunks in indefinite length string)"); - sv_catsv (sv, decode_sv (dec)); + STRLEN len = decode_uint (dec); + + WANT (len); + sv_catpvn (sv, dec->cur, len); + dec->cur += len; } } else @@ -669,10 +857,20 @@ WANT (len); sv = newSVpvn (dec->cur, len); dec->cur += len; + + if (ecb_expect_false (dec->stringref) + && SvCUR (sv) >= minimum_string_length (AvFILLp (dec->stringref) + 1)) + av_push (dec->stringref, SvREFCNT_inc_NN (sv)); } if (utf8) - SvUTF8_on (sv); + { + if (ecb_expect_false (dec->cbor.flags & F_VALIDATE_UTF8)) + if (!is_utf8_string (SvPVX (sv), SvCUR (sv))) + ERR ("corrupted CBOR data (invalid UTF-8 in text string)"); + + SvUTF8_on (sv); + } return sv; @@ -684,67 +882,186 @@ static SV * decode_tagged (dec_t *dec) { + SV *sv = 0; UV tag = decode_uint (dec); - SV *sv = decode_sv (dec); - if (tag == CBOR_TAG_MAGIC) - return sv; - else if (tag == CBOR_TAG_PERL_OBJECT) - { - if (!SvROK (sv) || SvTYPE (SvRV (sv)) != SVt_PVAV) - ERR ("corrupted CBOR data (non-array perl object)"); - - AV *av = (AV *)SvRV (sv); - int len = av_len (av) + 1; - HV *stash = gv_stashsv (*av_fetch (av, 0, 1), 0); - - if (!stash) - ERR ("cannot decode perl-object (package does not exist)"); - - GV *method = gv_fetchmethod_autoload (stash, "THAW", 0); - - if (!method) - ERR ("cannot decode perl-object (package does not have a THAW method)"); - - dSP; - - ENTER; SAVETMPS; PUSHMARK (SP); - EXTEND (SP, len + 1); - // we re-bless the reference to get overload and other niceties right - PUSHs (*av_fetch (av, 0, 1)); - PUSHs (sv_cbor); - - int i; - - for (i = 1; i < len; ++i) - PUSHs (*av_fetch (av, i, 1)); - - PUTBACK; - call_sv ((SV *)GvCV (method), G_SCALAR); - SPAGAIN; + WANT (1); - SvREFCNT_dec (sv); - sv = SvREFCNT_inc (POPs); + switch (tag) + { + case CBOR_TAG_MAGIC: + sv = decode_sv (dec); + break; - PUTBACK; + case CBOR_TAG_INDIRECTION: + sv = newRV_noinc (decode_sv (dec)); + break; - FREETMPS; LEAVE; + case CBOR_TAG_STRINGREF_NAMESPACE: + { + ENTER; SAVETMPS; - return sv; - } - else - { - AV *av = newAV (); - av_push (av, newSVuv (tag)); - av_push (av, sv); + SAVESPTR (dec->stringref); + dec->stringref = (AV *)sv_2mortal ((SV *)newAV ()); - HV *tagged_stash = !CBOR_SLOW || cbor_tagged_stash - ? cbor_tagged_stash - : gv_stashpv ("CBOR::XS::Tagged" , 1); + sv = decode_sv (dec); - return sv_bless (newRV_noinc ((SV *)av), tagged_stash); + FREETMPS; LEAVE; + } + break; + + case CBOR_TAG_STRINGREF: + { + if ((*dec->cur >> MAJOR_SHIFT) != (MAJOR_POS_INT >> MAJOR_SHIFT)) + ERR ("corrupted CBOR data (stringref index not an unsigned integer)"); + + UV idx = decode_uint (dec); + + if (!dec->stringref || (int)idx > AvFILLp (dec->stringref)) + ERR ("corrupted CBOR data (stringref index out of bounds or outside namespace)"); + + sv = newSVsv (AvARRAY (dec->stringref)[idx]); + } + break; + + case CBOR_TAG_VALUE_SHAREABLE: + { + if (ecb_expect_false (!dec->shareable)) + dec->shareable = (AV *)sv_2mortal ((SV *)newAV ()); + + if (dec->cbor.flags & F_ALLOW_CYCLES) + { + sv = newSV (0); + av_push (dec->shareable, SvREFCNT_inc_NN (sv)); + + SV *osv = decode_sv (dec); + sv_setsv (sv, osv); + SvREFCNT_dec_NN (osv); + } + else + { + av_push (dec->shareable, &PL_sv_undef); + int idx = AvFILLp (dec->shareable); + sv = decode_sv (dec); + av_store (dec->shareable, idx, SvREFCNT_inc_NN (sv)); + } + } + break; + + case CBOR_TAG_VALUE_SHAREDREF: + { + if ((*dec->cur >> MAJOR_SHIFT) != (MAJOR_POS_INT >> MAJOR_SHIFT)) + ERR ("corrupted CBOR data (sharedref index not an unsigned integer)"); + + UV idx = decode_uint (dec); + + if (!dec->shareable || (int)idx > AvFILLp (dec->shareable)) + ERR ("corrupted CBOR data (sharedref index out of bounds)"); + + sv = SvREFCNT_inc_NN (AvARRAY (dec->shareable)[idx]); + + if (sv == &PL_sv_undef) + ERR ("cyclic CBOR data structure found, but allow_cycles is not enabled"); + } + break; + + case CBOR_TAG_PERL_OBJECT: + { + sv = decode_sv (dec); + + if (!SvROK (sv) || SvTYPE (SvRV (sv)) != SVt_PVAV) + ERR ("corrupted CBOR data (non-array perl object)"); + + AV *av = (AV *)SvRV (sv); + int len = av_len (av) + 1; + HV *stash = gv_stashsv (*av_fetch (av, 0, 1), 0); + + if (!stash) + ERR ("cannot decode perl-object (package does not exist)"); + + GV *method = gv_fetchmethod_autoload (stash, "THAW", 0); + + if (!method) + ERR ("cannot decode perl-object (package does not have a THAW method)"); + + dSP; + + ENTER; SAVETMPS; PUSHMARK (SP); + EXTEND (SP, len + 1); + // we re-bless the reference to get overload and other niceties right + PUSHs (*av_fetch (av, 0, 1)); + PUSHs (sv_cbor); + + int i; + + for (i = 1; i < len; ++i) + PUSHs (*av_fetch (av, i, 1)); + + PUTBACK; + call_sv ((SV *)GvCV (method), G_SCALAR | G_EVAL); + SPAGAIN; + + if (SvTRUE (ERRSV)) + { + FREETMPS; LEAVE; + ERR (SvPVutf8_nolen (sv_2mortal (SvREFCNT_inc (ERRSV)))); + } + + SvREFCNT_dec (sv); + sv = SvREFCNT_inc (POPs); + + PUTBACK; + + FREETMPS; LEAVE; + } + break; + + default: + { + sv = decode_sv (dec); + + dSP; + ENTER; SAVETMPS; PUSHMARK (SP); + EXTEND (SP, 2); + PUSHs (newSVuv (tag)); + PUSHs (sv); + + PUTBACK; + int count = call_sv (dec->cbor.filter ? dec->cbor.filter : default_filter, G_ARRAY | G_EVAL); + SPAGAIN; + + if (SvTRUE (ERRSV)) + { + FREETMPS; LEAVE; + ERR (SvPVutf8_nolen (sv_2mortal (SvREFCNT_inc (ERRSV)))); + } + + if (count) + { + SvREFCNT_dec (sv); + sv = SvREFCNT_inc (POPs); + } + else + { + AV *av = newAV (); + av_push (av, newSVuv (tag)); + av_push (av, sv); + + HV *tagged_stash = !CBOR_SLOW || cbor_tagged_stash + ? cbor_tagged_stash + : gv_stashpv ("CBOR::XS::Tagged" , 1); + sv = sv_bless (newRV_noinc ((SV *)av), tagged_stash); + } + + PUTBACK; + + FREETMPS; LEAVE; + } + break; } + return sv; + fail: SvREFCNT_dec (sv); return &PL_sv_undef; @@ -755,44 +1072,38 @@ { WANT (1); - switch (*dec->cur >> 5) + switch (*dec->cur >> MAJOR_SHIFT) { - case 0: // unsigned int - return newSVuv (decode_uint (dec)); - case 1: // negative int - return newSViv (-1 - (IV)decode_uint (dec)); - case 2: // octet string - return decode_str (dec, 0); - case 3: // utf-8 string - return decode_str (dec, 1); - case 4: // array - return decode_av (dec); - case 5: // map - return decode_hv (dec); - case 6: // tag - return decode_tagged (dec); - case 7: // misc - switch (*dec->cur++ & 31) + case MAJOR_POS_INT >> MAJOR_SHIFT: return newSVuv (decode_uint (dec)); + case MAJOR_NEG_INT >> MAJOR_SHIFT: return newSViv (-1 - (IV)decode_uint (dec)); + case MAJOR_BYTES >> MAJOR_SHIFT: return decode_str (dec, 0); + case MAJOR_TEXT >> MAJOR_SHIFT: return decode_str (dec, 1); + case MAJOR_ARRAY >> MAJOR_SHIFT: return decode_av (dec); + case MAJOR_MAP >> MAJOR_SHIFT: return decode_hv (dec); + case MAJOR_TAG >> MAJOR_SHIFT: return decode_tagged (dec); + + case MAJOR_MISC >> MAJOR_SHIFT: + switch (*dec->cur++ & MINOR_MASK) { - case 20: + case SIMPLE_FALSE: #if CBOR_SLOW types_false = get_bool ("Types::Serialiser::false"); #endif return newSVsv (types_false); - case 21: + case SIMPLE_TRUE: #if CBOR_SLOW types_true = get_bool ("Types::Serialiser::true"); #endif return newSVsv (types_true); - case 22: + case SIMPLE_NULL: return newSVsv (&PL_sv_undef); - case 23: + case SIMPLE_UNDEF: #if CBOR_SLOW types_error = get_bool ("Types::Serialiser::error"); #endif return newSVsv (types_error); - case 25: + case MISC_FLOAT16: { WANT (2); @@ -802,7 +1113,7 @@ return newSVnv (ecb_binary16_to_float (fp)); } - case 26: + case MISC_FLOAT32: { uint32_t fp; WANT (4); @@ -815,7 +1126,7 @@ return newSVnv (ecb_binary32_to_float (fp)); } - case 27: + case MISC_FLOAT64: { uint64_t fp; WANT (8); @@ -828,10 +1139,12 @@ return newSVnv (ecb_binary64_to_double (fp)); } - // 0..19 unassigned - // 24 reserved + unassigned (reserved values are not encodable) + // 0..19 unassigned simple + // 24 reserved + unassigned simple (reserved values are not encodable) + // 28-30 unassigned misc + // 31 break code default: - ERR ("corrupted CBOR data (reserved/unassigned major 7 value)"); + ERR ("corrupted CBOR data (reserved/unassigned/unexpected major 7 value)"); } break; @@ -844,49 +1157,18 @@ static SV * decode_cbor (SV *string, CBOR *cbor, char **offset_return) { - dec_t dec; + dec_t dec = { }; SV *sv; + STRLEN len; + char *data = SvPVbyte (string, len); - /* work around bugs in 5.10 where manipulating magic values - * makes perl ignore the magic in subsequent accesses. - * also make a copy of non-PV values, to get them into a clean - * state (SvPV should do that, but it's buggy, see below). - */ - /*SvGETMAGIC (string);*/ - if (SvMAGICAL (string) || !SvPOK (string)) - string = sv_2mortal (newSVsv (string)); - - SvUPGRADE (string, SVt_PV); - - /* work around a bug in perl 5.10, which causes SvCUR to fail an - * assertion with -DDEBUGGING, although SvCUR is documented to - * return the xpv_cur field which certainly exists after upgrading. - * according to nicholas clark, calling SvPOK fixes this. - * But it doesn't fix it, so try another workaround, call SvPV_nolen - * and hope for the best. - * Damnit, SvPV_nolen still trips over yet another assertion. This - * assertion business is seriously broken, try yet another workaround - * for the broken -DDEBUGGING. - */ - { -#ifdef DEBUGGING - STRLEN offset = SvOK (string) ? sv_len (string) : 0; -#else - STRLEN offset = SvCUR (string); -#endif - - if (offset > cbor->max_size && cbor->max_size) - croak ("attempted decode of CBOR text of %lu bytes size, but max_size is set to %lu", - (unsigned long)SvCUR (string), (unsigned long)cbor->max_size); - } - - sv_utf8_downgrade (string, 0); + if (len > cbor->max_size && cbor->max_size) + croak ("attempted decode of CBOR text of %lu bytes size, but max_size is set to %lu", + (unsigned long)len, (unsigned long)cbor->max_size); dec.cbor = *cbor; - dec.cur = (U8 *)SvPVX (string); - dec.end = (U8 *)SvEND (string); - dec.err = 0; - dec.depth = 0; + dec.cur = (U8 *)data; + dec.end = (U8 *)data + len; sv = decode_sv (&dec); @@ -899,8 +1181,19 @@ if (dec.err) { + if (dec.shareable) + { + // need to break cyclic links, which whould all be in shareable + int i; + SV **svp; + + for (i = av_len (dec.shareable) + 1; i--; ) + if ((svp = av_fetch (dec.shareable, i, 0))) + sv_setsv (*svp, &PL_sv_undef); + } + SvREFCNT_dec (sv); - croak ("%s, at offset %d (octet 0x%02x)", dec.err, dec.cur - (U8 *)SvPVX (string), (int)(uint8_t)*dec.cur); + croak ("%s, at offset %d (octet 0x%02x)", dec.err, dec.cur - (U8 *)data, (int)(uint8_t)*dec.cur); } sv = sv_2mortal (sv); @@ -909,6 +1202,126 @@ } ///////////////////////////////////////////////////////////////////////////// +// incremental parser + +#define INCR_DONE(cbor) (AvFILLp (cbor->incr_count) < 0) + +// returns 0 for notyet, 1 for success or error +static int +incr_parse (CBOR *self, SV *cborstr) +{ + STRLEN cur; + SvPV (cborstr, cur); + + while (ecb_expect_true (self->incr_need <= cur)) + { + // table of integer count bytes + static I8 incr_len[MINOR_MASK + 1] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 1, 2, 4, 8,-1,-1,-1,-2 + }; + + const U8 *p = SvPVX (cborstr) + self->incr_pos; + U8 m = *p & MINOR_MASK; + IV count = SvIVX (AvARRAY (self->incr_count)[AvFILLp (self->incr_count)]); + I8 ilen = incr_len[m]; + + self->incr_need = self->incr_pos + 1; + + if (ecb_expect_false (ilen < 0)) + { + if (m != MINOR_INDEF) + return 1; // error + + if (*p == (MAJOR_MISC | MINOR_INDEF)) + { + if (count >= 0) + return 1; // error + + count = 1; + } + else + { + av_push (self->incr_count, newSViv (-1)); //TODO: nest + count = -1; + } + } + else + { + self->incr_need += ilen; + if (ecb_expect_false (self->incr_need > cur)) + return 0; + + int major = *p >> MAJOR_SHIFT; + + switch (major) + { + case MAJOR_BYTES >> MAJOR_SHIFT: + case MAJOR_TEXT >> MAJOR_SHIFT: + case MAJOR_ARRAY >> MAJOR_SHIFT: + case MAJOR_MAP >> MAJOR_SHIFT: + { + UV len; + + if (ecb_expect_false (ilen)) + { + len = 0; + + do { + len = (len << 8) | *++p; + } while (--ilen); + } + else + len = m; + + switch (major) + { + case MAJOR_BYTES >> MAJOR_SHIFT: + case MAJOR_TEXT >> MAJOR_SHIFT: + self->incr_need += len; + if (ecb_expect_false (self->incr_need > cur)) + return 0; + + break; + + case MAJOR_MAP >> MAJOR_SHIFT: + len <<= 1; + case MAJOR_ARRAY >> MAJOR_SHIFT: + if (len) + { + av_push (self->incr_count, newSViv (len + 1)); //TODO: nest + count = len + 1; + } + break; + } + } + } + } + + self->incr_pos = self->incr_need; + + if (count > 0) + { + while (!--count) + { + if (!AvFILLp (self->incr_count)) + return 1; // done + + SvREFCNT_dec_NN (av_pop (self->incr_count)); + count = SvIVX (AvARRAY (self->incr_count)[AvFILLp (self->incr_count)]); + } + + SvIVX (AvARRAY (self->incr_count)[AvFILLp (self->incr_count)]) = count; + } + } + + return 0; +} + + +///////////////////////////////////////////////////////////////////////////// // XS interface functions MODULE = CBOR::XS PACKAGE = CBOR::XS @@ -925,6 +1338,8 @@ types_false = get_bool ("Types::Serialiser::false"); types_error = get_bool ("Types::Serialiser::error"); + default_filter = newSVpv ("CBOR::XS::default_filter", 0); + sv_cbor = newSVpv ("CBOR", 0); SvREADONLY_on (sv_cbor); } @@ -954,6 +1369,10 @@ ALIAS: shrink = F_SHRINK allow_unknown = F_ALLOW_UNKNOWN + allow_sharing = F_ALLOW_SHARING + allow_cycles = F_ALLOW_CYCLES + pack_strings = F_PACK_STRINGS + validate_utf8 = F_VALIDATE_UTF8 PPCODE: { if (enable) @@ -968,6 +1387,10 @@ ALIAS: get_shrink = F_SHRINK get_allow_unknown = F_ALLOW_UNKNOWN + get_allow_sharing = F_ALLOW_SHARING + get_allow_cycles = F_ALLOW_CYCLES + get_pack_strings = F_PACK_STRINGS + get_validate_utf8 = F_VALIDATE_UTF8 PPCODE: XPUSHs (boolSV (self->flags & ix)); @@ -993,6 +1416,18 @@ OUTPUT: RETVAL +void filter (CBOR *self, SV *filter = 0) + PPCODE: + SvREFCNT_dec (self->filter); + self->filter = filter ? newSVsv (filter) : filter; + XPUSHs (ST (0)); + +SV *get_filter (CBOR *self) + CODE: + RETVAL = self->filter ? self->filter : NEWSV (0, 0); + OUTPUT: + RETVAL + void encode (CBOR *self, SV *scalar) PPCODE: PUTBACK; scalar = encode_cbor (scalar, self); SPAGAIN; @@ -1014,13 +1449,73 @@ PUSHs (sv_2mortal (newSVuv (offset - SvPVX (cborstr)))); } +void incr_parse (CBOR *self, SV *cborstr) + ALIAS: + incr_parse_multiple = 1 + PPCODE: +{ + if (SvUTF8 (cborstr)) + sv_utf8_downgrade (cborstr, 0); + + if (!self->incr_count) + { + self->incr_count = newAV (); + self->incr_pos = 0; + self->incr_need = 1; + + av_push (self->incr_count, newSViv (1)); + } + + do + { + if (!incr_parse (self, cborstr)) + { + if (self->incr_need > self->max_size && self->max_size) + croak ("attempted decode of CBOR text of %lu bytes size, but max_size is set to %lu", + (unsigned long)self->incr_need, (unsigned long)self->max_size); + + break; + } + + SV *sv; + char *offset; + + PUTBACK; sv = decode_cbor (cborstr, self, &offset); SPAGAIN; + XPUSHs (sv); + + sv_chop (cborstr, offset); + + av_clear (self->incr_count); + av_push (self->incr_count, newSViv (1)); + + self->incr_pos = 0; + self->incr_need = self->incr_pos + 1; + } + while (ix); +} + +void incr_reset (CBOR *self) + CODE: +{ + SvREFCNT_dec (self->incr_count); + self->incr_count = 0; +} + +void DESTROY (CBOR *self) + PPCODE: + cbor_free (self); + PROTOTYPES: ENABLE void encode_cbor (SV *scalar) + ALIAS: + encode_cbor = 0 + encode_cbor_sharing = F_ALLOW_SHARING PPCODE: { CBOR cbor; cbor_init (&cbor); + cbor.flags |= ix; PUTBACK; scalar = encode_cbor (scalar, &cbor); SPAGAIN; XPUSHs (scalar); }