--- CBOR-XS/XS.xs 2013/11/20 01:09:46 1.18 +++ CBOR-XS/XS.xs 2013/11/28 09:13:12 1.29 @@ -21,17 +21,20 @@ #ifndef HvNAMEUTF8 # define HvNAMEUTF8(hv) 0 #endif +#ifndef SvREFCNT_dec_NN +# define SvREFCNT_dec_NN(sv) SvREFCNT_dec (sv) +#endif // known tags enum cbor_tag { - // inofficial extensions (pending iana registration) - CBOR_TAG_PERL_OBJECT = 24, // http://cbor.schmorp.de/perl-object - CBOR_TAG_GENERIC_OBJECT = 25, // http://cbor.schmorp.de/generic-object - CBOR_TAG_VALUE_SHARABLE = 26, // http://cbor.schmorp.de/value-sharing - CBOR_TAG_VALUE_SHAREDREF = 27, // http://cbor.schmorp.de/value-sharing - CBOR_TAG_STRINGREF_NAMESPACE = 65537, // http://cbor.schmorp.de/stringref - CBOR_TAG_STRINGREF = 28, // http://cbor.schmorp.de/stringref + // extensions + CBOR_TAG_STRINGREF = 25, // http://cbor.schmorp.de/stringref + CBOR_TAG_PERL_OBJECT = 26, // http://cbor.schmorp.de/perl-object + CBOR_TAG_GENERIC_OBJECT = 27, // http://cbor.schmorp.de/generic-object + CBOR_TAG_VALUE_SHAREABLE = 28, // http://cbor.schmorp.de/value-sharing + CBOR_TAG_VALUE_SHAREDREF = 29, // http://cbor.schmorp.de/value-sharing + CBOR_TAG_STRINGREF_NAMESPACE = 256, // http://cbor.schmorp.de/stringref CBOR_TAG_INDIRECTION = 22098, // http://cbor.schmorp.de/indirection // rfc7049 @@ -56,11 +59,10 @@ CBOR_TAG_MAGIC = 55799 // self-describe cbor }; -#define F_SHRINK 0x00000001UL -#define F_ALLOW_UNKNOWN 0x00000002UL -#define F_ALLOW_SHARING 0x00000004UL //TODO -#define F_DEDUP_STRINGS 0x00000008UL //TODO -#define F_DEDUP_KEYS 0x00000010UL //TODO +#define F_SHRINK 0x00000001UL +#define F_ALLOW_UNKNOWN 0x00000002UL +#define F_ALLOW_SHARING 0x00000004UL //TODO +#define F_ALLOW_STRINGREF 0x00000008UL //TODO #define INIT_SIZE 32 // initial scalar size to be allocated @@ -82,12 +84,13 @@ #endif static HV *cbor_stash, *types_boolean_stash, *types_error_stash, *cbor_tagged_stash; // CBOR::XS:: -static SV *types_true, *types_false, *types_error, *sv_cbor; +static SV *types_true, *types_false, *types_error, *sv_cbor, *default_filter; typedef struct { U32 flags; U32 max_depth; STRLEN max_size; + SV *filter; } CBOR; ecb_inline void @@ -97,6 +100,12 @@ cbor->max_depth = 512; } +ecb_inline void +cbor_free (CBOR *cbor) +{ + SvREFCNT_dec (cbor->filter); +} + ///////////////////////////////////////////////////////////////////////////// // utility functions @@ -126,6 +135,21 @@ } } +// minimum length of a string to be registered for stringref +ecb_inline int +minimum_string_length (UV idx) +{ + return idx > 23 + ? idx > 0xffU + ? idx > 0xffffU + ? idx > 0xffffffffU + ? 11 + : 7 + : 5 + : 4 + : 3; +} + ///////////////////////////////////////////////////////////////////////////// // encoder @@ -137,9 +161,10 @@ SV *sv; // result scalar CBOR cbor; U32 depth; // recursion level - HV *stringref; // string => index, or 0 - HV *sharable; // ptr => index, or 0 - HV *sharable_idx; + HV *stringref[2]; // string => index, or 0 ([0] = bytes, [1] = utf-8) + UV stringref_idx; + HV *shareable; // ptr => index, or 0 + UV shareable_idx; } enc_t; ecb_inline void @@ -201,53 +226,38 @@ } } -static void -encode_str (enc_t *enc, int utf8, char *str, STRLEN len) -{ - encode_uint (enc, utf8 ? 0x60 : 0x40, len); - need (enc, len); - memcpy (enc->cur, str, len); - enc->cur += len; -} - ecb_inline void encode_tag (enc_t *enc, UV tag) { encode_uint (enc, 0xc0, tag); } -static int -encode_sharable2 (enc_t *enc, SV *sv) +static void +encode_str (enc_t *enc, int utf8, char *str, STRLEN len) { - if (!enc->sharable) - enc->sharable = (HV *)sv_2mortal ((SV *)newHV ()); - - SV **svp = hv_fetch (enc->sharable, &sv, sizeof (sv), 1); - - if (SvOK (*svp)) + if (ecb_expect_false (enc->cbor.flags & F_ALLOW_STRINGREF)) { - encode_tag (enc, CBOR_TAG_VALUE_SHAREDREF); - encode_uint (enc, 0x00, SvUV (*svp)); + SV **svp = hv_fetch (enc->stringref[!!utf8], str, len, 1); - return 1; - } - else - { - sv_setuv (*svp, enc->sharable_idx++); - encode_tag (enc, CBOR_TAG_VALUE_SHARABLE); - - return 0; + if (SvOK (*svp)) + { + // already registered, use stringref + encode_tag (enc, CBOR_TAG_STRINGREF); + encode_uint (enc, 0x00, SvUV (*svp)); + return; + } + else if (len >= minimum_string_length (enc->stringref_idx)) + { + // register only + sv_setuv (*svp, enc->stringref_idx); + ++enc->stringref_idx; + } } -} -ecb_inline int -encode_sharable (enc_t *enc, SV *sv) -{ - if (ecb_expect_false (enc->cbor.flags & F_ALLOW_SHARING) - && ecb_expect_false (SvREFCNT (sv) > 1)) - return encode_sharable2 (enc, sv); - - return 0; + encode_uint (enc, utf8 ? 0x60 : 0x40, len); + need (enc, len); + memcpy (enc->cur, str, len); + enc->cur += len; } static void encode_sv (enc_t *enc, SV *sv); @@ -311,13 +321,31 @@ static void encode_rv (enc_t *enc, SV *sv) { - svtype svt; - SvGETMAGIC (sv); - svt = SvTYPE (sv); - if (encode_sharable (enc, sv)) - return; + if (ecb_expect_false (enc->cbor.flags & F_ALLOW_SHARING) + && ecb_expect_false (SvREFCNT (sv) > 1)) + { + if (!enc->shareable) + enc->shareable = (HV *)sv_2mortal ((SV *)newHV ()); + + SV **svp = hv_fetch (enc->shareable, (char *)&sv, sizeof (sv), 1); + + if (SvOK (*svp)) + { + encode_tag (enc, CBOR_TAG_VALUE_SHAREDREF); + encode_uint (enc, 0x00, SvUV (*svp)); + return; + } + else + { + sv_setuv (*svp, enc->shareable_idx); + ++enc->shareable_idx; + encode_tag (enc, CBOR_TAG_VALUE_SHAREABLE); + } + } + + svtype svt = SvTYPE (sv); if (ecb_expect_false (SvOBJECT (sv))) { @@ -454,9 +482,6 @@ { SvGETMAGIC (sv); - if (encode_sharable (enc, sv)) - return; - if (SvPOKp (sv)) { STRLEN len; @@ -496,6 +521,14 @@ enc.end = SvEND (enc.sv); SvPOK_only (enc.sv); + + if (cbor->flags & F_ALLOW_STRINGREF) + { + encode_tag (&enc, CBOR_TAG_STRINGREF_NAMESPACE); + enc.stringref[0]= (HV *)sv_2mortal ((SV *)newHV ()); + enc.stringref[1]= (HV *)sv_2mortal ((SV *)newHV ()); + } + encode_sv (&enc, scalar); SvCUR_set (enc.sv, enc.cur - SvPVX (enc.sv)); @@ -519,7 +552,9 @@ CBOR cbor; U32 depth; // recursion depth U32 maxdepth; // recursion depth limit - AV *sharable; + AV *shareable; + AV *stringref; + SV *decode_tagged; } dec_t; #define ERR(reason) SB if (!dec->err) dec->err = reason; goto fail; SE @@ -627,34 +662,43 @@ decode_he (dec_t *dec, HV *hv) { // for speed reasons, we specialcase single-string - // byte or utf-8 strings as keys. + // byte or utf-8 strings as keys, but only when !stringref - if (*dec->cur >= 0x40 && *dec->cur <= 0x40 + 27) - { - I32 len = decode_uint (dec); - char *key = (char *)dec->cur; + if (ecb_expect_true (!dec->stringref)) + if (*dec->cur >= 0x40 && *dec->cur <= 0x40 + 27) + { + I32 len = decode_uint (dec); + char *key = (char *)dec->cur; - dec->cur += len; + dec->cur += len; - hv_store (hv, key, len, decode_sv (dec), 0); - } - else if (*dec->cur >= 0x60 && *dec->cur <= 0x60 + 27) - { - I32 len = decode_uint (dec); - char *key = (char *)dec->cur; + if (ecb_expect_false (dec->stringref)) + av_push (dec->stringref, newSVpvn (key, len)); - dec->cur += len; + hv_store (hv, key, len, decode_sv (dec), 0); - hv_store (hv, key, -len, decode_sv (dec), 0); - } - else - { - SV *k = decode_sv (dec); - SV *v = decode_sv (dec); + return; + } + else if (*dec->cur >= 0x60 && *dec->cur <= 0x60 + 27) + { + I32 len = decode_uint (dec); + char *key = (char *)dec->cur; - hv_store_ent (hv, k, v, 0); - SvREFCNT_dec (k); - } + dec->cur += len; + + if (ecb_expect_false (dec->stringref)) + av_push (dec->stringref, newSVpvn_utf8 (key, len, 1)); + + hv_store (hv, key, -len, decode_sv (dec), 0); + + return; + } + + SV *k = decode_sv (dec); + SV *v = decode_sv (dec); + + hv_store_ent (hv, k, v, 0); + SvREFCNT_dec (k); } static SV * @@ -730,6 +774,10 @@ WANT (len); sv = newSVpvn (dec->cur, len); dec->cur += len; + + if (ecb_expect_false (dec->stringref) + && SvCUR (sv) >= minimum_string_length (AvFILLp (dec->stringref) + 1)) + av_push (dec->stringref, SvREFCNT_inc_NN (sv)); } if (utf8) @@ -745,40 +793,80 @@ static SV * decode_tagged (dec_t *dec) { + SV *sv = 0; UV tag = decode_uint (dec); - SV *sv = decode_sv (dec); + + WANT (1); switch (tag) { case CBOR_TAG_MAGIC: - return sv; + sv = decode_sv (dec); + break; case CBOR_TAG_INDIRECTION: - return newRV_noinc (sv); + sv = newRV_noinc (decode_sv (dec)); + break; + + case CBOR_TAG_STRINGREF_NAMESPACE: + { + ENTER; SAVETMPS; - case CBOR_TAG_VALUE_SHARABLE: - if (ecb_expect_false (!dec->sharable)) - dec->sharable = (AV *)sv_2mortal ((SV *)newAV ()); + SAVESPTR (dec->stringref); + dec->stringref = (AV *)sv_2mortal ((SV *)newAV ()); - av_push (dec->sharable, SvREFCNT_inc_NN (sv)); + sv = decode_sv (dec); + + FREETMPS; LEAVE; + } + break; + + case CBOR_TAG_STRINGREF: + { + if ((*dec->cur >> 5) != 0) + ERR ("corrupted CBOR data (stringref index not an unsigned integer)"); - return sv; + UV idx = decode_uint (dec); + + if (!dec->stringref || (int)idx > AvFILLp (dec->stringref)) + ERR ("corrupted CBOR data (stringref index out of bounds or outside namespace)"); + + sv = newSVsv (AvARRAY (dec->stringref)[idx]); + } + break; + + case CBOR_TAG_VALUE_SHAREABLE: + { + if (ecb_expect_false (!dec->shareable)) + dec->shareable = (AV *)sv_2mortal ((SV *)newAV ()); + + sv = newSV (0); + av_push (dec->shareable, SvREFCNT_inc_NN (sv)); + + SV *osv = decode_sv (dec); + sv_setsv (sv, osv); + SvREFCNT_dec_NN (osv); + } + break; case CBOR_TAG_VALUE_SHAREDREF: { - // TODO: should verify that the sv atcually was a CBOR unsigned integer - UV idx = SvUV (sv); + if ((*dec->cur >> 5) != 0) + ERR ("corrupted CBOR data (sharedref index not an unsigned integer)"); - if (!dec->sharable || idx > AvFILLp (dec->sharable)) - ERR ("corrupted CBOR data (sharedref index out of bounds)"); + UV idx = decode_uint (dec); - SvREFCNT_dec (sv); + if (!dec->shareable || (int)idx > AvFILLp (dec->shareable)) + ERR ("corrupted CBOR data (sharedref index out of bounds)"); - return SvREFCNT_inc_NN (AvARRAY (dec->sharable)[idx]); + sv = SvREFCNT_inc_NN (AvARRAY (dec->shareable)[idx]); } + break; case CBOR_TAG_PERL_OBJECT: { + sv = decode_sv (dec); + if (!SvROK (sv) || SvTYPE (SvRV (sv)) != SVt_PVAV) ERR ("corrupted CBOR data (non-array perl object)"); @@ -823,24 +911,55 @@ PUTBACK; FREETMPS; LEAVE; - - return sv; } + break; default: { - AV *av = newAV (); - av_push (av, newSVuv (tag)); - av_push (av, sv); - - HV *tagged_stash = !CBOR_SLOW || cbor_tagged_stash - ? cbor_tagged_stash - : gv_stashpv ("CBOR::XS::Tagged" , 1); + sv = decode_sv (dec); - return sv_bless (newRV_noinc ((SV *)av), tagged_stash); + dSP; + ENTER; SAVETMPS; PUSHMARK (SP); + EXTEND (SP, 2); + PUSHs (newSVuv (tag)); + PUSHs (sv); + + PUTBACK; + int count = call_sv (dec->cbor.filter ? dec->cbor.filter : default_filter, G_ARRAY | G_EVAL); + SPAGAIN; + + if (SvTRUE (ERRSV)) + { + FREETMPS; LEAVE; + ERR (SvPVutf8_nolen (sv_2mortal (SvREFCNT_inc (ERRSV)))); + } + + if (count) + { + SvREFCNT_dec (sv); + sv = SvREFCNT_inc (POPs); + } + else + { + AV *av = newAV (); + av_push (av, newSVuv (tag)); + av_push (av, sv); + + HV *tagged_stash = !CBOR_SLOW || cbor_tagged_stash + ? cbor_tagged_stash + : gv_stashpv ("CBOR::XS::Tagged" , 1); + sv = sv_bless (newRV_noinc ((SV *)av), tagged_stash); + } + + PUTBACK; + + FREETMPS; LEAVE; } + break; } + return sv; + fail: SvREFCNT_dec (sv); return &PL_sv_undef; @@ -990,6 +1109,8 @@ types_false = get_bool ("Types::Serialiser::false"); types_error = get_bool ("Types::Serialiser::error"); + default_filter = newSVpv ("CBOR::XS::default_filter", 0); + sv_cbor = newSVpv ("CBOR", 0); SvREADONLY_on (sv_cbor); } @@ -1020,8 +1141,7 @@ shrink = F_SHRINK allow_unknown = F_ALLOW_UNKNOWN allow_sharing = F_ALLOW_SHARING - dedup_keys = F_DEDUP_KEYS - dedup_strings = F_DEDUP_STRINGS + allow_stringref = F_ALLOW_STRINGREF PPCODE: { if (enable) @@ -1037,8 +1157,7 @@ get_shrink = F_SHRINK get_allow_unknown = F_ALLOW_UNKNOWN get_allow_sharing = F_ALLOW_SHARING - get_dedup_keys = F_DEDUP_KEYS - get_dedup_strings = F_DEDUP_STRINGS + get_allow_stringref = F_ALLOW_STRINGREF PPCODE: XPUSHs (boolSV (self->flags & ix)); @@ -1064,6 +1183,18 @@ OUTPUT: RETVAL +void filter (CBOR *self, SV *filter = 0) + PPCODE: + SvREFCNT_dec (self->filter); + self->filter = filter ? newSVsv (filter) : filter; + XPUSHs (ST (0)); + +SV *get_filter (CBOR *self) + CODE: + RETVAL = self->filter ? self->filter : NEWSV (0, 0); + OUTPUT: + RETVAL + void encode (CBOR *self, SV *scalar) PPCODE: PUTBACK; scalar = encode_cbor (scalar, self); SPAGAIN; @@ -1085,6 +1216,10 @@ PUSHs (sv_2mortal (newSVuv (offset - SvPVX (cborstr)))); } +void DESTROY (CBOR *self) + PPCODE: + cbor_free (self); + PROTOTYPES: ENABLE void encode_cbor (SV *scalar)