ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/CBOR-XS/XS.pm
Revision: 1.39
Committed: Tue Dec 10 15:31:40 2013 UTC (10 years, 5 months ago) by root
Branch: MAIN
Changes since 1.38: +64 -0 lines
Log Message:
*** empty log message ***

File Contents

# User Rev Content
1 root 1.1 =head1 NAME
2    
3     CBOR::XS - Concise Binary Object Representation (CBOR, RFC7049)
4    
5     =encoding utf-8
6    
7     =head1 SYNOPSIS
8    
9     use CBOR::XS;
10    
11     $binary_cbor_data = encode_cbor $perl_value;
12     $perl_value = decode_cbor $binary_cbor_data;
13    
14     # OO-interface
15    
16     $coder = CBOR::XS->new;
17 root 1.6 $binary_cbor_data = $coder->encode ($perl_value);
18     $perl_value = $coder->decode ($binary_cbor_data);
19    
20     # prefix decoding
21    
22     my $many_cbor_strings = ...;
23     while (length $many_cbor_strings) {
24     my ($data, $length) = $coder->decode_prefix ($many_cbor_strings);
25     # data was decoded
26     substr $many_cbor_strings, 0, $length, ""; # remove decoded cbor string
27     }
28 root 1.1
29     =head1 DESCRIPTION
30    
31 root 1.5 This module converts Perl data structures to the Concise Binary Object
32     Representation (CBOR) and vice versa. CBOR is a fast binary serialisation
33 root 1.28 format that aims to use an (almost) superset of the JSON data model, i.e.
34     when you can represent something useful in JSON, you should be able to
35     represent it in CBOR.
36 root 1.1
37 root 1.28 In short, CBOR is a faster and quite compact binary alternative to JSON,
38 root 1.10 with the added ability of supporting serialisation of Perl objects. (JSON
39     often compresses better than CBOR though, so if you plan to compress the
40 root 1.28 data later and speed is less important you might want to compare both
41     formats first).
42 root 1.5
43 root 1.15 To give you a general idea about speed, with texts in the megabyte range,
44     C<CBOR::XS> usually encodes roughly twice as fast as L<Storable> or
45     L<JSON::XS> and decodes about 15%-30% faster than those. The shorter the
46     data, the worse L<Storable> performs in comparison.
47    
48 root 1.28 Regarding compactness, C<CBOR::XS>-encoded data structures are usually
49     about 20% smaller than the same data encoded as (compact) JSON or
50     L<Storable>.
51    
52     In addition to the core CBOR data format, this module implements a
53 root 1.31 number of extensions, to support cyclic and shared data structures
54     (see C<allow_sharing> and C<allow_cycles>), string deduplication (see
55     C<pack_strings>) and scalar references (always enabled).
56 root 1.21
57 root 1.5 The primary goal of this module is to be I<correct> and the secondary goal
58     is to be I<fast>. To reach the latter goal it was written in C.
59 root 1.1
60     See MAPPING, below, on how CBOR::XS maps perl values to CBOR values and
61     vice versa.
62    
63     =cut
64    
65     package CBOR::XS;
66    
67     use common::sense;
68    
69 root 1.38 our $VERSION = 1.12;
70 root 1.1 our @ISA = qw(Exporter);
71    
72     our @EXPORT = qw(encode_cbor decode_cbor);
73    
74     use Exporter;
75     use XSLoader;
76    
77 root 1.6 use Types::Serialiser;
78    
79 root 1.3 our $MAGIC = "\xd9\xd9\xf7";
80    
81 root 1.1 =head1 FUNCTIONAL INTERFACE
82    
83     The following convenience methods are provided by this module. They are
84     exported by default:
85    
86     =over 4
87    
88     =item $cbor_data = encode_cbor $perl_scalar
89    
90     Converts the given Perl data structure to CBOR representation. Croaks on
91     error.
92    
93     =item $perl_scalar = decode_cbor $cbor_data
94    
95     The opposite of C<encode_cbor>: expects a valid CBOR string to parse,
96     returning the resulting perl scalar. Croaks on error.
97    
98     =back
99    
100    
101     =head1 OBJECT-ORIENTED INTERFACE
102    
103     The object oriented interface lets you configure your own encoding or
104     decoding style, within the limits of supported formats.
105    
106     =over 4
107    
108     =item $cbor = new CBOR::XS
109    
110     Creates a new CBOR::XS object that can be used to de/encode CBOR
111     strings. All boolean flags described below are by default I<disabled>.
112    
113     The mutators for flags all return the CBOR object again and thus calls can
114     be chained:
115    
116     my $cbor = CBOR::XS->new->encode ({a => [1,2]});
117    
118     =item $cbor = $cbor->max_depth ([$maximum_nesting_depth])
119    
120     =item $max_depth = $cbor->get_max_depth
121    
122     Sets the maximum nesting level (default C<512>) accepted while encoding
123     or decoding. If a higher nesting level is detected in CBOR data or a Perl
124     data structure, then the encoder and decoder will stop and croak at that
125     point.
126    
127     Nesting level is defined by number of hash- or arrayrefs that the encoder
128     needs to traverse to reach a given point or the number of C<{> or C<[>
129     characters without their matching closing parenthesis crossed to reach a
130     given character in a string.
131    
132     Setting the maximum depth to one disallows any nesting, so that ensures
133     that the object is only a single hash/object or array.
134    
135     If no argument is given, the highest possible setting will be used, which
136     is rarely useful.
137    
138     Note that nesting is implemented by recursion in C. The default value has
139     been chosen to be as large as typical operating systems allow without
140     crashing.
141    
142     See SECURITY CONSIDERATIONS, below, for more info on why this is useful.
143    
144     =item $cbor = $cbor->max_size ([$maximum_string_size])
145    
146     =item $max_size = $cbor->get_max_size
147    
148     Set the maximum length a CBOR string may have (in bytes) where decoding
149     is being attempted. The default is C<0>, meaning no limit. When C<decode>
150     is called on a string that is longer than this many bytes, it will not
151     attempt to decode the string but throw an exception. This setting has no
152     effect on C<encode> (yet).
153    
154     If no argument is given, the limit check will be deactivated (same as when
155     C<0> is specified).
156    
157     See SECURITY CONSIDERATIONS, below, for more info on why this is useful.
158    
159 root 1.19 =item $cbor = $cbor->allow_unknown ([$enable])
160    
161     =item $enabled = $cbor->get_allow_unknown
162    
163     If C<$enable> is true (or missing), then C<encode> will I<not> throw an
164     exception when it encounters values it cannot represent in CBOR (for
165     example, filehandles) but instead will encode a CBOR C<error> value.
166    
167     If C<$enable> is false (the default), then C<encode> will throw an
168     exception when it encounters anything it cannot encode as CBOR.
169    
170     This option does not affect C<decode> in any way, and it is recommended to
171     leave it off unless you know your communications partner.
172    
173 root 1.20 =item $cbor = $cbor->allow_sharing ([$enable])
174 root 1.19
175 root 1.20 =item $enabled = $cbor->get_allow_sharing
176 root 1.19
177     If C<$enable> is true (or missing), then C<encode> will not double-encode
178 root 1.20 values that have been referenced before (e.g. when the same object, such
179     as an array, is referenced multiple times), but instead will emit a
180     reference to the earlier value.
181 root 1.19
182     This means that such values will only be encoded once, and will not result
183     in a deep cloning of the value on decode, in decoders supporting the value
184 root 1.25 sharing extension. This also makes it possible to encode cyclic data
185 root 1.31 structures (which need C<allow_cycles> to be enabled to be decoded by this
186     module).
187 root 1.19
188 root 1.21 It is recommended to leave it off unless you know your
189     communication partner supports the value sharing extensions to CBOR
190 root 1.26 (L<http://cbor.schmorp.de/value-sharing>), as without decoder support, the
191 root 1.25 resulting data structure might be unusable.
192 root 1.21
193 root 1.19 Detecting shared values incurs a runtime overhead when values are encoded
194     that have a reference counter larger than one, and might unnecessarily
195     increase the encoded size, as potentially shared values are encoded as
196 root 1.31 shareable whether or not they are actually shared.
197 root 1.19
198 root 1.20 At the moment, only targets of references can be shared (e.g. scalars,
199     arrays or hashes pointed to by a reference). Weirder constructs, such as
200     an array with multiple "copies" of the I<same> string, which are hard but
201     not impossible to create in Perl, are not supported (this is the same as
202 root 1.25 with L<Storable>).
203 root 1.19
204 root 1.25 If C<$enable> is false (the default), then C<encode> will encode shared
205     data structures repeatedly, unsharing them in the process. Cyclic data
206     structures cannot be encoded in this mode.
207 root 1.19
208     This option does not affect C<decode> in any way - shared values and
209 root 1.21 references will always be decoded properly if present.
210    
211 root 1.31 =item $cbor = $cbor->allow_cycles ([$enable])
212    
213     =item $enabled = $cbor->get_allow_cycles
214    
215     If C<$enable> is true (or missing), then C<decode> will happily decode
216     self-referential (cyclic) data structures. By default these will not be
217     decoded, as they need manual cleanup to avoid memory leaks, so code that
218     isn't prepared for this will not leak memory.
219    
220     If C<$enable> is false (the default), then C<decode> will throw an error
221     when it encounters a self-referential/cyclic data structure.
222    
223     This option does not affect C<encode> in any way - shared values and
224     references will always be encoded properly if present.
225    
226 root 1.25 =item $cbor = $cbor->pack_strings ([$enable])
227 root 1.21
228 root 1.25 =item $enabled = $cbor->get_pack_strings
229 root 1.21
230     If C<$enable> is true (or missing), then C<encode> will try not to encode
231     the same string twice, but will instead encode a reference to the string
232 root 1.25 instead. Depending on your data format, this can save a lot of space, but
233 root 1.21 also results in a very large runtime overhead (expect encoding times to be
234     2-4 times as high as without).
235    
236     It is recommended to leave it off unless you know your
237     communications partner supports the stringref extension to CBOR
238 root 1.26 (L<http://cbor.schmorp.de/stringref>), as without decoder support, the
239 root 1.25 resulting data structure might not be usable.
240 root 1.21
241 root 1.25 If C<$enable> is false (the default), then C<encode> will encode strings
242     the standard CBOR way.
243 root 1.21
244     This option does not affect C<decode> in any way - string references will
245     always be decoded properly if present.
246 root 1.19
247 root 1.33 =item $cbor = $cbor->validate_utf8 ([$enable])
248    
249     =item $enabled = $cbor->get_validate_utf8
250    
251     If C<$enable> is true (or missing), then C<decode> will validate that
252     elements (text strings) containing UTF-8 data in fact contain valid UTF-8
253     data (instead of blindly accepting it). This validation obviously takes
254     extra time during decoding.
255    
256     The concept of "valid UTF-8" used is perl's concept, which is a superset
257     of the official UTF-8.
258    
259     If C<$enable> is false (the default), then C<decode> will blindly accept
260     UTF-8 data, marking them as valid UTF-8 in the resulting data structure
261     regardless of whether that's true or not.
262    
263     Perl isn't too happy about corrupted UTF-8 in strings, but should
264     generally not crash or do similarly evil things. Extensions might be not
265     so forgiving, so it's recommended to turn on this setting if you receive
266     untrusted CBOR.
267    
268     This option does not affect C<encode> in any way - strings that are
269     supposedly valid UTF-8 will simply be dumped into the resulting CBOR
270     string without checking whether that is, in fact, true or not.
271    
272 root 1.23 =item $cbor = $cbor->filter ([$cb->($tag, $value)])
273    
274     =item $cb_or_undef = $cbor->get_filter
275    
276 root 1.24 Sets or replaces the tagged value decoding filter (when C<$cb> is
277     specified) or clears the filter (if no argument or C<undef> is provided).
278    
279     The filter callback is called only during decoding, when a non-enforced
280     tagged value has been decoded (see L<TAG HANDLING AND EXTENSIONS> for a
281     list of enforced tags). For specific tags, it's often better to provide a
282     default converter using the C<%CBOR::XS::FILTER> hash (see below).
283    
284     The first argument is the numerical tag, the second is the (decoded) value
285     that has been tagged.
286    
287     The filter function should return either exactly one value, which will
288     replace the tagged value in the decoded data structure, or no values,
289     which will result in default handling, which currently means the decoder
290     creates a C<CBOR::XS::Tagged> object to hold the tag and the value.
291    
292     When the filter is cleared (the default state), the default filter
293     function, C<CBOR::XS::default_filter>, is used. This function simply looks
294     up the tag in the C<%CBOR::XS::FILTER> hash. If an entry exists it must be
295     a code reference that is called with tag and value, and is responsible for
296     decoding the value. If no entry exists, it returns no values.
297    
298 root 1.28 Example: decode all tags not handled internally into C<CBOR::XS::Tagged>
299 root 1.24 objects, with no other special handling (useful when working with
300     potentially "unsafe" CBOR data).
301    
302     CBOR::XS->new->filter (sub { })->decode ($cbor_data);
303    
304     Example: provide a global filter for tag 1347375694, converting the value
305     into some string form.
306    
307     $CBOR::XS::FILTER{1347375694} = sub {
308     my ($tag, $value) = @_;
309    
310     "tag 1347375694 value $value"
311     };
312 root 1.23
313 root 1.1 =item $cbor_data = $cbor->encode ($perl_scalar)
314    
315     Converts the given Perl data structure (a scalar value) to its CBOR
316     representation.
317    
318     =item $perl_scalar = $cbor->decode ($cbor_data)
319    
320     The opposite of C<encode>: expects CBOR data and tries to parse it,
321     returning the resulting simple scalar or reference. Croaks on error.
322    
323     =item ($perl_scalar, $octets) = $cbor->decode_prefix ($cbor_data)
324    
325     This works like the C<decode> method, but instead of raising an exception
326     when there is trailing garbage after the CBOR string, it will silently
327     stop parsing there and return the number of characters consumed so far.
328    
329     This is useful if your CBOR texts are not delimited by an outer protocol
330     and you need to know where the first CBOR string ends and the next one
331     starts.
332    
333     CBOR::XS->new->decode_prefix ("......")
334     => ("...", 3)
335    
336     =back
337    
338 root 1.39 =head2 INCREMENTAL PARSING
339    
340     In some cases, there is the need for incremental parsing of CBOR
341     texts. While this module always has to keep both CBOR text and resulting
342     Perl data structure in memory at one time, it does allow you to parse a
343     CBOR stream incrementally, using a mechanism similar to "decode_prefix" to see
344     if a full CBOR object is available, but one that is much more efficient.
345    
346     It basically works by parsing as much of a CBOR string as possible - if
347     the CBOR data is not complete yet, the parser will remember where it was,
348     to be able to restart when more data has been accumulated. Once enough
349     data is available to either decode a complete CBOR value or raise an
350     error, a real decode will be attempted.
351    
352     A typical use case would be a network protocol that consists of sending
353     and receiving CBOR-encoded messages. The solution that works with CBOR and
354     about anything else is by prepending a length to every CBOR value, so the
355     receiver knows how many octets to read. More compact (and slightly slower)
356     would be to just send CBOR values back-to-back, as C<CBOR::XS> knows where
357     a CBOR value ends, and doesn't need an explicit length.
358    
359     The following methods help with this:
360    
361     =over 4
362    
363     =item @decoded = $cbor->incr_parse ($buffer)
364    
365     This method attempts to decode exactly one CBOR value from the beginning
366     of the given C<$buffer>. The value is removed from the C<$buffer> on
367     success. When C<$buffer> doesn't contain a complete value yet, it returns
368     nothing. Finally, when the C<$buffer> doesn't start with something
369     that could ever be a valid CBOR value, it raises an exception, just as
370     C<decode> would. In the latter case the decoder state is undefined and
371     must be reset before being able to parse further.
372    
373     This method modifies the C<$buffer> in place. When no CBOR value can be
374     decoded, the decoder stores the current string offset. On the next call,
375     continues decoding at the place where it stopped before. For this to make
376     sense, the C<$buffer> must begin with the same octets as on previous
377     unsuccessful calls.
378    
379     You can call this method in scalar context, in which case it either
380     returns a decoded value or C<undef>. This makes it impossible to
381     distinguish between CBOR null values (which decode to C<undef>) and an
382     unsuccessful decode, which is often acceptable.
383    
384     =item @decoded = $cbor->incr_parse_multiple ($buffer)
385    
386     Same as C<incr_parse>, but attempts to decode as many CBOR values as
387     possible in one go, instead of at most one. Calls to C<incr_parse> and
388     C<incr_parse_multiple> can be interleaved.
389    
390     =item $cbor->incr_reset
391    
392     Resets the incremental decoder. This throws away any saved state, so that
393     subsequent calls to C<incr_parse> or C<incr_parse_multiple> start to parse
394     a new CBOR value from the beginning of the C<$buffer> again.
395    
396     This method can be called at any time, but it I<must> be called if you want
397     to change your C<$buffer> or there was a decoding error and you want to
398     reuse the C<$cbor> object for future incremental parsings.
399    
400     =back
401    
402 root 1.1
403     =head1 MAPPING
404    
405     This section describes how CBOR::XS maps Perl values to CBOR values and
406     vice versa. These mappings are designed to "do the right thing" in most
407     circumstances automatically, preserving round-tripping characteristics
408     (what you put in comes out as something equivalent).
409    
410     For the more enlightened: note that in the following descriptions,
411     lowercase I<perl> refers to the Perl interpreter, while uppercase I<Perl>
412     refers to the abstract Perl language itself.
413    
414    
415     =head2 CBOR -> PERL
416    
417     =over 4
418    
419 root 1.4 =item integers
420    
421     CBOR integers become (numeric) perl scalars. On perls without 64 bit
422     support, 64 bit integers will be truncated or otherwise corrupted.
423    
424     =item byte strings
425    
426 root 1.27 Byte strings will become octet strings in Perl (the Byte values 0..255
427 root 1.4 will simply become characters of the same value in Perl).
428    
429     =item UTF-8 strings
430    
431     UTF-8 strings in CBOR will be decoded, i.e. the UTF-8 octets will be
432     decoded into proper Unicode code points. By default, the validity of
433     the UTF-8 octets is not checked (see C<validate_utf8>) - corrupt input will
434     result in corrupted Perl strings.
435    
436     =item arrays, maps
437    
438     CBOR arrays and CBOR maps will be converted into references to a Perl
439     array or hash, respectively. The keys of the map will be stringified
440     during this process.
441    
442 root 1.6 =item null
443    
444     CBOR null becomes C<undef> in Perl.
445    
446     =item true, false, undefined
447 root 1.1
448 root 1.6 These CBOR values become C<Types::Serialiser::true>,
449     C<Types::Serialiser::false> and C<Types::Serialiser::error>,
450 root 1.1 respectively. They are overloaded to act almost exactly like the numbers
451 root 1.6 C<1> and C<0> (for true and false) or to throw an exception on access (for
452     error). See the L<Types::Serialiser> manpage for details.
453    
454 root 1.23 =item tagged values
455 root 1.1
456 root 1.23 Tagged items consist of a numeric tag and another CBOR value.
457 root 1.4
458 root 1.23 See L<TAG HANDLING AND EXTENSIONS> and the description of C<< ->filter >>
459 root 1.28 for details on which tags are handled how.
460 root 1.4
461     =item anything else
462    
463     Anything else (e.g. unsupported simple values) will raise a decoding
464     error.
465 root 1.1
466     =back
467    
468    
469     =head2 PERL -> CBOR
470    
471     The mapping from Perl to CBOR is slightly more difficult, as Perl is a
472 root 1.28 typeless language. That means this module can only guess which CBOR type
473     is meant by a perl value.
474 root 1.1
475     =over 4
476    
477     =item hash references
478    
479 root 1.4 Perl hash references become CBOR maps. As there is no inherent ordering in
480     hash keys (or CBOR maps), they will usually be encoded in a pseudo-random
481 root 1.28 order. This order can be different each time a hash is encoded.
482 root 1.4
483     Currently, tied hashes will use the indefinite-length format, while normal
484     hashes will use the fixed-length format.
485 root 1.1
486     =item array references
487    
488 root 1.4 Perl array references become fixed-length CBOR arrays.
489 root 1.1
490     =item other references
491    
492 root 1.28 Other unblessed references will be represented using
493     the indirection tag extension (tag value C<22098>,
494     L<http://cbor.schmorp.de/indirection>). CBOR decoders are guaranteed
495     to be able to decode these values somehow, by either "doing the right
496     thing", decoding into a generic tagged object, simply ignoring the tag, or
497     something else.
498 root 1.4
499     =item CBOR::XS::Tagged objects
500    
501     Objects of this type must be arrays consisting of a single C<[tag, value]>
502 root 1.13 pair. The (numerical) tag will be encoded as a CBOR tag, the value will
503 root 1.28 be encoded as appropriate for the value. You must use C<CBOR::XS::tag> to
504 root 1.13 create such objects.
505 root 1.1
506 root 1.6 =item Types::Serialiser::true, Types::Serialiser::false, Types::Serialiser::error
507 root 1.1
508 root 1.6 These special values become CBOR true, CBOR false and CBOR undefined
509     values, respectively. You can also use C<\1>, C<\0> and C<\undef> directly
510     if you want.
511 root 1.1
512 root 1.7 =item other blessed objects
513 root 1.1
514 root 1.7 Other blessed objects are serialised via C<TO_CBOR> or C<FREEZE>. See
515 root 1.23 L<TAG HANDLING AND EXTENSIONS> for specific classes handled by this
516     module, and L<OBJECT SERIALISATION> for generic object serialisation.
517 root 1.1
518     =item simple scalars
519    
520     Simple Perl scalars (any scalar that is not a reference) are the most
521     difficult objects to encode: CBOR::XS will encode undefined scalars as
522 root 1.4 CBOR null values, scalars that have last been used in a string context
523 root 1.1 before encoding as CBOR strings, and anything else as number value:
524    
525     # dump as number
526     encode_cbor [2] # yields [2]
527     encode_cbor [-3.0e17] # yields [-3e+17]
528     my $value = 5; encode_cbor [$value] # yields [5]
529    
530 root 1.27 # used as string, so dump as string (either byte or text)
531 root 1.1 print $value;
532     encode_cbor [$value] # yields ["5"]
533    
534     # undef becomes null
535     encode_cbor [undef] # yields [null]
536    
537     You can force the type to be a CBOR string by stringifying it:
538    
539     my $x = 3.1; # some variable containing a number
540     "$x"; # stringified
541     $x .= ""; # another, more awkward way to stringify
542     print $x; # perl does it for you, too, quite often
543    
544 root 1.27 You can force whether a string is encoded as byte or text string by using
545     C<utf8::upgrade> and C<utf8::downgrade>:
546    
547     utf8::upgrade $x; # encode $x as text string
548     utf8::downgrade $x; # encode $x as byte string
549    
550     Perl doesn't define what operations up- and downgrade strings, so if the
551     difference between byte and text is important, you should up- or downgrade
552     your string as late as possible before encoding.
553    
554 root 1.1 You can force the type to be a CBOR number by numifying it:
555    
556     my $x = "3"; # some variable containing a string
557     $x += 0; # numify it, ensuring it will be dumped as a number
558     $x *= 1; # same thing, the choice is yours.
559    
560     You can not currently force the type in other, less obscure, ways. Tell me
561     if you need this capability (but don't forget to explain why it's needed
562     :).
563    
564 root 1.4 Perl values that seem to be integers generally use the shortest possible
565     representation. Floating-point values will use either the IEEE single
566     format if possible without loss of precision, otherwise the IEEE double
567     format will be used. Perls that use formats other than IEEE double to
568     represent numerical values are supported, but might suffer loss of
569     precision.
570 root 1.1
571     =back
572    
573 root 1.7 =head2 OBJECT SERIALISATION
574    
575 root 1.29 This module implements both a CBOR-specific and the generic
576     L<Types::Serialiser> object serialisation protocol. The following
577     subsections explain both methods.
578    
579     =head3 ENCODING
580    
581 root 1.7 This module knows two ways to serialise a Perl object: The CBOR-specific
582     way, and the generic way.
583    
584 root 1.29 Whenever the encoder encounters a Perl object that it cannot serialise
585 root 1.7 directly (most of them), it will first look up the C<TO_CBOR> method on
586     it.
587    
588     If it has a C<TO_CBOR> method, it will call it with the object as only
589     argument, and expects exactly one return value, which it will then
590     substitute and encode it in the place of the object.
591    
592     Otherwise, it will look up the C<FREEZE> method. If it exists, it will
593     call it with the object as first argument, and the constant string C<CBOR>
594     as the second argument, to distinguish it from other serialisers.
595    
596     The C<FREEZE> method can return any number of values (i.e. zero or
597     more). These will be encoded as CBOR perl object, together with the
598     classname.
599    
600 root 1.29 These methods I<MUST NOT> change the data structure that is being
601     serialised. Failure to comply to this can result in memory corruption -
602     and worse.
603    
604 root 1.7 If an object supports neither C<TO_CBOR> nor C<FREEZE>, encoding will fail
605     with an error.
606    
607 root 1.29 =head3 DECODING
608    
609     Objects encoded via C<TO_CBOR> cannot (normally) be automatically decoded,
610     but objects encoded via C<FREEZE> can be decoded using the following
611     protocol:
612 root 1.7
613     When an encoded CBOR perl object is encountered by the decoder, it will
614     look up the C<THAW> method, by using the stored classname, and will fail
615     if the method cannot be found.
616    
617     After the lookup it will call the C<THAW> method with the stored classname
618     as first argument, the constant string C<CBOR> as second argument, and all
619     values returned by C<FREEZE> as remaining arguments.
620    
621 root 1.29 =head3 EXAMPLES
622 root 1.7
623     Here is an example C<TO_CBOR> method:
624    
625     sub My::Object::TO_CBOR {
626     my ($obj) = @_;
627    
628     ["this is a serialised My::Object object", $obj->{id}]
629     }
630    
631     When a C<My::Object> is encoded to CBOR, it will instead encode a simple
632     array with two members: a string, and the "object id". Decoding this CBOR
633     string will yield a normal perl array reference in place of the object.
634    
635     A more useful and practical example would be a serialisation method for
636     the URI module. CBOR has a custom tag value for URIs, namely 32:
637    
638     sub URI::TO_CBOR {
639     my ($self) = @_;
640     my $uri = "$self"; # stringify uri
641     utf8::upgrade $uri; # make sure it will be encoded as UTF-8 string
642 root 1.28 CBOR::XS::tag 32, $uri
643 root 1.7 }
644    
645     This will encode URIs as a UTF-8 string with tag 32, which indicates an
646     URI.
647    
648     Decoding such an URI will not (currently) give you an URI object, but
649     instead a CBOR::XS::Tagged object with tag number 32 and the string -
650     exactly what was returned by C<TO_CBOR>.
651    
652     To serialise an object so it can automatically be deserialised, you need
653     to use C<FREEZE> and C<THAW>. To take the URI module as example, this
654     would be a possible implementation:
655    
656     sub URI::FREEZE {
657     my ($self, $serialiser) = @_;
658     "$self" # encode url string
659     }
660    
661     sub URI::THAW {
662     my ($class, $serialiser, $uri) = @_;
663    
664     $class->new ($uri)
665     }
666    
667     Unlike C<TO_CBOR>, multiple values can be returned by C<FREEZE>. For
668     example, a C<FREEZE> method that returns "type", "id" and "variant" values
669     would cause an invocation of C<THAW> with 5 arguments:
670    
671     sub My::Object::FREEZE {
672     my ($self, $serialiser) = @_;
673    
674     ($self->{type}, $self->{id}, $self->{variant})
675     }
676    
677     sub My::Object::THAW {
678     my ($class, $serialiser, $type, $id, $variant) = @_;
679    
680     $class->new (type => $type, id => $id, variant => $variant)
681     }
682    
683 root 1.1
684 root 1.7 =head1 MAGIC HEADER
685 root 1.3
686     There is no way to distinguish CBOR from other formats
687     programmatically. To make it easier to distinguish CBOR from other
688     formats, the CBOR specification has a special "magic string" that can be
689 root 1.18 prepended to any CBOR string without changing its meaning.
690 root 1.3
691     This string is available as C<$CBOR::XS::MAGIC>. This module does not
692 root 1.18 prepend this string to the CBOR data it generates, but it will ignore it
693 root 1.3 if present, so users can prepend this string as a "file type" indicator as
694     required.
695    
696    
697 root 1.12 =head1 THE CBOR::XS::Tagged CLASS
698    
699     CBOR has the concept of tagged values - any CBOR value can be tagged with
700     a numeric 64 bit number, which are centrally administered.
701    
702     C<CBOR::XS> handles a few tags internally when en- or decoding. You can
703     also create tags yourself by encoding C<CBOR::XS::Tagged> objects, and the
704     decoder will create C<CBOR::XS::Tagged> objects itself when it hits an
705     unknown tag.
706    
707     These objects are simply blessed array references - the first member of
708     the array being the numerical tag, the second being the value.
709    
710     You can interact with C<CBOR::XS::Tagged> objects in the following ways:
711    
712     =over 4
713    
714     =item $tagged = CBOR::XS::tag $tag, $value
715    
716     This function(!) creates a new C<CBOR::XS::Tagged> object using the given
717     C<$tag> (0..2**64-1) to tag the given C<$value> (which can be any Perl
718     value that can be encoded in CBOR, including serialisable Perl objects and
719     C<CBOR::XS::Tagged> objects).
720    
721     =item $tagged->[0]
722    
723     =item $tagged->[0] = $new_tag
724    
725     =item $tag = $tagged->tag
726    
727     =item $new_tag = $tagged->tag ($new_tag)
728    
729     Access/mutate the tag.
730    
731     =item $tagged->[1]
732    
733     =item $tagged->[1] = $new_value
734    
735     =item $value = $tagged->value
736    
737     =item $new_value = $tagged->value ($new_value)
738    
739     Access/mutate the tagged value.
740    
741     =back
742    
743     =cut
744    
# Functional constructor for tagged values: copies its two arguments
# ($tag, $value) into a fresh array and blesses that array into
# CBOR::XS::Tagged. The ($$) prototype fixes the argument count at two and
# evaluates both in scalar context.
sub tag($$) {
   my @pair = @_;

   bless \@pair, "CBOR::XS::Tagged";
}
748    
# Combined getter/setter for the tag number (array slot 0). When a second
# argument is passed it is stored first; the (possibly new) tag is returned.
sub CBOR::XS::Tagged::tag {
   my $tagged = $_[0];

   $tagged->[0] = $_[1]
      if @_ > 1;

   $tagged->[0]
}
753    
# Combined getter/setter for the tagged value (array slot 1). When a second
# argument is passed it is stored first; the (possibly new) value is returned.
sub CBOR::XS::Tagged::value {
   my $tagged = $_[0];

   $tagged->[1] = $_[1]
      if @_ > 1;

   $tagged->[1]
}
758    
759 root 1.13 =head2 EXAMPLES
760    
761     Here are some examples of C<CBOR::XS::Tagged> uses to tag objects.
762    
763     You can look up CBOR tag values and meanings in the IANA registry at
764     L<http://www.iana.org/assignments/cbor-tags/cbor-tags.xhtml>.
765    
766     Prepend a magic header (C<$CBOR::XS::MAGIC>):
767    
768     my $cbor = encode_cbor CBOR::XS::tag 55799, $value;
769     # same as:
770     my $cbor = $CBOR::XS::MAGIC . encode_cbor $value;
771    
772     Serialise some URIs and a regex in an array:
773    
774     my $cbor = encode_cbor [
775     (CBOR::XS::tag 32, "http://www.nethype.de/"),
776     (CBOR::XS::tag 32, "http://software.schmorp.de/"),
777     (CBOR::XS::tag 35, "^[Pp][Ee][Rr][lL]\$"),
778     ];
779    
780     Wrap CBOR data in CBOR:
781    
782     my $cbor_cbor = encode_cbor
783     CBOR::XS::tag 24,
784     encode_cbor [1, 2, 3];
785    
786 root 1.19 =head1 TAG HANDLING AND EXTENSIONS
787    
788 root 1.22 This section describes how this module handles specific tagged values
789     and extensions. If a tag is not mentioned here and no additional filters
790     are provided for it, then the default handling applies (creating a
791     CBOR::XS::Tagged object on decoding, and only encoding the tag when
792     explicitly requested).
793 root 1.19
794 root 1.23 Tags not handled specifically are currently converted into a
795     L<CBOR::XS::Tagged> object, which is simply a blessed array reference
796     consisting of the numeric tag value followed by the (decoded) CBOR value.
797    
798 root 1.19 Future versions of this module reserve the right to special case
799 root 1.22 additional tags (such as base64url).
800    
801     =head2 ENFORCED TAGS
802    
803     These tags are always handled when decoding, and their handling cannot be
804     overridden by the user.
805 root 1.19
806     =over 4
807    
808 root 1.26 =item 26 (perl-object, L<http://cbor.schmorp.de/perl-object>)
809 root 1.19
810 root 1.23 These tags are automatically created (and decoded) for serialisable
811     objects using the C<FREEZE/THAW> methods (the L<Types::Serialiser> object
812     serialisation protocol). See L<OBJECT SERIALISATION> for details.
813 root 1.19
814 root 1.31 =item 28, 29 (shareable, sharedref, L<http://cbor.schmorp.de/value-sharing>)
815 root 1.19
816 root 1.31 These tags are automatically decoded when encountered (and they do not
817     result in a cyclic data structure, see C<allow_cycles>), resulting in
818 root 1.19 shared values in the decoded object. They are only encoded, however, when
819 root 1.31 C<allow_sharing> is enabled.
820    
821     Not all shared values can be successfully decoded: values that reference
822     themselves will I<currently> decode as C<undef> (this is not the same
823     as a reference pointing to itself, which will be represented as a value
824     that contains an indirect reference to itself - these will be decoded
825     properly).
826    
827     Note that considerably more shared value data structures can be decoded
828     than will be encoded - currently, only values pointed to by references
829     will be shared, others will not. While non-reference shared values can be
830     generated in Perl with some effort, they were considered too unimportant
831     to be supported in the encoder. The decoder, however, will decode these
832     values as shared values.
833 root 1.19
834 root 1.26 =item 256, 25 (stringref-namespace, stringref, L<http://cbor.schmorp.de/stringref>)
835 root 1.21
836     These tags are automatically decoded when encountered. They are only
837 root 1.25 encoded, however, when C<pack_strings> is enabled.
838 root 1.21
839 root 1.19 =item 22098 (indirection, L<http://cbor.schmorp.de/indirection>)
840    
841     This tag is automatically generated when a reference is encountered (with
842     the exception of hash and array references). It is converted to a reference
843     when decoding.
844    
845     =item 55799 (self-describe CBOR, RFC 7049)
846    
847     This value is not generated on encoding (unless explicitly requested by
848     the user), and is simply ignored when decoding.
849    
850     =back
851    
852 root 1.24 =head2 NON-ENFORCED TAGS
853 root 1.22
854     These tags have default filters provided when decoding. Their handling can
855     be overridden by changing the C<%CBOR::XS::FILTER> entry for the tag, or by
856 root 1.24 providing a custom C<filter> callback when decoding.
857 root 1.22
858     When they result in decoding into a specific Perl class, the module
859     usually provides a corresponding C<TO_CBOR> method as well.
860    
861     When any of these need to load additional modules that are not part of the
862     perl core distribution (e.g. L<URI>), it is (currently) up to the user to
863     provide these modules. The decoding usually fails with an exception if the
864     required module cannot be loaded.
865    
866     =over 4
867    
868 root 1.35 =item 0, 1 (date/time string, seconds since the epoch)
869    
870     These tags are decoded into L<Time::Piece> objects. The corresponding
871     C<Time::Piece::TO_CBOR> method always encodes into tag 1 values currently.
872    
873     The L<Time::Piece> API is generally surprisingly bad, and fractional
874     seconds are only accidentally kept intact, so watch out. On the plus side,
875     the module comes with perl since 5.10, which has to count for something.
876    
877 root 1.22 =item 2, 3 (positive/negative bignum)
878    
879     These tags are decoded into L<Math::BigInt> objects. The corresponding
880     C<Math::BigInt::TO_CBOR> method encodes "small" bigints into normal CBOR
881     integers, and others into positive/negative CBOR bignums.
882    
883     =item 4, 5 (decimal fraction/bigfloat)
884    
885     Both decimal fractions and bigfloats are decoded into L<Math::BigFloat>
886     objects. The corresponding C<Math::BigFloat::TO_CBOR> method I<always>
887     encodes into a decimal fraction.
888    
889     CBOR cannot represent bigfloats with I<very> large exponents - conversion
890     of such big float objects is undefined.
891    
892     Also, NaN and infinities are not encoded properly.
893    
894     =item 21, 22, 23 (expected later JSON conversion)
895    
896     CBOR::XS is not a CBOR-to-JSON converter, and will simply ignore these
897     tags.
898    
899     =item 32 (URI)
900    
901     These objects decode into L<URI> objects. The corresponding
902     C<URI::TO_CBOR> method again results in a CBOR URI value.
903    
904     =back
905    
906     =cut
907    
# NOTE: the table of default tag filters (%FILTER) is defined exactly once,
# further down in this file, right before CBOR::XS::default_filter. An older,
# incomplete copy of that table (without the tag 0 and tag 1 date/time
# filters) used to be assigned here. It was replaced wholesale by the later
# "our %FILTER = (...)" assignment before any code could read it, and it
# redeclared the same "our" variable, so it has been removed.
948    
949 root 1.19
950 root 1.7 =head1 CBOR and JSON
951 root 1.1
952 root 1.4 CBOR is supposed to implement a superset of the JSON data model, and is,
953     with some coercion, able to represent all JSON texts (something that other
954     "binary JSON" formats such as BSON generally do not support).
955    
956     CBOR implements some extra hints and support for JSON interoperability,
957     and the spec offers further guidance for conversion between CBOR and
958     JSON. None of this is currently implemented in CBOR::XS, and the guidelines
959     in the spec do not result in correct round-tripping of data. If JSON
960     interoperability is improved in the future, then the goal will be to
961     ensure that decoded JSON data will round-trip encoding and decoding to
962     CBOR intact.
963 root 1.1
964    
965     =head1 SECURITY CONSIDERATIONS
966    
967     When you are using CBOR in a protocol, talking to untrusted potentially
968     hostile creatures requires relatively few measures.
969    
970     First of all, your CBOR decoder should be secure, that is, should not have
971     any buffer overflows. Obviously, this module should ensure that and I am
972     trying hard on making that true, but you never know.
973    
974     Second, you need to avoid resource-starving attacks. That means you should
975     limit the size of CBOR data you accept, or make sure that when your
976     resources run out, that's just fine (e.g. by using a separate process that
977     can crash safely). The size of a CBOR string in octets is usually a good
978     indication of the size of the resources required to decode it into a Perl
979     structure. While CBOR::XS can check the size of the CBOR text, it might be
980     too late when you already have it in memory, so you might want to check
981     the size before you accept the string.
982    
983     Third, CBOR::XS recurses using the C stack when decoding objects and
984     arrays. The C stack is a limited resource: for instance, on my amd64
985     machine with 8MB of stack size I can decode around 180k nested arrays but
986     only 14k nested CBOR objects (due to perl itself recursing deeply on croak
987     to free the temporary). If that is exceeded, the program crashes. To be
988     conservative, the default nesting limit is set to 512. If your process
989     has a smaller stack, you should adjust this setting accordingly with the
990     C<max_depth> method.
991    
992     Something else could bomb you, too, that I forgot to think of. In that
993     case, you get to keep the pieces. I am always open for hints, though...
994    
995     Also keep in mind that CBOR::XS might leak contents of your Perl data
996     structures in its error messages, so when you serialise sensitive
997     information you might want to make sure that exceptions thrown by CBOR::XS
998     will not end up in front of untrusted eyes.
999    
1000     =head1 CBOR IMPLEMENTATION NOTES
1001    
1002     This section contains some random implementation notes. They do not
1003     describe guaranteed behaviour, but merely behaviour as-is implemented
1004     right now.
1005    
1006     64 bit integers are only properly decoded when Perl was built with 64 bit
1007     support.
1008    
1009     Strings and arrays are encoded with a definite length. Hashes as well,
1010     unless they are tied (or otherwise magical).
1011    
1012     Only the double data type is supported for NV data types - when Perl uses
1013     long double to represent floating point values, they might not be encoded
1014     properly. Half precision types are accepted, but not encoded.
1015    
1016     Strict mode and canonical mode are not implemented.
1017    
1018    
1019 root 1.30 =head1 LIMITATIONS ON PERLS WITHOUT 64-BIT INTEGER SUPPORT
1020    
1021     On perls that were built without 64 bit integer support (these are rare
1022     nowadays, even on 32 bit architectures), support for any kind of 64 bit
1023     integer in CBOR is very limited - most likely, these 64 bit values will
1024     be truncated, corrupted, or otherwise not decoded correctly. This also
1025     includes string, array and map sizes that are stored as 64 bit integers.
1026    
1027    
1028 root 1.1 =head1 THREADS
1029    
1030     This module is I<not> guaranteed to be thread safe and there are no
1031     plans to change this until Perl gets thread support (as opposed to the
1032     horribly slow so-called "threads" which are simply slow and bloated
1033     process simulations - use fork, it's I<much> faster, cheaper, better).
1034    
1035     (It might actually work, but you have been warned).
1036    
1037    
1038     =head1 BUGS
1039    
1040     While the goal of this module is to be correct, that unfortunately does
1041     not mean it's bug-free, only that I think its design is bug-free. If you
1042     keep reporting bugs they will be fixed swiftly, though.
1043    
1044     Please refrain from using rt.cpan.org or any other bug reporting
1045     service. I put the contact address into my modules for a reason.
1046    
1047     =cut
1048    
# Default decoding filters for the non-enforced tags, keyed by CBOR tag
# number. Every filter receives the tag number as its first and the decoded
# tagged value as its last argument, and returns the Perl value to use in
# place of the tagged value. Helper modules are only loaded when a tag that
# needs them is actually decoded.
our %FILTER = (
   0 => sub { # rfc4287 datetime, utf-8
      require Time::Piece;
      # Time::Piece::Strptime uses the "incredibly flexible date parsing routine"
      # from FreeBSD, which can't parse ISO 8601, RFC3339, RFC4287 or much of anything
      # else either. What's incredible over standard strptime totally escapes me.
      # doesn't do fractional times, either. sigh.
      # In fact, it's all a lie, it uses whatever strptime it wants, and of course,
      # they are all incompatible. The openbsd one simply ignores %z (but according to the
      # docs, it would be much more incredibly flexible indeed. If it worked, that is.).
      # So fractional seconds and the UTC offset are cut off and applied by hand.

      # the assignment keeps the eval block (and thus gmtime) in scalar context
      my $time = eval {
         my $s = $_[1];

         $s =~ s/Z$/+00:00/;
         $s =~ s/(\.[0-9]+)?([+-])([0-9][0-9]):([0-9][0-9])$//
            or die;

         my ($fraction, $sign, $hours, $minutes) = ($1, $2, $3, $4);

         # the sign covers hours and minutes alike ("-03:30" is -12600 seconds)
         my $offset = ($hours * 60 + $minutes) * 60;
         $offset = -$offset if $sign eq "-";

         my $d = Time::Piece->strptime ($s, "%Y-%m-%dT%H:%M:%S");

         # local time minus offset is UTC; the fraction is optional
         Time::Piece::gmtime ($d->epoch + ($fraction || 0) - $offset)
      };

      # report the offending string ($_[1]), not the tag number ($_[0])
      $time or die "corrupted CBOR date/time string ($_[1])";
      $time
   },

   1 => sub { # seconds since the epoch, possibly fractional
      require Time::Piece;
      scalar Time::Piece::gmtime (pop)
   },

   2 => sub { # pos bigint, value is n
      require Math::BigInt;
      Math::BigInt->new ("0x" . unpack "H*", pop)
   },

   3 => sub { # neg bigint, value is -1 - n (RFC 7049, section 2.4.2)
      require Math::BigInt;
      -1 - Math::BigInt->new ("0x" . unpack "H*", pop)
   },

   4 => sub { # decimal fraction, array [exponent, mantissa], base 10
      require Math::BigFloat;
      Math::BigFloat->new ($_[1][1] . "E" . $_[1][0])
   },

   5 => sub { # bigfloat, array [exponent, mantissa], base 2
      require Math::BigFloat;
      scalar Math::BigFloat->new ($_[1][1])->blsft ($_[1][0], 2)
   },

   # these three only hint at a later JSON conversion, so pass the value through
   21 => sub { pop }, # expected conversion to base64url encoding
   22 => sub { pop }, # expected conversion to base64 encoding
   23 => sub { pop }, # expected conversion to base16 encoding

   # 24 # embedded cbor, byte string

   32 => sub {
      require URI;
      URI->new (pop)
   },

   # 33 # base64url rfc4648, utf-8
   # 34 # base64 rfc4648, utf-8
   # 35 # regex pcre/ecma262, utf-8
   # 36 # mime message rfc2045, utf-8
);
1114    
# Looks up the filter registered in %FILTER for the tag number in $_[0].
# Without an entry it returns nothing; otherwise the filter is called with
# this sub's own argument list and its result is returned.
sub CBOR::XS::default_filter {
   my $filter = $FILTER{$_[0]}
      or return;

   &$filter # no parens: the filter shares our @_
}
1118    
# Serialises the invocant as a tag 32 value wrapping its as_string form,
# which is upgraded to perl's internal UTF-8 representation first.
sub URI::TO_CBOR {
   my ($self) = @_;

   my $text = $self->as_string;
   utf8::upgrade $text;

   tag 32, $text
}
1124    
# Serialises a Math::BigInt. Values that fit into a signed 32 bit integer
# are returned as plain perl numbers; everything else becomes a CBOR bignum,
# i.e. a tag 2 (positive) or tag 3 (negative) value wrapping the big-endian
# octets of an unsigned integer n.
sub Math::BigInt::TO_CBOR {
   my ($self) = @_;

   return $self->numify
      if $self >= -2147483648 && $self <= 2147483647;

   # RFC 7049, section 2.4.2: tag 2 stands for n, tag 3 for -1 - n, so a
   # negative value is stored as the non-negative number -1 - value. This
   # also keeps the "-" that as_hex emits for negative numbers out of $hex.
   my $negative = $self < 0;
   my $n = $negative ? -($self + 1) : $self;

   my $hex = substr $n->as_hex, 2; # drop the "0x" prefix
   $hex = "0$hex" if 1 & length $hex; # sigh - pad to whole octets at the front

   tag $negative ? 3 : 2, pack "H*", $hex
}
1134    
# Serialises a Math::BigFloat as a tag 4 (decimal fraction) value: a two
# element array holding the numified exponent and then the mantissa, both
# as returned by the object's parts method.
sub Math::BigFloat::TO_CBOR {
   my ($mantissa, $exponent) = $_[0]->parts;

   tag 4, [$exponent->numify, $mantissa]
}
1139    
# Serialises a Time::Piece object as a tag 1 value wrapping its epoch.
sub Time::Piece::TO_CBOR {
   my ($self) = @_;

   tag 1, $self->epoch
}
1143    
1144 root 1.1 XSLoader::load "CBOR::XS", $VERSION;
1145    
1146     =head1 SEE ALSO
1147    
1148     The L<JSON> and L<JSON::XS> modules that do similar, but human-readable,
1149     serialisation.
1150    
1151 root 1.6 The L<Types::Serialiser> module provides the data model for true, false
1152     and error values.
1153    
1154 root 1.1 =head1 AUTHOR
1155    
1156     Marc Lehmann <schmorp@schmorp.de>
1157     http://home.schmorp.de/
1158    
1159     =cut
1160    
1161 root 1.6 1
1162