ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/CBOR-XS/XS.pm
Revision: 1.38
Committed: Tue Dec 3 10:23:55 2013 UTC (10 years, 5 months ago) by root
Branch: MAIN
CVS Tags: rel-1_12
Changes since 1.37: +1 -1 lines
Log Message:
1.12

File Contents

# User Rev Content
1 root 1.1 =head1 NAME
2    
3     CBOR::XS - Concise Binary Object Representation (CBOR, RFC7049)
4    
5     =encoding utf-8
6    
7     =head1 SYNOPSIS
8    
9     use CBOR::XS;
10    
11     $binary_cbor_data = encode_cbor $perl_value;
12     $perl_value = decode_cbor $binary_cbor_data;
13    
14     # OO-interface
15    
16     $coder = CBOR::XS->new;
17 root 1.6 $binary_cbor_data = $coder->encode ($perl_value);
18     $perl_value = $coder->decode ($binary_cbor_data);
19    
20     # prefix decoding
21    
22     my $many_cbor_strings = ...;
23     while (length $many_cbor_strings) {
24     my ($data, $length) = $coder->decode_prefix ($many_cbor_strings);
25     # data was decoded
26     substr $many_cbor_strings, 0, $length, ""; # remove decoded cbor string
27     }
28 root 1.1
29     =head1 DESCRIPTION
30    
31 root 1.5 This module converts Perl data structures to the Concise Binary Object
32     Representation (CBOR) and vice versa. CBOR is a fast binary serialisation
33 root 1.28 format that aims to use an (almost) superset of the JSON data model, i.e.
34     when you can represent something useful in JSON, you should be able to
35     represent it in CBOR.
36 root 1.1
37 root 1.28 In short, CBOR is a faster and quite compact binary alternative to JSON,
38 root 1.10 with the added ability of supporting serialisation of Perl objects. (JSON
39     often compresses better than CBOR though, so if you plan to compress the
40 root 1.28 data later and speed is less important you might want to compare both
41     formats first).
42 root 1.5
43 root 1.15 To give you a general idea about speed, with texts in the megabyte range,
44     C<CBOR::XS> usually encodes roughly twice as fast as L<Storable> or
45     L<JSON::XS> and decodes about 15%-30% faster than those. The shorter the
46     data, the worse L<Storable> performs in comparison.
47    
48 root 1.28 Regarding compactness, C<CBOR::XS>-encoded data structures are usually
49     about 20% smaller than the same data encoded as (compact) JSON or
50     L<Storable>.
51    
52     In addition to the core CBOR data format, this module implements a
53 root 1.31 number of extensions, to support cyclic and shared data structures
54     (see C<allow_sharing> and C<allow_cycles>), string deduplication (see
55     C<pack_strings>) and scalar references (always enabled).
56 root 1.21
57 root 1.5 The primary goal of this module is to be I<correct> and the secondary goal
58     is to be I<fast>. To reach the latter goal it was written in C.
59 root 1.1
60     See MAPPING, below, on how CBOR::XS maps perl values to CBOR values and
61     vice versa.
62    
63     =cut
64    
65     package CBOR::XS;
66    
67     use common::sense;
68    
69 root 1.38 our $VERSION = 1.12;
70 root 1.1 our @ISA = qw(Exporter);
71    
72     our @EXPORT = qw(encode_cbor decode_cbor);
73    
74     use Exporter;
75     use XSLoader;
76    
77 root 1.6 use Types::Serialiser;
78    
79 root 1.3 our $MAGIC = "\xd9\xd9\xf7";
80    
81 root 1.1 =head1 FUNCTIONAL INTERFACE
82    
83     The following convenience methods are provided by this module. They are
84     exported by default:
85    
86     =over 4
87    
88     =item $cbor_data = encode_cbor $perl_scalar
89    
90     Converts the given Perl data structure to CBOR representation. Croaks on
91     error.
92    
93     =item $perl_scalar = decode_cbor $cbor_data
94    
95     The opposite of C<encode_cbor>: expects a valid CBOR string to parse,
96     returning the resulting perl scalar. Croaks on error.
97    
98     =back
99    
100    
101     =head1 OBJECT-ORIENTED INTERFACE
102    
103     The object oriented interface lets you configure your own encoding or
104     decoding style, within the limits of supported formats.
105    
106     =over 4
107    
108     =item $cbor = new CBOR::XS
109    
110     Creates a new CBOR::XS object that can be used to de/encode CBOR
111     strings. All boolean flags described below are by default I<disabled>.
112    
113     The mutators for flags all return the CBOR object again and thus calls can
114     be chained:
115    
116     my $cbor = CBOR::XS->new->encode ({a => [1,2]});
117    
118     =item $cbor = $cbor->max_depth ([$maximum_nesting_depth])
119    
120     =item $max_depth = $cbor->get_max_depth
121    
122     Sets the maximum nesting level (default C<512>) accepted while encoding
123     or decoding. If a higher nesting level is detected in CBOR data or a Perl
124     data structure, then the encoder and decoder will stop and croak at that
125     point.
126    
127     Nesting level is defined by number of hash- or arrayrefs that the encoder
128     needs to traverse to reach a given point or the number of C<{> or C<[>
129     characters without their matching closing parenthesis crossed to reach a
130     given character in a string.
131    
132     Setting the maximum depth to one disallows any nesting, so that ensures
133     that the object is only a single hash/object or array.
134    
135     If no argument is given, the highest possible setting will be used, which
136     is rarely useful.
137    
138     Note that nesting is implemented by recursion in C. The default value has
139     been chosen to be as large as typical operating systems allow without
140     crashing.
141    
142     See SECURITY CONSIDERATIONS, below, for more info on why this is useful.
143    
144     =item $cbor = $cbor->max_size ([$maximum_string_size])
145    
146     =item $max_size = $cbor->get_max_size
147    
148     Set the maximum length a CBOR string may have (in bytes) where decoding
149     is being attempted. The default is C<0>, meaning no limit. When C<decode>
150     is called on a string that is longer than this many bytes, it will not
151     attempt to decode the string but throw an exception. This setting has no
152     effect on C<encode> (yet).
153    
154     If no argument is given, the limit check will be deactivated (same as when
155     C<0> is specified).
156    
157     See SECURITY CONSIDERATIONS, below, for more info on why this is useful.
158    
159 root 1.19 =item $cbor = $cbor->allow_unknown ([$enable])
160    
161     =item $enabled = $cbor->get_allow_unknown
162    
163     If C<$enable> is true (or missing), then C<encode> will I<not> throw an
164     exception when it encounters values it cannot represent in CBOR (for
165     example, filehandles) but instead will encode a CBOR C<error> value.
166    
167     If C<$enable> is false (the default), then C<encode> will throw an
168     exception when it encounters anything it cannot encode as CBOR.
169    
170     This option does not affect C<decode> in any way, and it is recommended to
171     leave it off unless you know your communications partner.
172    
173 root 1.20 =item $cbor = $cbor->allow_sharing ([$enable])
174 root 1.19
175 root 1.20 =item $enabled = $cbor->get_allow_sharing
176 root 1.19
177     If C<$enable> is true (or missing), then C<encode> will not double-encode
178 root 1.20 values that have been referenced before (e.g. when the same object, such
179     as an array, is referenced multiple times), but instead will emit a
180     reference to the earlier value.
181 root 1.19
182     This means that such values will only be encoded once, and will not result
183     in a deep cloning of the value on decode, in decoders supporting the value
184 root 1.25 sharing extension. This also makes it possible to encode cyclic data
185 root 1.31 structures (which need C<allow_cycles> to be enabled to be decoded by this
186     module).
187 root 1.19
188 root 1.21 It is recommended to leave it off unless you know your
189     communication partner supports the value sharing extensions to CBOR
190 root 1.26 (L<http://cbor.schmorp.de/value-sharing>), as without decoder support, the
191 root 1.25 resulting data structure might be unusable.
192 root 1.21
193 root 1.19 Detecting shared values incurs a runtime overhead when values are encoded
194     that have a reference counter larger than one, and might unnecessarily
195     increase the encoded size, as potentially shared values are encoded as
196 root 1.31 shareable whether or not they are actually shared.
197 root 1.19
198 root 1.20 At the moment, only targets of references can be shared (e.g. scalars,
199     arrays or hashes pointed to by a reference). Weirder constructs, such as
200     an array with multiple "copies" of the I<same> string, which are hard but
201     not impossible to create in Perl, are not supported (this is the same as
202 root 1.25 with L<Storable>).
203 root 1.19
204 root 1.25 If C<$enable> is false (the default), then C<encode> will encode shared
205     data structures repeatedly, unsharing them in the process. Cyclic data
206     structures cannot be encoded in this mode.
207 root 1.19
208     This option does not affect C<decode> in any way - shared values and
209 root 1.21 references will always be decoded properly if present.
210    
211 root 1.31 =item $cbor = $cbor->allow_cycles ([$enable])
212    
213     =item $enabled = $cbor->get_allow_cycles
214    
215     If C<$enable> is true (or missing), then C<decode> will happily decode
216     self-referential (cyclic) data structures. By default these will not be
217     decoded, as they need manual cleanup to avoid memory leaks, so code that
218     isn't prepared for this will not leak memory.
219    
220     If C<$enable> is false (the default), then C<decode> will throw an error
221     when it encounters a self-referential/cyclic data structure.
222    
223     This option does not affect C<encode> in any way - shared values and
224     references will always be encoded properly if present.
225    
226 root 1.25 =item $cbor = $cbor->pack_strings ([$enable])
227 root 1.21
228 root 1.25 =item $enabled = $cbor->get_pack_strings
229 root 1.21
230     If C<$enable> is true (or missing), then C<encode> will try not to encode
231     the same string twice, but will encode a reference to the string
232 root 1.25 instead. Depending on your data format, this can save a lot of space, but
233 root 1.21 also results in a very large runtime overhead (expect encoding times to be
234     2-4 times as high as without).
235    
236     It is recommended to leave it off unless you know your
237     communications partner supports the stringref extension to CBOR
238 root 1.26 (L<http://cbor.schmorp.de/stringref>), as without decoder support, the
239 root 1.25 resulting data structure might not be usable.
240 root 1.21
241 root 1.25 If C<$enable> is false (the default), then C<encode> will encode strings
242     the standard CBOR way.
243 root 1.21
244     This option does not affect C<decode> in any way - string references will
245     always be decoded properly if present.
246 root 1.19
247 root 1.33 =item $cbor = $cbor->validate_utf8 ([$enable])
248    
249     =item $enabled = $cbor->get_validate_utf8
250    
251     If C<$enable> is true (or missing), then C<decode> will validate that
252     elements (text strings) containing UTF-8 data in fact contain valid UTF-8
253     data (instead of blindly accepting it). This validation obviously takes
254     extra time during decoding.
255    
256     The concept of "valid UTF-8" used is perl's concept, which is a superset
257     of the official UTF-8.
258    
259     If C<$enable> is false (the default), then C<decode> will blindly accept
260     UTF-8 data, marking them as valid UTF-8 in the resulting data structure
261     regardless of whether that's true or not.
262    
263     Perl isn't too happy about corrupted UTF-8 in strings, but should
264     generally not crash or do similarly evil things. Extensions might be not
265     so forgiving, so it's recommended to turn on this setting if you receive
266     untrusted CBOR.
267    
268     This option does not affect C<encode> in any way - strings that are
269     supposedly valid UTF-8 will simply be dumped into the resulting CBOR
270     string without checking whether that is, in fact, true or not.
271    
272 root 1.23 =item $cbor = $cbor->filter ([$cb->($tag, $value)])
273    
274     =item $cb_or_undef = $cbor->get_filter
275    
276 root 1.24 Sets or replaces the tagged value decoding filter (when C<$cb> is
277     specified) or clears the filter (if no argument or C<undef> is provided).
278    
279     The filter callback is called only during decoding, when a non-enforced
280     tagged value has been decoded (see L<TAG HANDLING AND EXTENSIONS> for a
281     list of enforced tags). For specific tags, it's often better to provide a
282     default converter using the C<%CBOR::XS::FILTER> hash (see below).
283    
284     The first argument is the numerical tag, the second is the (decoded) value
285     that has been tagged.
286    
287     The filter function should return either exactly one value, which will
288     replace the tagged value in the decoded data structure, or no values,
289     which will result in default handling, which currently means the decoder
290     creates a C<CBOR::XS::Tagged> object to hold the tag and the value.
291    
292     When the filter is cleared (the default state), the default filter
293     function, C<CBOR::XS::default_filter>, is used. This function simply looks
294     up the tag in the C<%CBOR::XS::FILTER> hash. If an entry exists it must be
295     a code reference that is called with tag and value, and is responsible for
296     decoding the value. If no entry exists, it returns no values.
297    
298 root 1.28 Example: decode all tags not handled internally into C<CBOR::XS::Tagged>
299 root 1.24 objects, with no other special handling (useful when working with
300     potentially "unsafe" CBOR data).
301    
302     CBOR::XS->new->filter (sub { })->decode ($cbor_data);
303    
304     Example: provide a global filter for tag 1347375694, converting the value
305     into some string form.
306    
307     $CBOR::XS::FILTER{1347375694} = sub {
308     my ($tag, $value) = @_;
309    
310     "tag 1347375694 value $value"
311     };
312 root 1.23
313 root 1.1 =item $cbor_data = $cbor->encode ($perl_scalar)
314    
315     Converts the given Perl data structure (a scalar value) to its CBOR
316     representation.
317    
318     =item $perl_scalar = $cbor->decode ($cbor_data)
319    
320     The opposite of C<encode>: expects CBOR data and tries to parse it,
321     returning the resulting simple scalar or reference. Croaks on error.
322    
323     =item ($perl_scalar, $octets) = $cbor->decode_prefix ($cbor_data)
324    
325     This works like the C<decode> method, but instead of raising an exception
326     when there is trailing garbage after the CBOR string, it will silently
327     stop parsing there and return the number of characters consumed so far.
328    
329     This is useful if your CBOR texts are not delimited by an outer protocol
330     and you need to know where the first CBOR string ends and the next one
331     starts.
332    
333     CBOR::XS->new->decode_prefix ("......")
334     => ("...", 3)
335    
336     =back
337    
338    
339     =head1 MAPPING
340    
341     This section describes how CBOR::XS maps Perl values to CBOR values and
342     vice versa. These mappings are designed to "do the right thing" in most
343     circumstances automatically, preserving round-tripping characteristics
344     (what you put in comes out as something equivalent).
345    
346     For the more enlightened: note that in the following descriptions,
347     lowercase I<perl> refers to the Perl interpreter, while uppercase I<Perl>
348     refers to the abstract Perl language itself.
349    
350    
351     =head2 CBOR -> PERL
352    
353     =over 4
354    
355 root 1.4 =item integers
356    
357     CBOR integers become (numeric) perl scalars. On perls without 64 bit
358     support, 64 bit integers will be truncated or otherwise corrupted.
359    
360     =item byte strings
361    
362 root 1.27 Byte strings will become octet strings in Perl (the Byte values 0..255
363 root 1.4 will simply become characters of the same value in Perl).
364    
365     =item UTF-8 strings
366    
367     UTF-8 strings in CBOR will be decoded, i.e. the UTF-8 octets will be
368     decoded into proper Unicode code points. At the moment, the validity of
369     the UTF-8 octets will not be validated - corrupt input will result in
370     corrupted Perl strings.
371    
372     =item arrays, maps
373    
374     CBOR arrays and CBOR maps will be converted into references to a Perl
375     array or hash, respectively. The keys of the map will be stringified
376     during this process.
377    
378 root 1.6 =item null
379    
380     CBOR null becomes C<undef> in Perl.
381    
382     =item true, false, undefined
383 root 1.1
384 root 1.6 These CBOR values become C<Types::Serialiser::true>,
385     C<Types::Serialiser::false> and C<Types::Serialiser::error>,
386 root 1.1 respectively. They are overloaded to act almost exactly like the numbers
387 root 1.6 C<1> and C<0> (for true and false) or to throw an exception on access (for
388     error). See the L<Types::Serialiser> manpage for details.
389    
390 root 1.23 =item tagged values
391 root 1.1
392 root 1.23 Tagged items consist of a numeric tag and another CBOR value.
393 root 1.4
394 root 1.23 See L<TAG HANDLING AND EXTENSIONS> and the description of C<< ->filter >>
395 root 1.28 for details on which tags are handled how.
396 root 1.4
397     =item anything else
398    
399     Anything else (e.g. unsupported simple values) will raise a decoding
400     error.
401 root 1.1
402     =back
403    
404    
405     =head2 PERL -> CBOR
406    
407     The mapping from Perl to CBOR is slightly more difficult, as Perl is a
408 root 1.28 typeless language. That means this module can only guess which CBOR type
409     is meant by a perl value.
410 root 1.1
411     =over 4
412    
413     =item hash references
414    
415 root 1.4 Perl hash references become CBOR maps. As there is no inherent ordering in
416     hash keys (or CBOR maps), they will usually be encoded in a pseudo-random
417 root 1.28 order. This order can be different each time a hash is encoded.
418 root 1.4
419     Currently, tied hashes will use the indefinite-length format, while normal
420     hashes will use the fixed-length format.
421 root 1.1
422     =item array references
423    
424 root 1.4 Perl array references become fixed-length CBOR arrays.
425 root 1.1
426     =item other references
427    
428 root 1.28 Other unblessed references will be represented using
429     the indirection tag extension (tag value C<22098>,
430     L<http://cbor.schmorp.de/indirection>). CBOR decoders are guaranteed
431     to be able to decode these values somehow, by either "doing the right
432     thing", decoding into a generic tagged object, simply ignoring the tag, or
433     something else.
434 root 1.4
435     =item CBOR::XS::Tagged objects
436    
437     Objects of this type must be arrays consisting of a single C<[tag, value]>
438 root 1.13 pair. The (numerical) tag will be encoded as a CBOR tag, the value will
439 root 1.28 be encoded as appropriate for the value. You must use C<CBOR::XS::tag> to
440 root 1.13 create such objects.
441 root 1.1
442 root 1.6 =item Types::Serialiser::true, Types::Serialiser::false, Types::Serialiser::error
443 root 1.1
444 root 1.6 These special values become CBOR true, CBOR false and CBOR undefined
445     values, respectively. You can also use C<\1>, C<\0> and C<\undef> directly
446     if you want.
447 root 1.1
448 root 1.7 =item other blessed objects
449 root 1.1
450 root 1.7 Other blessed objects are serialised via C<TO_CBOR> or C<FREEZE>. See
451 root 1.23 L<TAG HANDLING AND EXTENSIONS> for specific classes handled by this
452     module, and L<OBJECT SERIALISATION> for generic object serialisation.
453 root 1.1
454     =item simple scalars
455    
456     Simple Perl scalars (any scalar that is not a reference) are the most
457     difficult objects to encode: CBOR::XS will encode undefined scalars as
458 root 1.4 CBOR null values, scalars that have last been used in a string context
459 root 1.1 before encoding as CBOR strings, and anything else as number value:
460    
461     # dump as number
462     encode_cbor [2] # yields [2]
463     encode_cbor [-3.0e17] # yields [-3e+17]
464     my $value = 5; encode_cbor [$value] # yields [5]
465    
466 root 1.27 # used as string, so dump as string (either byte or text)
467 root 1.1 print $value;
468     encode_cbor [$value] # yields ["5"]
469    
470     # undef becomes null
471     encode_cbor [undef] # yields [null]
472    
473     You can force the type to be a CBOR string by stringifying it:
474    
475     my $x = 3.1; # some variable containing a number
476     "$x"; # stringified
477     $x .= ""; # another, more awkward way to stringify
478     print $x; # perl does it for you, too, quite often
479    
480 root 1.27 You can force whether a string is encoded as byte or text string by using
481     C<utf8::upgrade> and C<utf8::downgrade>:
482    
483     utf8::upgrade $x; # encode $x as text string
484     utf8::downgrade $x; # encode $x as byte string
485    
486     Perl doesn't define what operations up- and downgrade strings, so if the
487     difference between byte and text is important, you should up- or downgrade
488     your string as late as possible before encoding.
489    
490 root 1.1 You can force the type to be a CBOR number by numifying it:
491    
492     my $x = "3"; # some variable containing a string
493     $x += 0; # numify it, ensuring it will be dumped as a number
494     $x *= 1; # same thing, the choice is yours.
495    
496     You can not currently force the type in other, less obscure, ways. Tell me
497     if you need this capability (but don't forget to explain why it's needed
498     :).
499    
500 root 1.4 Perl values that seem to be integers generally use the shortest possible
501     representation. Floating-point values will use either the IEEE single
502     format if possible without loss of precision, otherwise the IEEE double
503     format will be used. Perls that use formats other than IEEE double to
504     represent numerical values are supported, but might suffer loss of
505     precision.
506 root 1.1
507     =back
508    
509 root 1.7 =head2 OBJECT SERIALISATION
510    
511 root 1.29 This module implements both a CBOR-specific and the generic
512     L<Types::Serialiser> object serialisation protocol. The following
513     subsections explain both methods.
514    
515     =head3 ENCODING
516    
517 root 1.7 This module knows two ways to serialise a Perl object: The CBOR-specific
518     way, and the generic way.
519    
520 root 1.29 Whenever the encoder encounters a Perl object that it cannot serialise
521 root 1.7 directly (most of them), it will first look up the C<TO_CBOR> method on
522     it.
523    
524     If it has a C<TO_CBOR> method, it will call it with the object as only
525     argument, and expects exactly one return value, which it will then
526     substitute and encode it in the place of the object.
527    
528     Otherwise, it will look up the C<FREEZE> method. If it exists, it will
529     call it with the object as first argument, and the constant string C<CBOR>
530     as the second argument, to distinguish it from other serialisers.
531    
532     The C<FREEZE> method can return any number of values (i.e. zero or
533     more). These will be encoded as CBOR perl object, together with the
534     classname.
535    
536 root 1.29 These methods I<MUST NOT> change the data structure that is being
537     serialised. Failure to comply to this can result in memory corruption -
538     and worse.
539    
540 root 1.7 If an object supports neither C<TO_CBOR> nor C<FREEZE>, encoding will fail
541     with an error.
542    
543 root 1.29 =head3 DECODING
544    
545     Objects encoded via C<TO_CBOR> cannot (normally) be automatically decoded,
546     but objects encoded via C<FREEZE> can be decoded using the following
547     protocol:
548 root 1.7
549     When an encoded CBOR perl object is encountered by the decoder, it will
550     look up the C<THAW> method, by using the stored classname, and will fail
551     if the method cannot be found.
552    
553     After the lookup it will call the C<THAW> method with the stored classname
554     as first argument, the constant string C<CBOR> as second argument, and all
555     values returned by C<FREEZE> as remaining arguments.
556    
557 root 1.29 =head3 EXAMPLES
558 root 1.7
559     Here is an example C<TO_CBOR> method:
560    
561     sub My::Object::TO_CBOR {
562     my ($obj) = @_;
563    
564     ["this is a serialised My::Object object", $obj->{id}]
565     }
566    
567     When a C<My::Object> is encoded to CBOR, it will instead encode a simple
568     array with two members: a string, and the "object id". Decoding this CBOR
569     string will yield a normal perl array reference in place of the object.
570    
571     A more useful and practical example would be a serialisation method for
572     the URI module. CBOR has a custom tag value for URIs, namely 32:
573    
574     sub URI::TO_CBOR {
575     my ($self) = @_;
576     my $uri = "$self"; # stringify uri
577     utf8::upgrade $uri; # make sure it will be encoded as UTF-8 string
578 root 1.28 CBOR::XS::tag 32, $uri
579 root 1.7 }
580    
581     This will encode URIs as a UTF-8 string with tag 32, which indicates an
582     URI.
583    
584     Decoding such an URI will not (currently) give you an URI object, but
585     instead a CBOR::XS::Tagged object with tag number 32 and the string -
586     exactly what was returned by C<TO_CBOR>.
587    
588     To serialise an object so it can automatically be deserialised, you need
589     to use C<FREEZE> and C<THAW>. To take the URI module as example, this
590     would be a possible implementation:
591    
592     sub URI::FREEZE {
593     my ($self, $serialiser) = @_;
594     "$self" # encode url string
595     }
596    
597     sub URI::THAW {
598     my ($class, $serialiser, $uri) = @_;
599    
600     $class->new ($uri)
601     }
602    
603     Unlike C<TO_CBOR>, multiple values can be returned by C<FREEZE>. For
604     example, a C<FREEZE> method that returns "type", "id" and "variant" values
605     would cause an invocation of C<THAW> with 5 arguments:
606    
607     sub My::Object::FREEZE {
608     my ($self, $serialiser) = @_;
609    
610     ($self->{type}, $self->{id}, $self->{variant})
611     }
612    
613     sub My::Object::THAW {
614     my ($class, $serialiser, $type, $id, $variant) = @_;
615    
616     $class->new (type => $type, id => $id, variant => $variant)
617     }
618    
619 root 1.1
620 root 1.7 =head1 MAGIC HEADER
621 root 1.3
622     There is no way to distinguish CBOR from other formats
623     programmatically. To make it easier to distinguish CBOR from other
624     formats, the CBOR specification has a special "magic string" that can be
625 root 1.18 prepended to any CBOR string without changing its meaning.
626 root 1.3
627     This string is available as C<$CBOR::XS::MAGIC>. This module does not
628 root 1.18 prepend this string to the CBOR data it generates, but it will ignore it
629 root 1.3 if present, so users can prepend this string as a "file type" indicator as
630     required.
631    
632    
633 root 1.12 =head1 THE CBOR::XS::Tagged CLASS
634    
635     CBOR has the concept of tagged values - any CBOR value can be tagged with
636     a numeric 64 bit number, which are centrally administered.
637    
638     C<CBOR::XS> handles a few tags internally when en- or decoding. You can
639     also create tags yourself by encoding C<CBOR::XS::Tagged> objects, and the
640     decoder will create C<CBOR::XS::Tagged> objects itself when it hits an
641     unknown tag.
642    
643     These objects are simply blessed array references - the first member of
644     the array being the numerical tag, the second being the value.
645    
646     You can interact with C<CBOR::XS::Tagged> objects in the following ways:
647    
648     =over 4
649    
650     =item $tagged = CBOR::XS::tag $tag, $value
651    
652     This function(!) creates a new C<CBOR::XS::Tagged> object using the given
653     C<$tag> (0..2**64-1) to tag the given C<$value> (which can be any Perl
654     value that can be encoded in CBOR, including serialisable Perl objects and
655     C<CBOR::XS::Tagged> objects).
656    
657     =item $tagged->[0]
658    
659     =item $tagged->[0] = $new_tag
660    
661     =item $tag = $tagged->tag
662    
663     =item $new_tag = $tagged->tag ($new_tag)
664    
665     Access/mutate the tag.
666    
667     =item $tagged->[1]
668    
669     =item $tagged->[1] = $new_value
670    
671     =item $value = $tagged->value
672    
673     =item $new_value = $tagged->value ($new_value)
674    
675     Access/mutate the tagged value.
676    
677     =back
678    
679     =cut
680    
# Build a CBOR::XS::Tagged object: a blessed two-element array holding
# the numeric tag followed by the tagged value.
#
# The ($$) prototype is kept on purpose - it changes how unparenthesised
# calls such as "CBOR::XS::tag 32, $uri" are parsed by callers.
sub tag($$) {
    my @pair = @_;

    return bless \@pair, "CBOR::XS::Tagged";
}
684    
# Combined getter/setter for the numeric tag (element 0 of the object).
#
#   $tag     = $tagged->tag;            # read
#   $new_tag = $tagged->tag ($new_tag); # write, returns the new tag
#
# Works on $_[0] directly (no copy), exactly like direct element access.
sub CBOR::XS::Tagged::tag {
    # anything other than "object only" counts as a set request
    if (@_ != 1) {
        $_[0][0] = $_[1];
    }

    return $_[0][0];
}
689    
# Combined getter/setter for the tagged value (element 1 of the object).
#
#   $value     = $tagged->value;              # read
#   $new_value = $tagged->value ($new_value); # write, returns the new value
#
# Works on $_[0] directly (no copy), exactly like direct element access.
sub CBOR::XS::Tagged::value {
    # anything other than "object only" counts as a set request
    if (@_ != 1) {
        $_[0][1] = $_[1];
    }

    return $_[0][1];
}
694    
695 root 1.13 =head2 EXAMPLES
696    
697     Here are some examples of C<CBOR::XS::Tagged> uses to tag objects.
698    
699     You can look up CBOR tag values and meanings in the IANA registry at
700     L<http://www.iana.org/assignments/cbor-tags/cbor-tags.xhtml>.
701    
702     Prepend a magic header (C<$CBOR::XS::MAGIC>):
703    
704     my $cbor = encode_cbor CBOR::XS::tag 55799, $value;
705     # same as:
706     my $cbor = $CBOR::XS::MAGIC . encode_cbor $value;
707    
708     Serialise some URIs and a regex in an array:
709    
710     my $cbor = encode_cbor [
711     (CBOR::XS::tag 32, "http://www.nethype.de/"),
712     (CBOR::XS::tag 32, "http://software.schmorp.de/"),
713     (CBOR::XS::tag 35, "^[Pp][Ee][Rr][lL]\$"),
714     ];
715    
716     Wrap CBOR data in CBOR:
717    
718     my $cbor_cbor = encode_cbor
719     CBOR::XS::tag 24,
720     encode_cbor [1, 2, 3];
721    
722 root 1.19 =head1 TAG HANDLING AND EXTENSIONS
723    
724 root 1.22 This section describes how this module handles specific tagged values
725     and extensions. If a tag is not mentioned here and no additional filters
726     are provided for it, then the default handling applies (creating a
727     CBOR::XS::Tagged object on decoding, and only encoding the tag when
728     explicitly requested).
729 root 1.19
730 root 1.23 Tags not handled specifically are currently converted into a
731     L<CBOR::XS::Tagged> object, which is simply a blessed array reference
732     consisting of the numeric tag value followed by the (decoded) CBOR value.
733    
734 root 1.19 Future versions of this module reserve the right to special case
735 root 1.22 additional tags (such as base64url).
736    
737     =head2 ENFORCED TAGS
738    
739     These tags are always handled when decoding, and their handling cannot be
740     overridden by the user.
741 root 1.19
742     =over 4
743    
744 root 1.26 =item 26 (perl-object, L<http://cbor.schmorp.de/perl-object>)
745 root 1.19
746 root 1.23 These tags are automatically created (and decoded) for serialisable
747     objects using the C<FREEZE/THAW> methods (the L<Types::Serialiser> object
748     serialisation protocol). See L<OBJECT SERIALISATION> for details.
749 root 1.19
750 root 1.31 =item 28, 29 (shareable, sharedref, L<http://cbor.schmorp.de/value-sharing>)
751 root 1.19
752 root 1.31 These tags are automatically decoded when encountered (and they do not
753     result in a cyclic data structure, see C<allow_cycles>), resulting in
754 root 1.19 shared values in the decoded object. They are only encoded, however, when
755 root 1.31 C<allow_sharing> is enabled.
756    
757     Not all shared values can be successfully decoded: values that reference
758     themselves will I<currently> decode as C<undef> (this is not the same
759     as a reference pointing to itself, which will be represented as a value
760     that contains an indirect reference to itself - these will be decoded
761     properly).
762    
763     Note that considerably more shared value data structures can be decoded
764     than will be encoded - currently, only values pointed to by references
765     will be shared, others will not. While non-reference shared values can be
766     generated in Perl with some effort, they were considered too unimportant
767     to be supported in the encoder. The decoder, however, will decode these
768     values as shared values.
769 root 1.19
770 root 1.26 =item 256, 25 (stringref-namespace, stringref, L<http://cbor.schmorp.de/stringref>)
771 root 1.21
772     These tags are automatically decoded when encountered. They are only
773 root 1.25 encoded, however, when C<pack_strings> is enabled.
774 root 1.21
775 root 1.19 =item 22098 (indirection, L<http://cbor.schmorp.de/indirection>)
776    
777     This tag is automatically generated when a reference is encountered (with
778     the exception of hash and array references). It is converted to a reference
779     when decoding.
780    
781     =item 55799 (self-describe CBOR, RFC 7049)
782    
783     This value is not generated on encoding (unless explicitly requested by
784     the user), and is simply ignored when decoding.
785    
786     =back
787    
788 root 1.24 =head2 NON-ENFORCED TAGS
789 root 1.22
790     These tags have default filters provided when decoding. Their handling can
791     be overridden by changing the C<%CBOR::XS::FILTER> entry for the tag, or by
792 root 1.24 providing a custom C<filter> callback when decoding.
793 root 1.22
794     When they result in decoding into a specific Perl class, the module
795     usually provides a corresponding C<TO_CBOR> method as well.
796    
797     When any of these need to load additional modules that are not part of the
798     perl core distribution (e.g. L<URI>), it is (currently) up to the user to
799     provide these modules. The decoding usually fails with an exception if the
800     required module cannot be loaded.
801    
802     =over 4
803    
804 root 1.35 =item 0, 1 (date/time string, seconds since the epoch)
805    
806     These tags are decoded into L<Time::Piece> objects. The corresponding
807     C<Time::Piece::TO_CBOR> method always encodes into tag 1 values currently.
808    
809     The L<Time::Piece> API is generally surprisingly bad, and fractional
810     seconds are only accidentally kept intact, so watch out. On the plus side,
811     the module comes with perl since 5.10, which has to count for something.
812    
813 root 1.22 =item 2, 3 (positive/negative bignum)
814    
815     These tags are decoded into L<Math::BigInt> objects. The corresponding
816     C<Math::BigInt::TO_CBOR> method encodes "small" bigints into normal CBOR
817     integers, and others into positive/negative CBOR bignums.
818    
819     =item 4, 5 (decimal fraction/bigfloat)
820    
821     Both decimal fractions and bigfloats are decoded into L<Math::BigFloat>
822     objects. The corresponding C<Math::BigFloat::TO_CBOR> method I<always>
823     encodes into a decimal fraction.
824    
825     CBOR cannot represent bigfloats with I<very> large exponents - conversion
826     of such big float objects is undefined.
827    
828     Also, NaN and infinities are not encoded properly.
829    
830     =item 21, 22, 23 (expected later JSON conversion)
831    
832     CBOR::XS is not a CBOR-to-JSON converter, and will simply ignore these
833     tags.
834    
835     =item 32 (URI)
836    
837     These objects decode into L<URI> objects. The corresponding
838     C<URI::TO_CBOR> method again results in a CBOR URI value.
839    
840     =back
841    
842     =cut
843    
# Default decoding filters for the non-enforced tags, keyed by CBOR tag
# number. Each filter is called with ($tag, $value) and returns the Perl
# value that replaces the tagged value.
#
# NOTE: a second "our %FILTER = (...)" assignment further down in this file
# (the one that also handles tags 0 and 1) replaces this table when the
# module is loaded.
our %FILTER = (
   # 0 # rfc4287 datetime, utf-8
   # 1 # unix timestamp, any

   2 => sub { # pos bigint
      require Math::BigInt;
      # the extra leading 0 keeps an empty byte string parseable (it means 0)
      Math::BigInt->new ("0x0" . unpack "H*", pop)
   },

   3 => sub { # neg bigint
      require Math::BigInt;
      # RFC 7049, 2.4.2: the value of a negative bignum is -1 - n, not -n
      Math::BigInt->new ("0x0" . unpack "H*", pop)->binc->bneg
   },

   4 => sub { # decimal fraction, array [exponent, mantissa]
      require Math::BigFloat;
      Math::BigFloat->new ($_[1][1] . "E" . $_[1][0])
   },

   5 => sub { # bigfloat, array [exponent, mantissa], base 2
      require Math::BigFloat;
      scalar Math::BigFloat->new ($_[1][1])->blsft ($_[1][0], 2)
   },

   # the JSON conversion hints are dropped, the value is passed through as-is
   21 => sub { pop }, # expected conversion to base64url encoding
   22 => sub { pop }, # expected conversion to base64 encoding
   23 => sub { pop }, # expected conversion to base16 encoding

   # 24 # embedded cbor, byte string

   32 => sub {
      require URI;
      URI->new (pop)
   },

   # 33 # base64url rfc4648, utf-8
   # 34 # base64 rfc4648, utf-8
   # 35 # regex pcre/ecma262, utf-8
   # 36 # mime message rfc2045, utf-8
);
884    
885 root 1.19
886 root 1.7 =head1 CBOR and JSON
887 root 1.1
888 root 1.4 CBOR is supposed to implement a superset of the JSON data model, and is,
889     with some coercion, able to represent all JSON texts (something that other
890     "binary JSON" formats such as BSON generally do not support).
891    
892     CBOR implements some extra hints and support for JSON interoperability,
893     and the spec offers further guidance for conversion between CBOR and
894     JSON. None of this is currently implemented in CBOR::XS, and the guidelines
895     in the spec do not result in correct round-tripping of data. If JSON
896     interoperability is improved in the future, then the goal will be to
897     ensure that decoded JSON data will round-trip encoding and decoding to
898     CBOR intact.
899 root 1.1
900    
901     =head1 SECURITY CONSIDERATIONS
902    
903     When you are using CBOR in a protocol, talking to untrusted potentially
904     hostile creatures requires relatively few measures.
905    
906     First of all, your CBOR decoder should be secure, that is, should not have
907     any buffer overflows. Obviously, this module should ensure that and I am
908     trying hard on making that true, but you never know.
909    
910     Second, you need to avoid resource-starving attacks. That means you should
911     limit the size of CBOR data you accept, or make sure that when your
912     resources run out, that's just fine (e.g. by using a separate process that
913     can crash safely). The size of a CBOR string in octets is usually a good
914     indication of the size of the resources required to decode it into a Perl
915     structure. While CBOR::XS can check the size of the CBOR text, it might be
916     too late when you already have it in memory, so you might want to check
917     the size before you accept the string.
918    
919     Third, CBOR::XS recurses using the C stack when decoding objects and
920     arrays. The C stack is a limited resource: for instance, on my amd64
921     machine with 8MB of stack size I can decode around 180k nested arrays but
922     only 14k nested CBOR objects (due to perl itself recursing deeply on croak
923     to free the temporary). If that is exceeded, the program crashes. To be
924     conservative, the default nesting limit is set to 512. If your process
925     has a smaller stack, you should adjust this setting accordingly with the
926     C<max_depth> method.
927    
928     Something else could bomb you, too, that I forgot to think of. In that
929     case, you get to keep the pieces. I am always open for hints, though...
930    
931     Also keep in mind that CBOR::XS might leak contents of your Perl data
932     structures in its error messages, so when you serialise sensitive
933     information you might want to make sure that exceptions thrown by CBOR::XS
934     will not end up in front of untrusted eyes.
935    
936     =head1 CBOR IMPLEMENTATION NOTES
937    
938     This section contains some random implementation notes. They do not
939     describe guaranteed behaviour, but merely behaviour as-is implemented
940     right now.
941    
942     64 bit integers are only properly decoded when Perl was built with 64 bit
943     support.
944    
945     Strings and arrays are encoded with a definite length. Hashes as well,
946     unless they are tied (or otherwise magical).
947    
948     Only the double data type is supported for NV data types - when Perl uses
949     long double to represent floating point values, they might not be encoded
950     properly. Half precision types are accepted, but not encoded.
951    
952     Strict mode and canonical mode are not implemented.
953    
954    
955 root 1.30 =head1 LIMITATIONS ON PERLS WITHOUT 64-BIT INTEGER SUPPORT
956    
957     On perls that were built without 64 bit integer support (these are rare
958     nowadays, even on 32 bit architectures), support for any kind of 64 bit
959     integer in CBOR is very limited - most likely, these 64 bit values will
960     be truncated, corrupted, or otherwise not decoded correctly. This also
961     includes string, array and map sizes that are stored as 64 bit integers.
962    
963    
964 root 1.1 =head1 THREADS
965    
966     This module is I<not> guaranteed to be thread safe and there are no
967     plans to change this until Perl gets thread support (as opposed to the
968     horribly slow so-called "threads" which are simply slow and bloated
969     process simulations - use fork, it's I<much> faster, cheaper, better).
970    
971     (It might actually work, but you have been warned).
972    
973    
974     =head1 BUGS
975    
976     While the goal of this module is to be correct, that unfortunately does
977     not mean it's bug-free, only that I think its design is bug-free. If you
978     keep reporting bugs they will be fixed swiftly, though.
979    
980     Please refrain from using rt.cpan.org or any other bug reporting
981     service. I put the contact address into my modules for a reason.
982    
983     =cut
984    
# Default decoding filters for the non-enforced tags, keyed by CBOR tag
# number. Each filter is called with ($tag, $value) and returns the Perl
# value that replaces the tagged value; filters die on malformed input.
our %FILTER = (
   0 => sub { # rfc4287 datetime, utf-8
      require Time::Piece;
      # Time::Piece::Strptime uses the "incredibly flexible date parsing routine"
      # from FreeBSD, which can't parse ISO 8601, RFC3339, RFC4287 or much of anything
      # else either. What's incredible over standard strptime totally escapes me.
      # It doesn't do fractional times, either. sigh.
      # In fact, it's all a lie, it uses whatever strptime it wants, and of course,
      # they are all incompatible. The openbsd one simply ignores %z (but according to the
      # docs, it would be much more incredibly flexible indeed. If it worked, that is.).
      # So the fraction and the utc offset are cut off and applied by hand.
      scalar eval {
         my $s = $_[1];

         $s =~ s/Z$/+00:00/;
         $s =~ s/(\.[0-9]+)?([+-])([0-9][0-9]):([0-9][0-9])$//
            or die;

         my $frac   = $1 || 0; # optional fractional seconds
         my $offset = ($3 * 60 + $4) * 60;
         # the sign applies to hours *and* minutes (think -03:30)
         $offset = -$offset if $2 eq "-";

         my $d = Time::Piece->strptime ($s, "%Y-%m-%dT%H:%M:%S");

         # local time minus utc offset is utc
         Time::Piece::gmtime ($d->epoch + $frac - $offset)
      } || die "corrupted CBOR date/time string ($_[1])"; # report the string, not the tag
   },

   1 => sub { # seconds since the epoch, possibly fractional
      require Time::Piece;
      scalar Time::Piece::gmtime (pop)
   },

   2 => sub { # pos bigint
      require Math::BigInt;
      # the extra leading 0 keeps an empty byte string parseable (it means 0)
      Math::BigInt->new ("0x0" . unpack "H*", pop)
   },

   3 => sub { # neg bigint
      require Math::BigInt;
      # RFC 7049, 2.4.2: the value of a negative bignum is -1 - n, not -n
      Math::BigInt->new ("0x0" . unpack "H*", pop)->binc->bneg
   },

   4 => sub { # decimal fraction, array [exponent, mantissa]
      require Math::BigFloat;
      Math::BigFloat->new ($_[1][1] . "E" . $_[1][0])
   },

   5 => sub { # bigfloat, array [exponent, mantissa], base 2
      require Math::BigFloat;
      scalar Math::BigFloat->new ($_[1][1])->blsft ($_[1][0], 2)
   },

   # the JSON conversion hints are dropped, the value is passed through as-is
   21 => sub { pop }, # expected conversion to base64url encoding
   22 => sub { pop }, # expected conversion to base64 encoding
   23 => sub { pop }, # expected conversion to base16 encoding

   # 24 # embedded cbor, byte string

   32 => sub {
      require URI;
      URI->new (pop)
   },

   # 33 # base64url rfc4648, utf-8
   # 34 # base64 rfc4648, utf-8
   # 35 # regex pcre/ecma262, utf-8
   # 36 # mime message rfc2045, utf-8
);
1050    
# Default filter callback: looks up the handler registered in %FILTER for
# the tag number in $_[0] and runs it. Returns nothing (empty list/undef)
# when no handler is registered for that tag.
sub CBOR::XS::default_filter {
   my $handler = $FILTER{$_[0]}
      or return;

   # deliberately called without an argument list, so the handler
   # receives our own @_ (tag, value) unchanged
   &$handler
}
1054    
# Serialises a URI object as a CBOR tag 32 (URI) value wrapping its
# string form.
sub URI::TO_CBOR {
   my ($self) = @_;

   my $text = $self->as_string;
   utf8::upgrade $text; # switch the scalar to perl's internal utf-8 representation

   tag 32, $text
}
1060    
# Serialises a Math::BigInt. Values that fit into a signed 32 bit integer
# are returned as plain perl numbers, everything else becomes a CBOR bignum:
# tag 2 for values >= 0, tag 3 for negative values, each wrapping the
# big-endian magnitude as a byte string.
sub Math::BigInt::TO_CBOR {
   if ($_[0] >= -2147483648 && $_[0] <= 2147483647) {
      $_[0]->numify
   } else {
      my $neg = $_[0] < 0;

      # RFC 7049, 2.4.2: tag 3 stores n where value = -1 - n. Working on the
      # non-negative n also keeps the "-" sign as_hex would emit for a
      # negative number out of the hex string. The copy leaves the object
      # being serialised untouched.
      my $n = $neg ? $_[0]->copy->bneg->bdec : $_[0];

      my $hex = substr $n->as_hex, 2; # strip the leading "0x"
      $hex = "0$hex" if 1 & length $hex; # sigh - pack "H*" wants whole octets
      tag $neg ? 3 : 2, pack "H*", $hex
   }
}
1070    
# Serialises a Math::BigFloat as a CBOR tag 4 (decimal fraction) value,
# i.e. the array [exponent, mantissa] as returned by ->parts.
sub Math::BigFloat::TO_CBOR {
   my ($mantissa, $exponent) = $_[0]->parts;

   # the exponent is passed as a plain number, the mantissa as the object
   # that ->parts returned
   tag 4, [$exponent->numify, $mantissa]
}
1075    
# Serialises a Time::Piece object as a CBOR tag 1 value (seconds since
# the epoch).
sub Time::Piece::TO_CBOR {
   my ($self) = @_;

   tag 1, $self->epoch
}
1079    
1080 root 1.1 XSLoader::load "CBOR::XS", $VERSION;
1081    
1082     =head1 SEE ALSO
1083    
1084     The L<JSON> and L<JSON::XS> modules that do similar, but human-readable,
1085     serialisation.
1086    
1087 root 1.6 The L<Types::Serialiser> module provides the data model for true, false
1088     and error values.
1089    
1090 root 1.1 =head1 AUTHOR
1091    
1092     Marc Lehmann <schmorp@schmorp.de>
1093     http://home.schmorp.de/
1094    
1095     =cut
1096    
1097 root 1.6 1
1098