ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/AnyEvent/lib/AnyEvent/Util.pm
Revision: 1.156
Committed: Thu Mar 12 13:43:19 2020 UTC (4 years, 8 months ago) by root
Branch: MAIN
CVS Tags: HEAD
Changes since 1.155: +1 -1 lines
Log Message:
*** empty log message ***

File Contents

# Content
1 =encoding utf-8
2
3 =head1 NAME
4
5 AnyEvent::Util - various utility functions.
6
7 =head1 SYNOPSIS
8
9 use AnyEvent::Util;
10
11 =head1 DESCRIPTION
12
13 This module implements various utility functions, mostly replacing
14 well-known functions by event-ised counterparts.
15
16 All functions documented without C<AnyEvent::Util::> prefix are exported
17 by default.
18
19 =over 4
20
21 =cut
22
23 package AnyEvent::Util;
24
25 use Carp ();
26 use Errno ();
27 use Socket ();
28
29 use AnyEvent (); BEGIN { AnyEvent::common_sense }
30
31 use base 'Exporter';
32
33 our @EXPORT = qw(fh_nonblocking guard fork_call portable_pipe portable_socketpair run_cmd);
34 our @EXPORT_OK = qw(
35 AF_INET6 WSAEWOULDBLOCK WSAEINPROGRESS WSAEINVAL
36 close_all_fds_except
37 punycode_encode punycode_decode idn_nameprep idn_to_ascii idn_to_unicode
38 );
39
40 our $VERSION = $AnyEvent::VERSION;
41
42 BEGIN {
43 # provide us with AF_INET6, but only if allowed
44 if (
45 $AnyEvent::PROTOCOL{ipv6}
46 && _AF_INET6
47 && socket my $ipv6_socket, _AF_INET6, Socket::SOCK_DGRAM(), 0 # check if they can be created
48 ) {
49 *AF_INET6 = \&_AF_INET6;
50 } else {
51 # disable ipv6
52 *AF_INET6 = sub () { 0 };
53 delete $AnyEvent::PROTOCOL{ipv6};
54 }
55
56 # fix buggy Errno on some non-POSIX platforms
57 # such as openbsd and windows.
58 my %ERR = (
59 EBADMSG => Errno::EDOM (),
60 EPROTO => Errno::ESPIPE (),
61 );
62
63 while (my ($k, $v) = each %ERR) {
64 next if eval "Errno::$k ()";
65 AE::log 8 => "Broken Errno module, adding Errno::$k.";
66
67 eval "sub Errno::$k () { $v }";
68 push @Errno::EXPORT_OK, $k;
69 push @{ $Errno::EXPORT_TAGS{POSIX} }, $k;
70 }
71 }
72
73 =item ($r, $w) = portable_pipe
74
75 Calling C<pipe> in Perl is portable - except it doesn't really work on
76 sucky windows platforms (at least not with most perls - cygwin's perl
77 notably works fine): On windows, you actually get two file handles you
78 cannot use select on.
79
80 This function gives you a pipe that actually works even on the broken
81 windows platform (by creating a pair of TCP sockets on windows, so do not
82 expect any speed from that) and using C<pipe> everywhere else.
83
84 See C<portable_socketpair>, below, for a bidirectional "pipe".
85
86 Returns the empty list on any errors.
87
88 =item ($fh1, $fh2) = portable_socketpair
89
90 Just like C<portable_pipe>, above, but returns a bidirectional pipe
91 (usually by calling C<socketpair> to create a local loopback socket pair,
92 except on windows, where it again returns two interconnected TCP sockets).
93
94 Returns the empty list on any errors.
95
96 =cut
97
98 BEGIN {
99 if (AnyEvent::WIN32) {
100 *_win32_socketpair = sub () {
101 # perl's socketpair emulation fails on many vista machines, because
102 # vista returns fantasy port numbers.
103
104 for (1..10) {
105 socket my $l, Socket::AF_INET(), Socket::SOCK_STREAM(), 0
106 or next;
107
108 bind $l, Socket::pack_sockaddr_in 0, "\x7f\x00\x00\x01"
109 or next;
110
111 my $sa = getsockname $l
112 or next;
113
114 listen $l, 1
115 or next;
116
117 socket my $r, Socket::AF_INET(), Socket::SOCK_STREAM(), 0
118 or next;
119
120 bind $r, Socket::pack_sockaddr_in 0, "\x7f\x00\x00\x01"
121 or next;
122
123 connect $r, $sa
124 or next;
125
126 accept my $w, $l
127 or next;
128
129 # vista has completely broken peername/sockname that return
130 # fantasy ports. this combo seems to work, though.
131 (Socket::unpack_sockaddr_in getpeername $r)[0]
132 == (Socket::unpack_sockaddr_in getsockname $w)[0]
133 or (($! = WSAEINVAL), next);
134
135 # vista example (you can't make this shit up...):
136 #(Socket::unpack_sockaddr_in getsockname $r)[0] == 53364
137 #(Socket::unpack_sockaddr_in getpeername $r)[0] == 53363
138 #(Socket::unpack_sockaddr_in getsockname $w)[0] == 53363
139 #(Socket::unpack_sockaddr_in getpeername $w)[0] == 53365
140
141 return ($r, $w);
142 }
143
144 ()
145 };
146
147 *portable_socketpair = \&_win32_socketpair;
148 *portable_pipe = \&_win32_socketpair;
149 } else {
150 *portable_pipe = sub () {
151 my ($r, $w);
152
153 pipe $r, $w
154 or return;
155
156 ($r, $w);
157 };
158
159 *portable_socketpair = sub () {
160 socketpair my $fh1, my $fh2, Socket::AF_UNIX(), Socket::SOCK_STREAM(), 0
161 or return;
162
163 ($fh1, $fh2)
164 };
165 }
166 }
167
168 =item fork_call { CODE } @args, $cb->(@res)
169
170 Executes the given code block asynchronously, by forking. Everything the
171 block returns will be transferred to the calling process (by serialising and
172 deserialising via L<Storable>).
173
174 If there are any errors, then the C<$cb> will be called without any
175 arguments. In that case, either C<$@> contains the exception (and C<$!> is
176 irrelevant), or C<$!> contains an error number. In all other cases, C<$@>
177 will be C<undef>ined.
178
179 The code block must not ever call an event-polling function or use
180 event-based programming that might cause any callbacks registered in the
181 parent to run.
182
183 Win32 spoilers: Due to the endlessly sucky and broken native windows
184 perls (there is no way to cleanly exit a child process on that platform
185 that doesn't also kill the parent), you have to make sure that your main
186 program doesn't exit as long as any C<fork_calls> are still in progress,
187 otherwise the program won't exit. Also, on most windows platforms some
188 memory will leak for every invocation. We are open for improvements that
189 don't require XS hackery.
190
191 Note that forking can be expensive in large programs (RSS 200MB+). On
192 windows, it is abysmally slow, do not expect more than 5..20 forks/s on
193 that sucky platform (note this uses perl's pseudo-threads, so avoid those
194 like the plague).
195
196 Example: poor man's async disk I/O (better use L<AnyEvent::IO> together
197 with L<IO::AIO>).
198
199 fork_call {
200 open my $fh, "</etc/passwd"
201 or die "passwd: $!";
202 local $/;
203 <$fh>
204 } sub {
205 my ($passwd) = @_;
206 ...
207 };
208
209 =item $AnyEvent::Util::MAX_FORKS [default: 10]
210
211 The maximum number of child processes that C<fork_call> will fork in
212 parallel. Any additional requests will be queued until a slot becomes free
213 again.
214
215 The environment variable C<PERL_ANYEVENT_MAX_FORKS> is used to initialise
216 this value.
217
218 =cut
219
220 our $MAX_FORKS = int 1 * $ENV{PERL_ANYEVENT_MAX_FORKS};
221 $MAX_FORKS = 10 if $MAX_FORKS <= 0;
222
223 my $forks;
224 my @fork_queue;
225
226 sub _fork_schedule;
227 sub _fork_schedule {
228 require Storable unless $Storable::VERSION;
229 require POSIX unless $POSIX::VERSION;
230
231 while ($forks < $MAX_FORKS) {
232 my $job = shift @fork_queue
233 or last;
234
235 ++$forks;
236
237 my $coderef = shift @$job;
238 my $cb = pop @$job;
239
240 # gimme a break...
241 my ($r, $w) = portable_pipe
242 or ($forks and last) # allow failures when we have at least one job
243 or die "fork_call: $!";
244
245 my $pid = fork;
246
247 if ($pid != 0) {
248 # parent
249 close $w;
250
251 my $buf;
252
253 my $ww; $ww = AE::io $r, 0, sub {
254 my $len = sysread $r, $buf, 65536, length $buf;
255
256 return unless defined $len or $! != Errno::EINTR;
257
258 if (!$len) {
259 undef $ww;
260 close $r;
261 --$forks;
262 _fork_schedule;
263
264 my $result = eval { Storable::thaw ($buf) };
265 $result = [$@] unless $result;
266 $@ = shift @$result;
267
268 $cb->(@$result);
269
270 # work around the endlessly broken windows perls
271 kill 9, $pid if AnyEvent::WIN32;
272
273 # clean up the pid
274 waitpid $pid, 0;
275 }
276 };
277
278 } elsif (defined $pid) {
279 # child
280 close $r;
281
282 my $result = eval {
283 local $SIG{__DIE__};
284
285 Storable::freeze ([undef, $coderef->(@$job)])
286 };
287
288 $result = Storable::freeze (["$@"])
289 if $@;
290
291 # windows forces us to these contortions
292 my $ofs;
293
294 while () {
295 my $len = (length $result) - $ofs
296 or last;
297
298 $len = syswrite $w, $result, $len < 65536 ? $len : 65536, $ofs;
299
300 last unless $len || (!defined $len && $! == Errno::EINTR);
301
302 $ofs += $len;
303 }
304
305 # on native windows, _exit KILLS YOUR FORKED CHILDREN!
306 if (AnyEvent::WIN32) {
307 shutdown $w, 1; # signal parent to please kill us
308 sleep 10; # give parent a chance to clean up
309 sysread $w, (my $buf), 1; # this *might* detect the parent exiting in some cases.
310 }
311 POSIX::_exit (0);
312 exit 1;
313
314 } elsif (($! != &Errno::EAGAIN && $! != &Errno::EWOULDBLOCK && $! != &Errno::ENOMEM) || !$forks) {
315 # we ignore some errors as long as we can run at least one job
316 # maybe we should wait a few seconds and retry instead
317 die "fork_call: $!";
318 }
319 }
320 }
321
322 sub fork_call(&@) {
323 push @fork_queue, [@_];
324 _fork_schedule;
325 }
326
327 END {
328 if (AnyEvent::WIN32) {
329 while ($forks) {
330 @fork_queue = ();
331 AnyEvent->one_event;
332 }
333 }
334 }
335
336 # to be removed
337 sub dotted_quad($) {
338 $_[0] =~ /^(?:25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[0-9][0-9]?)
339 \.(?:25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[0-9][0-9]?)
340 \.(?:25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[0-9][0-9]?)
341 \.(?:25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[0-9][0-9]?)$/x
342 }
343
344 # just a forwarder
345 sub inet_aton {
346 require AnyEvent::Socket;
347 *inet_aton = \&AnyEvent::Socket::inet_aton;
348 goto &inet_aton
349 }
350
351 =item fh_nonblocking $fh, $nonblocking
352
353 Sets the blocking state of the given filehandle (true == nonblocking,
354 false == blocking). Uses fcntl on anything sensible and ioctl FIONBIO on
355 broken (i.e. windows) platforms.
356
357 Instead of using this function, you could use C<AnyEvent::fh_block> or
358 C<AnyEvent::fh_unblock>.
359
360 =cut
361
362 BEGIN {
363 *fh_nonblocking = \&AnyEvent::_fh_nonblocking;
364 }
365
366 =item $guard = guard { CODE }
367
368 This function creates a special object that, when destroyed, will execute
369 the code block.
370
371 This is often handy in continuation-passing style code to clean up some
372 resource regardless of where you break out of a process.
373
374 The L<Guard> module will be used to implement this function, if it is
375 available. Otherwise a pure-perl implementation is used.
376
377 While the code is allowed to throw exceptions in unusual conditions, it is
378 not defined whether this exception will be reported (at the moment, the
379 Guard module and AnyEvent's pure-perl implementation both try to report
380 the error and continue).
381
382 You can call one method on the returned object:
383
384 =item $guard->cancel
385
386 This simply causes the code block not to be invoked: it "cancels" the
387 guard.
388
389 =cut
390
391 BEGIN {
392 if (!$ENV{PERL_ANYEVENT_AVOID_GUARD} && eval { require Guard; $Guard::VERSION >= 0.5 }) {
393 *guard = \&Guard::guard;
394 AE::log 8 => "Using Guard module to implement guards.";
395 } else {
396 *AnyEvent::Util::guard::DESTROY = sub {
397 local $@;
398
399 eval {
400 local $SIG{__DIE__};
401 ${$_[0]}->();
402 };
403
404 AE::log 4 => "Runtime error in AnyEvent::guard callback: $@" if $@;
405 };
406
407 *AnyEvent::Util::guard::cancel = sub ($) {
408 ${$_[0]} = sub { };
409 };
410
411 *guard = sub (&) {
412 bless \(my $cb = shift), "AnyEvent::Util::guard"
413 };
414
415 AE::log 8 => "Using pure-perl guard implementation.";
416 }
417 }
418
419 =item AnyEvent::Util::close_all_fds_except @fds
420
421 This rarely-used function simply closes all file descriptors (or tries to)
422 of the current process except the ones given as arguments.
423
424 When you want to start a long-running background server, then it is often
425 beneficial to do this, as too many C-libraries are too stupid to mark
426 their internal fd's as close-on-exec.
427
428 The function expects to be called shortly before an C<exec> call.
429
430 Example: close all fds except 0, 1, 2.
431
432 close_all_fds_except 0, 2, 1;
433
434 =cut
435
436 sub close_all_fds_except {
437 my %except; @except{@_} = ();
438
439 require POSIX unless $POSIX::VERSION;
440
441 # some OSes have a usable /dev/fd, sadly, very few
442 if ($^O =~ /(freebsd|cygwin|linux)/) {
443 # netbsd, openbsd, solaris have a broken /dev/fd
444 my $dir;
445 if (opendir $dir, "/dev/fd" or opendir $dir, "/proc/self/fd") {
446 my @fds = sort { $a <=> $b } grep /^\d+$/, readdir $dir;
447 # broken OS's have device nodes for 0..63 usually, solaris 0..255
448 if (@fds < 20 or "@fds" ne join " ", 0..$#fds) {
449 # assume the fds array is valid now
450 exists $except{$_} or POSIX::close ($_)
451 for @fds;
452 return;
453 }
454 }
455 }
456
457 my $fd_max = eval { POSIX::sysconf (POSIX::_SC_OPEN_MAX ()) - 1 } || 1023;
458
459 exists $except{$_} or POSIX::close ($_)
460 for 0..$fd_max;
461 }
462
463 =item $cv = run_cmd $cmd, key => value...
464
465 Run a given external command, potentially redirecting file descriptors and
466 return a condition variable that gets sent the exit status (like C<$?>)
467 when the program exits I<and> all redirected file descriptors have been
468 exhausted.
469
470 The C<$cmd> is either a single string, which is then passed to a shell, or
471 an arrayref, which is passed to the C<execvp> function (the first array
472 element is used both for the executable name and argv[0]).
473
474 The key-value pairs can be:
475
476 =over 4
477
478 =item ">" => $filename
479
480 Redirects program standard output into the specified filename, similar to C<<
481 >filename >> in the shell.
482
483 =item ">" => \$data
484
485 Appends program standard output to the referenced scalar. The condvar will
486 not be signalled before EOF or an error is signalled.
487
488 Specifying the same scalar in multiple ">" pairs is allowed, e.g. to
489 redirect both stdout and stderr into the same scalar:
490
491 ">" => \$output,
492 "2>" => \$output,
493
494 =item ">" => $filehandle
495
496 Redirects program standard output to the given filehandle (or actually its
497 underlying file descriptor).
498
499 =item ">" => $callback->($data)
500
501 Calls the given callback each time standard output receives some data,
502 passing it the data received. On EOF or error, the callback will be
503 invoked once without any arguments.
504
505 The condvar will not be signalled before EOF or an error is signalled.
506
507 =item "fd>" => $see_above
508
509 Like ">", but redirects the specified fd number instead.
510
511 =item "<" => $see_above
512
513 The same, but redirects the program's standard input instead. The same
514 forms as for ">" are allowed.
515
516 In the callback form, the callback is supposed to return data to be
517 written, or the empty list or C<undef> or a zero-length scalar to signal
518 EOF.
519
520 Similarly, either the write data must be exhausted or an error is to be
521 signalled before the condvar is signalled, for both string-reference and
522 callback forms.
523
524 =item "fd<" => $see_above
525
526 Like "<", but redirects the specified file descriptor instead.
527
528 =item on_prepare => $cb
529
530 Specify a callback that is executed just before the command is C<exec>'ed,
531 in the child process. Be careful not to use any event handling or other
532 services not available in the child.
533
534 This can be useful to set up the environment in special ways, such as
535 changing the priority of the command or manipulating signal handlers (e.g.
536 setting C<SIGINT> to C<IGNORE>).
537
538 =item close_all => $boolean
539
540 When C<close_all> is enabled (default is disabled), then all extra file
541 descriptors will be closed, except the ones that were redirected and C<0>,
542 C<1> and C<2>.
543
544 See C<close_all_fds_except> for more details.
545
546 =item '$$' => \$pid
547
548 A reference to a scalar which will receive the PID of the newly-created
549 subprocess after C<run_cmd> returns.
550
551 Note the the PID might already have been recycled and used by an unrelated
552 process at the time C<run_cmd> returns, so it's not useful to send
553 signals, or for use as a unique key in data structures and so on.
554
555 =back
556
557 Example: run C<rm -rf />, redirecting standard input, output and error to
558 F</dev/null>.
559
560 my $cv = run_cmd [qw(rm -rf /)],
561 "<", "/dev/null",
562 ">", "/dev/null",
563 "2>", "/dev/null";
564 $cv->recv and die "d'oh! something survived!"
565
566 Example: run F<openssl> and create a self-signed certificate and key,
567 storing them in C<$cert> and C<$key>. When finished, check the exit status
568 in the callback and print key and certificate.
569
570 my $cv = run_cmd [qw(openssl req
571 -new -nodes -x509 -days 3650
572 -newkey rsa:2048 -keyout /dev/fd/3
573 -batch -subj /CN=AnyEvent
574 )],
575 "<", "/dev/null",
576 ">" , \my $cert,
577 "3>", \my $key,
578 "2>", "/dev/null";
579
580 $cv->cb (sub {
581 shift->recv and die "openssl failed";
582
583 print "$key\n$cert\n";
584 });
585
586 =cut
587
588 sub run_cmd {
589 my $cmd = shift;
590
591 require POSIX unless $POSIX::VERSION;
592
593 my $cv = AE::cv;
594
595 my %arg;
596 my %redir;
597 my @exe;
598
599 while (@_) {
600 my ($type, $ob) = splice @_, 0, 2;
601
602 my $fd = $type =~ s/^(\d+)// ? $1 : undef;
603
604 if ($type eq ">") {
605 $fd = 1 unless defined $fd;
606
607 if (defined eval { fileno $ob }) {
608 $redir{$fd} = $ob;
609 } elsif (ref $ob) {
610 my ($pr, $pw) = AnyEvent::Util::portable_pipe;
611 $cv->begin;
612
613 fcntl $pr, AnyEvent::F_SETFD, AnyEvent::FD_CLOEXEC;
614 fh_nonblocking $pr, 1;
615 my $w; $w = AE::io $pr, 0,
616 "SCALAR" eq ref $ob
617 ? sub {
618 defined (sysread $pr, $$ob, 16384, length $$ob
619 and return)
620 or ($! == Errno::EINTR and return);
621 undef $w; $cv->end;
622 }
623 : sub {
624 my $buf;
625 defined (sysread $pr, $buf, 16384
626 and return $ob->($buf))
627 or ($! == Errno::EINTR and return);
628 undef $w; $cv->end;
629 $ob->();
630 }
631 ;
632 $redir{$fd} = $pw;
633 } else {
634 push @exe, sub {
635 open my $fh, ">", $ob
636 or POSIX::_exit (125);
637 $redir{$fd} = $fh;
638 };
639 }
640
641 } elsif ($type eq "<") {
642 $fd = 0 unless defined $fd;
643
644 if (defined eval { fileno $ob }) {
645 $redir{$fd} = $ob;
646 } elsif (ref $ob) {
647 my ($pr, $pw) = AnyEvent::Util::portable_pipe;
648 $cv->begin;
649
650 my $data;
651 if ("SCALAR" eq ref $ob) {
652 $data = $$ob;
653 $ob = sub { };
654 } else {
655 $data = $ob->();
656 }
657
658 fcntl $pw, AnyEvent::F_SETFD, AnyEvent::FD_CLOEXEC;
659 fh_nonblocking $pw, 1;
660 my $w; $w = AE::io $pw, 1, sub {
661 my $len = syswrite $pw, $data;
662
663 return unless defined $len or $! != Errno::EINTR;
664
665 if (!$len) {
666 undef $w; $cv->end;
667 } else {
668 substr $data, 0, $len, "";
669 unless (length $data) {
670 $data = $ob->();
671 unless (length $data) {
672 undef $w; $cv->end
673 }
674 }
675 }
676 };
677
678 $redir{$fd} = $pr;
679 } else {
680 push @exe, sub {
681 open my $fh, "<", $ob
682 or POSIX::_exit (125);
683 $redir{$fd} = $fh;
684 };
685 }
686
687 } else {
688 $arg{$type} = $ob;
689 }
690 }
691
692 my $pid = fork;
693
694 defined $pid
695 or Carp::croak "fork: $!";
696
697 unless ($pid) {
698 # step 1, execute
699 $_->() for @exe;
700
701 # step 2, move any existing fd's out of the way
702 # this also ensures that dup2 is never called with fd1==fd2
703 # so the cloexec flag is always cleared
704 my (@oldfh, @close);
705 for my $fh (values %redir) {
706 push @oldfh, $fh; # make sure we keep it open
707 $fh = fileno $fh; # we only want the fd
708
709 # dup if we are in the way
710 # if we "leak" fds here, they will be dup2'ed over later
711 defined ($fh = POSIX::dup ($fh)) or POSIX::_exit (124)
712 while exists $redir{$fh};
713 }
714
715 # step 3, execute redirects
716 while (my ($k, $v) = each %redir) {
717 defined POSIX::dup2 ($v, $k)
718 or POSIX::_exit (123);
719 }
720
721 # step 4, close everything else, except 0, 1, 2
722 if ($arg{close_all}) {
723 close_all_fds_except 0, 1, 2, keys %redir
724 } else {
725 POSIX::close ($_)
726 for values %redir;
727 }
728
729 eval { $arg{on_prepare}(); 1 } or POSIX::_exit (123)
730 if exists $arg{on_prepare};
731
732 ref $cmd
733 ? exec {$cmd->[0]} @$cmd
734 : exec $cmd;
735
736 POSIX::_exit (126);
737 }
738
739 ${$arg{'$$'}} = $pid
740 if $arg{'$$'};
741
742 %redir = (); # close child side of the fds
743
744 my $status;
745 $cv->begin (sub { shift->send ($status) });
746 my $cw; $cw = AE::child $pid, sub {
747 $status = $_[1];
748 undef $cw; $cv->end;
749 };
750
751 $cv
752 }
753
754 =item AnyEvent::Util::punycode_encode $string
755
756 Punycode-encodes the given C<$string> and returns its punycode form. Note
757 that uppercase letters are I<not> casefolded - you have to do that
758 yourself.
759
760 Croaks when it cannot encode the string.
761
762 =item AnyEvent::Util::punycode_decode $string
763
764 Tries to punycode-decode the given C<$string> and return its unicode
765 form. Again, uppercase letters are not casefoled, you have to do that
766 yourself.
767
768 Croaks when it cannot decode the string.
769
770 =cut
771
772 sub punycode_encode($) {
773 require "AnyEvent/Util/idna.pl";
774 goto &punycode_encode;
775 }
776
777 sub punycode_decode($) {
778 require "AnyEvent/Util/idna.pl";
779 goto &punycode_decode;
780 }
781
782 =item AnyEvent::Util::idn_nameprep $idn[, $display]
783
784 Implements the IDNA nameprep normalisation algorithm. Or actually the
785 UTS#46 algorithm. Or maybe something similar - reality is complicated
786 between IDNA2003, UTS#46 and IDNA2008. If C<$display> is true then the name
787 is prepared for display, otherwise it is prepared for lookup (default).
788
789 If you have no clue what this means, look at C<idn_to_ascii> instead.
790
791 This function is designed to avoid using a lot of resources - it uses
792 about 1MB of RAM (most of this due to Unicode::Normalize). Also, names
793 that are already "simple" will only be checked for basic validity, without
794 the overhead of full nameprep processing.
795
796 =cut
797
798 our ($uts46_valid, $uts46_imap);
799
800 sub idn_nameprep($;$) {
801 local $_ = $_[0];
802
803 # lowercasing these should always be valid, and is required for xn-- detection
804 y/A-Z/a-z/;
805
806 if (/[^0-9a-z\-.]/) {
807 # load the mapping data
808 unless (defined $uts46_imap) {
809 require Unicode::Normalize;
810 require "AnyEvent/Util/uts46data.pl";
811 }
812
813 # uts46 nameprep
814
815 # I naively tried to use a regex/transliterate approach first,
816 # with one regex and one y///, but the compiled code was 4.5MB.
817 # this version has a bit-table for the valid class, and
818 # a char-replacement search string
819
820 # for speed (cough) reasons, we skip-case 0-9a-z, -, ., which
821 # really ought to be trivially valid. A-Z is valid, but already lowercased.
822 s{
823 ([^0-9a-z\-.])
824 }{
825 my $chr = $1;
826 unless (vec $uts46_valid, ord $chr, 1) {
827 # not in valid class, search for mapping
828 utf8::encode $chr; # the imap table is in utf-8
829 (my $rep = index $uts46_imap, "\x00$chr") >= 0
830 or Carp::croak "$_[0]: disallowed characters (U+" . (unpack "H*", $chr) . ") during idn_nameprep";
831
832 (substr $uts46_imap, $rep, 128) =~ /\x00 .[\x80-\xbf]* ([^\x00]*) \x00/x
833 or die "FATAL: idn_nameprep imap table has unexpected contents";
834
835 $rep = $1;
836 $chr = $rep unless $rep =~ s/^\x01// && $_[1]; # replace unless deviation and display
837 utf8::decode $chr;
838 }
839 $chr
840 }gex;
841
842 # KC
843 $_ = Unicode::Normalize::NFKC ($_);
844 }
845
846 # decode punycode components, check for invalid xx-- prefixes
847 s{
848 (^|\.)(..)--([^\.]*)
849 }{
850 my ($pfx, $ace, $pc) = ($1, $2, $3);
851
852 if ($ace eq "xn") {
853 $pc = punycode_decode $pc; # will croak on error (we hope :)
854
855 require Unicode::Normalize;
856 $pc eq Unicode::Normalize::NFC ($pc)
857 or Carp::croak "$_[0]: punycode label not in NFC detected during idn_nameprep";
858
859 "$pfx$pc"
860 } elsif ($ace !~ /^[a-z0-9]{2}$/) {
861 "$pfx$ace--$pc"
862 } else {
863 Carp::croak "$_[0]: hyphens in 3rd/4th position of a label are not allowed";
864 }
865 }gex;
866
867 # uts46 verification
868 /\.-|-\./
869 and Carp::croak "$_[0]: invalid hyphens detected during idn_nameprep";
870
871 # missing: label begin with combining mark, idna2008 bidi
872
873 # now check validity of each codepoint
874 if (/[^0-9a-z\-.]/) {
875 # load the mapping data
876 unless (defined $uts46_imap) {
877 require "AnyEvent/Util/uts46data.pl";
878 }
879
880 vec $uts46_valid, ord, 1
881 or $_[1] && 0 <= index $uts46_imap, pack "C0U*", 0, ord, 1 # deviation == \x00$chr\x01
882 or Carp::croak "$_[0]: disallowed characters during idn_nameprep"
883 for split //;
884 }
885
886 $_
887 }
888
889 =item $domainname = AnyEvent::Util::idn_to_ascii $idn
890
891 Converts the given unicode string (C<$idn>, international domain name,
892 e.g. 日本語。JP) to a pure-ASCII domain name (this is usually
893 called the "IDN ToAscii" transform). This transformation is idempotent,
894 which means you can call it just in case and it will do the right thing.
895
896 Unlike some other "ToAscii" implementations, this one works on full domain
897 names and should never fail - if it cannot convert the name, then it will
898 return it unchanged.
899
900 This function is an amalgam of IDNA2003, UTS#46 and IDNA2008 - it tries to
901 be reasonably compatible to other implementations, reasonably secure, as
902 much as IDNs can be secure, and reasonably efficient when confronted with
903 IDNs that are already valid DNS names.
904
905 =cut
906
907 sub idn_to_ascii($) {
908 return $_[0]
909 unless $_[0] =~ /[^\x00-\x7f]/;
910
911 my @output;
912
913 eval {
914 # punycode by label
915 for (split /\./, (idn_nameprep $_[0]), -1) {
916 if (/[^\x00-\x7f]/) {
917 eval {
918 push @output, "xn--" . punycode_encode $_;
919 1;
920 } or do {
921 push @output, $_;
922 };
923 } else {
924 push @output, $_;
925 }
926 }
927
928 1
929 } or return $_[0];
930
931 shift @output
932 while !length $output[0] && @output > 1;
933
934 join ".", @output
935 }
936
937 =item $idn = AnyEvent::Util::idn_to_unicode $idn
938
939 Converts the given unicode string (C<$idn>, international domain name,
940 e.g. 日本語。JP, www.deliantra.net, www.xn--l-0ga.de) to
941 unicode form (this is usually called the "IDN ToUnicode" transform). This
942 transformation is idempotent, which means you can call it just in case and
943 it will do the right thing.
944
945 Unlike some other "ToUnicode" implementations, this one works on full
946 domain names and should never fail - if it cannot convert the name, then
947 it will return it unchanged.
948
949 This function is an amalgam of IDNA2003, UTS#46 and IDNA2008 - it tries to
950 be reasonably compatible to other implementations, reasonably secure, as
951 much as IDNs can be secure, and reasonably efficient when confronted with
952 IDNs that are already valid DNS names.
953
954 At the moment, this function simply calls C<idn_nameprep $idn, 1>,
955 returning its argument when that function fails.
956
957 =cut
958
959 sub idn_to_unicode($) {
960 my $res = eval { idn_nameprep $_[0], 1 };
961 defined $res ? $res : $_[0]
962 }
963
964 =back
965
966 =head1 AUTHOR
967
968 Marc Lehmann <schmorp@schmorp.de>
969 http://anyevent.schmorp.de
970
971 =cut
972
973 1
974