#!/opt/bin/perl

# Generates lib/AnyEvent/Util/uts46data.pl from the Unicode IDNA mapping
# table - better do not run it!
#
# The generated file assigns two variables:
#
#   $uts46_imap   UTF-8 encoded mapping index: a sequence of
#                 "\x00" . character . replacement records (the replacement
#                 may be empty), followed by one final "\x00".
#   $uts46_valid  bit vector (see vec) in which bit N is set when code point
#                 N has status "valid" or "deviation" in the table.

use common::sense;
use utf8;
no warnings 'utf8';

binmode STDOUT, ":utf8";

my $table_url   = "http://www.unicode.org/Public/idna/13.0.0/IdnaMappingTable.txt";
my $output_path = "lib/AnyEvent/Util/uts46data.pl";

# How each status column value of the table is handled.
my %action_for = (
    disallowed             => "skip",
    disallowed_STD3_valid  => "skip",
    disallowed_STD3_mapped => "skip",
    mapped                 => "map",
    ignored                => "map",
    valid                  => "valid",
    deviation              => "valid", # use non-transitional behaviour for deviation characters
);

# Returns $bytes with every backslash doubled, so that the text can be
# embedded in a q-quoted literal and read back unchanged (inside q//, "\\"
# collapses to "\" and a backslash in front of the delimiter escapes it).
sub q_escape {
    my ($bytes) = @_;

    $bytes =~ s/\\/\\\\/g;

    return $bytes;
}

# Returns a character that does not occur in $bytes and can be written
# directly after "q" as both opening and closing delimiter, or nothing when
# there is no such character.
sub find_q_delimiter {
    my ($bytes) = @_;

    for my $code (33 .. 112, 1 .. 31) { # stay out of utf-8 range, prefer printable things
        my $candidate = chr $code;

        # Unusable right after "q": word characters would turn it into an
        # identifier, whitespace is skipped by the parser, opening brackets
        # need a different closing character, "=" could form "=>", and a
        # backslash would clash with the escaping done by q_escape.
        next if $candidate =~ /[\w\x09-\x0d\\(\[<{=]/;

        return $candidate if 0 > index $bytes, $candidate;
    }

    return;
}

# --- read and parse the mapping table --------------------------------------

# List-form pipe open: no shell is involved.
# NOTE(review): GET is presumably the lwp-request command line tool - confirm.
open my $in, "-|", "GET", $table_url
    or die "cannot run GET $table_url: $!";

my $valid;
my $imap; # index map \x00 char replacement

while (my $line = <$in>) {
    next unless $line =~ /^[0-9A-F]/;

    $line =~ /^
        (?<first> [0-9A-F]{4,} )                   # first (or only) code point
        (?: \.\. (?<last> [0-9A-F]{4,} ) )?        # optional end of range
        \s*;\s* (?<type> \S+ )                     # status
        (?: \s*;\s* (?<map> [0-9A-F ]+? ) )?       # optional replacement code points
        (?: \s*;[^;]+ )?                           # optional extra column, not used
        \s*
        (?: \#.* )?                                # trailing comment
    $/x
        or die "$line: unparsable";

    # Copy the captures right away, before any other match can replace them.
    my $first   = hex $+{first};
    my $last    = defined $+{last} ? hex $+{last} : 0;
    my $type    = $+{type};
    my $map_hex = defined $+{map} ? $+{map} : "";

    my $range_end   = $last || $first;
    my $replacement = join "", map { chr hex $_ } split ' ', $map_hex;
    my $action      = $action_for{$type};

    if (!defined $action) {
        my ($R1, $R2) = (chr $first, chr $last);
        die "default: $R1,$R2,$type,$replacement;\n";
    }
    elsif ($action eq "map") {
        $imap .= "\x00" . chr($_) . $replacement
            for $first .. $range_end;
    }
    elsif ($action eq "valid") {
        vec($valid, $_, 1) = 1
            for $first .. $range_end;
    }
    # "skip": disallowed code points are recorded nowhere
}

# Closing a pipe also reports the exit status of the command; without this
# check a failed download would silently produce empty tables.
close $in
    or die $! ? "error reading from GET: $!" : "GET failed with wait status $?";

(defined $imap && defined $valid)
    or die "no mapping data found in $table_url";

# --- build the output before touching the target file ----------------------

utf8::encode $imap;

0 > index $imap, "\x02" # it's not supposed to be anywhere in there
    or die "imap contains \\x02";

# try to find a valid quoting character - there usually are many legal combos
my $q = find_q_delimiter($valid);
defined $q
    or die "unable to found valid quoting character";

# primitive compression: a long run of identical bytes is replaced by closing
# the literal, concatenating a repetition expression and opening a new
# q-literal with the same delimiter.
my $valid_literal = q_escape($valid);
$valid_literal =~ s/(\x00{32,})/"$q.(\"\x00\"x" . (length $1) . ").q$q"/ge;
$valid_literal =~ s/(\xff{32,})/"$q.(\"\xff\"x" . (length $1) . ").q$q"/ge;

# --- write the generated file ----------------------------------------------

open my $out, ">", $output_path
    or die "cannot write $output_path: $!";
binmode $out, ":perlio";

print {$out} "# autogenerated by util/gen_uts46data\n";
print {$out} "\$uts46_imap = q\x02", q_escape("$imap\x00"), "\x02;\n";
print {$out} "\$uts46_valid = q$q$valid_literal$q;\n";
print {$out} "1;\n";

# Buffered write errors only show up on close.
close $out
    or die "cannot close $output_path: $!";