| 1 |
#!/opt/bin/perl |
| 2 |
|
| 3 |
# creates lib/AnyEvent/Util/uts46.pl - better do not run it! |
| 4 |
|
| 5 |
use common::sense; |
| 6 |
use utf8; |
| 7 |
no warnings 'utf8'; |
| 8 |
|
| 9 |
binmode STDOUT, ":utf8"; |
| 10 |
|
| 11 |
open my $fh, "GET http://www.unicode.org/Public/idna/13.0.0/IdnaMappingTable.txt |" |
| 12 |
or die; |
| 13 |
|
| 14 |
my $valid; |
| 15 |
my $imap; # index map \x00 char replacement |
| 16 |
|
| 17 |
while (<$fh>) { |
| 18 |
next unless /^[0-9A-F]/; |
| 19 |
|
| 20 |
/^ |
| 21 |
([0-9A-F]{4,}) (?: \.\.([0-9A-F]{4,}) )? |
| 22 |
\s*;\s*(\S+) |
| 23 |
(?: \s*;\s*([0-9A-F ]+?) )? |
| 24 |
(?: \s*;[^;]+ )? |
| 25 |
\s* |
| 26 |
(?: \#.* )? |
| 27 |
$ |
| 28 |
/x or die "$_: unparsable"; |
| 29 |
|
| 30 |
my ($r1, $r2, $type, $map) = (hex $1, hex $2, $3, $4); |
| 31 |
|
| 32 |
my $R1 = chr $r1; |
| 33 |
my $R2 = chr $r2; |
| 34 |
|
| 35 |
$map = join "", map chr hex, split ' ', $map; |
| 36 |
|
| 37 |
$type = "valid" if $type eq "deviation"; # use non-transitional behaviour for deviation characters |
| 38 |
|
| 39 |
given ($type) { |
| 40 |
when (/^(?:disallowed|disallowed_STD3_valid|disallowed_STD3_mapped)$/) { |
| 41 |
# nop |
| 42 |
} |
| 43 |
when (/^(?:mapped|deviation|ignored)$/) { |
| 44 |
$map = "\x01$map" if $type eq "deviation"; |
| 45 |
|
| 46 |
$imap .= "\x00" . chr . $map |
| 47 |
for $r1 .. $r2 || $r1; |
| 48 |
} |
| 49 |
when (/^(?:valid)$/) { |
| 50 |
(vec $valid, $_, 1) = 1 |
| 51 |
for $r1 .. $r2 || $r1; |
| 52 |
} |
| 53 |
default { |
| 54 |
die "default: $R1,$R2,$type,$map;\n"; |
| 55 |
} |
| 56 |
} |
| 57 |
} |
| 58 |
|
| 59 |
open my $fh, ">lib/AnyEvent/Util/uts46data.pl" |
| 60 |
or die; |
| 61 |
binmode $fh, ":perlio"; |
| 62 |
print $fh "# autogenerated by util/gen_uts46data\n"; |
| 63 |
|
| 64 |
utf8::encode $imap; |
| 65 |
0 > index $imap, "\x02" # it's not supposed to be anywhere in there |
| 66 |
or die "imap contains \\x02"; |
| 67 |
print $fh "\$uts46_imap = q\x02$imap\x00\x02;\n"; |
| 68 |
|
| 69 |
# try to find a valid quoting character - there usually are many legal combos |
| 70 |
for (33..112, 1..31) { # stay out of utf-8 range, prefer printable things |
| 71 |
if (0 >= index $valid, chr) { |
| 72 |
my $q = chr; |
| 73 |
|
| 74 |
# primitive compression |
| 75 |
$valid =~ s/(\x00{32,})/"$q.(\"\x00\"x" . (length $1) . ").$q"/ge; |
| 76 |
$valid =~ s/(\xff{32,})/"$q.(\"\xff\"x" . (length $1) . ").$q"/ge; |
| 77 |
|
| 78 |
print $fh "\$uts46_valid = q$q$valid$q;\n"; |
| 79 |
goto valid_ok; |
| 80 |
} |
| 81 |
} |
| 82 |
die "unable to found valid quoting character"; |
| 83 |
valid_ok:; |
| 84 |
|
| 85 |
print $fh "1;\n"; |
| 86 |
close $fh; |
| 87 |
|