… | |
… | |
6 | use utf8; |
6 | use utf8; |
7 | no warnings 'utf8'; |
7 | no warnings 'utf8'; |
8 | |
8 | |
9 | binmode STDOUT, ":utf8"; |
9 | binmode STDOUT, ":utf8"; |
10 | |
10 | |
11 | open my $fh, "GET http://www.unicode.org/Public/idna/5.2.0/IdnaMappingTable.txt |" |
11 | open my $fh, "GET http://www.unicode.org/Public/idna/13.0.0/IdnaMappingTable.txt |" |
12 | or die; |
12 | or die; |
13 | |
13 | |
14 | my $valid; |
14 | my $valid; |
15 | my $imap; # index map \x00 char replacement |
15 | my $imap; # index map \x00 char replacement |
16 | |
16 | |
… | |
… | |
19 | |
19 | |
20 | /^ |
20 | /^ |
21 | ([0-9A-F]{4,}) (?: \.\.([0-9A-F]{4,}) )? |
21 | ([0-9A-F]{4,}) (?: \.\.([0-9A-F]{4,}) )? |
22 | \s*;\s*(\S+) |
22 | \s*;\s*(\S+) |
23 | (?: \s*;\s*([0-9A-F ]+?) )? |
23 | (?: \s*;\s*([0-9A-F ]+?) )? |
|
|
24 | (?: \s*;[^;]+ )? |
24 | \s* |
25 | \s* |
25 | (?: \#.* )? |
26 | (?: \#.* )? |
26 | $ |
27 | $ |
27 | /x or die "$_: unparsable"; |
28 | /x or die "$_: unparsable"; |
28 | |
29 | |
… | |
… | |
31 | my $R1 = chr $r1; |
32 | my $R1 = chr $r1; |
32 | my $R2 = chr $r2; |
33 | my $R2 = chr $r2; |
33 | |
34 | |
34 | $map = join "", map chr hex, split ' ', $map; |
35 | $map = join "", map chr hex, split ' ', $map; |
35 | |
36 | |
|
|
37 | $type = "valid" if $type eq "deviation"; # use non-transitional behaviour for deviation characters |
|
|
38 | |
36 | given ($type) { |
39 | given ($type) { |
37 | when ("disallowed") { |
40 | when (/^(?:disallowed|disallowed_STD3_valid|disallowed_STD3_mapped)$/) { |
38 | # nop |
41 | # nop |
39 | } |
42 | } |
40 | when (/mapped|deviation|ignored/) { |
43 | when (/^(?:mapped|deviation|ignored)$/) { |
41 | $map = "\x01$map" if $type eq "deviation"; |
44 | $map = "\x01$map" if $type eq "deviation"; |
42 | |
45 | |
43 | $imap .= "\x00" . chr . $map |
46 | $imap .= "\x00" . chr . $map |
44 | for $r1 .. $r2 || $r1; |
47 | for $r1 .. $r2 || $r1; |
45 | } |
48 | } |
46 | when ("valid") { |
49 | when (/^(?:valid)$/) { |
47 | (vec $valid, $_, 1) = 1 |
50 | (vec $valid, $_, 1) = 1 |
48 | for $r1 .. $r2 || $r1; |
51 | for $r1 .. $r2 || $r1; |
49 | } |
52 | } |
50 | default { |
53 | default { |
51 | die "default: $R1,$R2,$type,$map;\n"; |
54 | die "default: $R1,$R2,$type,$map;\n"; |
… | |
… | |
54 | } |
57 | } |
55 | |
58 | |
56 | open my $fh, ">lib/AnyEvent/Util/uts46data.pl" |
59 | open my $fh, ">lib/AnyEvent/Util/uts46data.pl" |
57 | or die; |
60 | or die; |
58 | binmode $fh, ":perlio"; |
61 | binmode $fh, ":perlio"; |
59 | print $fh "# created by gen_uts46data\n"; |
62 | print $fh "# autogenerated by util/gen_uts46data\n"; |
60 | |
63 | |
61 | utf8::encode $imap; |
64 | utf8::encode $imap; |
62 | 0 > index $imap, "\x02" # it's not supposed to be anywhere in there |
65 | 0 > index $imap, "\x02" # it's not supposed to be anywhere in there |
63 | or die "imap contains \\x02"; |
66 | or die "imap contains \\x02"; |
64 | print $fh "\$uts46_imap = q\x02$imap\x00\x02;\n"; |
67 | print $fh "\$uts46_imap = q\x02$imap\x00\x02;\n"; |
65 | |
68 | |
66 | # try to find a valid quoting character - there usually are many legal combos |
69 | # try to find a valid quoting character - there usually are many legal combos |
67 | for (1..127) { # stay out of utf-8 range |
70 | for (33..112, 1..31) { # stay out of utf-8 range, prefer printable things |
68 | if (0 >= index $valid, chr) { |
71 | if (0 >= index $valid, chr) { |
|
|
72 | my $q = chr; |
|
|
73 | |
|
|
74 | # primitive compression |
|
|
75 | $valid =~ s/(\x00{32,})/"$q.(\"\x00\"x" . (length $1) . ").$q"/ge; |
|
|
76 | $valid =~ s/(\xff{32,})/"$q.(\"\xff\"x" . (length $1) . ").$q"/ge; |
|
|
77 | |
69 | print $fh "\$uts46_valid = q", chr, $valid, chr, ";\n"; |
78 | print $fh "\$uts46_valid = q$q$valid$q;\n"; |
70 | goto valid_ok; |
79 | goto valid_ok; |
71 | } |
80 | } |
72 | } |
81 | } |
73 | die "unable to found valid quoting character"; |
82 | die "unable to found valid quoting character"; |
74 | valid_ok:; |
83 | valid_ok:; |