1 |
#!/opt/bin/perl |
2 |
|
3 |
# the generated tables mostly have NOT been checked so far! |
4 |
|
5 |
# apt install libencode-hanextra-perl libencode-jis2k-perl |
6 |
|
7 |
use v5.8.2; |
8 |
use utf8; |
9 |
use Encode; |
10 |
use Encode::JP; |
11 |
use Encode::CN; |
12 |
use Encode::KR; |
13 |
use Encode::HanExtra; |
14 |
use Encode::JIS2K; |
15 |
|
16 |
my $gen; |
17 |
|
18 |
$TO_UNICODE = 0; # also generate to_unicode tables |
19 |
|
20 |
sub linear { |
21 |
my ($a, $l, $h, $b) = @_; |
22 |
for ($l .. $h) { |
23 |
return unless defined $a->[$_] && $a->[$_] == $_ + ($b - $l); |
24 |
} |
25 |
1; |
26 |
} |
27 |
|
28 |
sub wrap { |
29 |
my $res = " "; |
30 |
$res .= "$1\n " while $_[0] =~ /\G(.{90}\S*?)\s+/gc; |
31 |
$res .= "$1" if $_[0] =~ /\G(.*)$/; |
32 |
$res; |
33 |
} |
34 |
|
35 |
my $last_tab_full; # hack |
36 |
|
37 |
sub gentab { |
38 |
my ($enc, $l, $h, $f, $rep) = @_; |
39 |
$last_tab_full = 0; |
40 |
|
41 |
" = {\n" |
42 |
. (wrap join ", ", |
43 |
map +(sprintf "0x$f", |
44 |
defined $enc->[$_] ? $enc->[$_] : $last_tab_full++ * 0 + $rep |
45 |
), $l..$h) |
46 |
. "\n};\n"; |
47 |
} |
48 |
|
49 |
sub gen { |
50 |
my ($enc, $base, $ch) = @_; |
51 |
|
52 |
my ($fun, $tab); |
53 |
my (@t1, @t2); |
54 |
|
55 |
for (255, 159, 127, 126, 125) { |
56 |
if (linear $enc, 0, $_, 0) { |
57 |
undef $enc->[$_] for 0..$_; |
58 |
$fun .= sprintf "if (· <= 0x%04x) return ·;\n", $_; |
59 |
} |
60 |
} |
61 |
|
62 |
for (126, 127, 128) { |
63 |
if (linear $enc, $_, 159, $_) { |
64 |
undef $enc->[$_] for $_..159; |
65 |
$fun .= sprintf "if (0x%04x <= · && · <= 0x%04x) return ·;\n", $_, 159; |
66 |
} |
67 |
} |
68 |
|
69 |
for (126, 127) { |
70 |
next unless defined $enc->[$_]; |
71 |
$fun .= sprintf "if (· == 0x%04x) return 0x%04x;\n", $_, delete $enc->[$_]; |
72 |
} |
73 |
|
74 |
my @map; |
75 |
my @map2; |
76 |
|
77 |
for (0 .. $#$enc) { |
78 |
if (defined $enc->[$_]) { |
79 |
$map[$_] = $enc->[$_]; |
80 |
$map2[$_ >> 8]++; |
81 |
} |
82 |
} |
83 |
|
84 |
for my $p (0..255) { |
85 |
if ($map2[$p]) { |
86 |
my $b = $p << 8; |
87 |
|
88 |
my ($l, $h); |
89 |
for my $i (0..255) { $l = $i, last if defined $map[$b + $i]; } |
90 |
for my $i (0..255) { $h = 255 - $i, last if defined $map[$b + 255 - $i]; } |
91 |
|
92 |
if ($map2[$p] <= 5) { |
93 |
for ($l .. $h) { |
94 |
next unless defined $enc->[$b + $_]; |
95 |
$fun .= sprintf "if (· == 0x%04x) return 0x%04x;\n", $b + $_, $enc->[$b + $_]; |
96 |
} |
97 |
} elsif (linear $enc, $b + $l, $b + $h, $enc->[$b + $l]) { |
98 |
my $dif = $enc->[$b + $l] - ($b + $l); |
99 |
$dif = $dif < 0 ? sprintf "- 0x%04x", -$dif : sprintf "+ 0x%04x", $dif; |
100 |
$fun .= sprintf "if (· >= 0x%04x && 0x%04x >= ·) return · %s;\n", $b + $l, $b + $h, $dif; |
101 |
} elsif ($map2[$p] <= 5) { # defunct |
102 |
$fun .= "switch (·)\n {\n"; |
103 |
for ($l .. $h) { |
104 |
next unless defined $enc->[$b + $_]; |
105 |
$fun .= sprintf " case 0x%04x: return 0x%04x;\n", $b + $_, $enc->[$b + $_]; |
106 |
} |
107 |
$fun .= " }\n"; |
108 |
} else { |
109 |
my ($i, $t, $f) = (0, "uint8_t", "%02x"); |
110 |
for ($l .. $h) { |
111 |
if ($enc->[$b + $_] > 255) { |
112 |
($i, $t, $f) = (1, "uint16_t", "%04x"); |
113 |
last; |
114 |
} |
115 |
} |
116 |
|
117 |
$i ? push @t2, [$p, $l, $h, $t, $f] |
118 |
: push @t1, [$b, $l, $h, $t, $f]; |
119 |
} |
120 |
|
121 |
} |
122 |
} |
123 |
|
124 |
if (@t2 <= 3) { |
125 |
push @t1, @t2; |
126 |
@t2 = (); |
127 |
} |
128 |
|
129 |
for (@t1) { |
130 |
my ($b, $l, $h, $t, $f) = @$_; |
131 |
my $rep = $b + $l == 0 ? 128 : 0; |
132 |
$tab .= "static const $t $base\_$b\[]" . gentab $enc, $b + $l, $b + $h, $f; |
133 |
$fun .= sprintf "if (0x%04x <= · && · <= 0x%04x)\n" |
134 |
. " return %s$base\_$b\[· - 0x%04x];\n", |
135 |
$b + $l, $b + $h, |
136 |
($last_tab_full ? sprintf "$base\_$b\[· - 0x%04x] == $rep ? NOCHAR : ", $b + $l : ""), |
137 |
$b + $l; |
138 |
} |
139 |
|
140 |
if (@t2) { |
141 |
my ($min, $max) = (255, 0); |
142 |
my ($l, $h) = (255, 0); |
143 |
for (@t2) { |
144 |
my ($p, $L, $H, $t, $f) = @$_; |
145 |
$max = $p if $p > $max; |
146 |
$min = $p if $p < $min; |
147 |
$l = $L if $L < $l; |
148 |
$h = $H if $H > $h; |
149 |
} |
150 |
|
151 |
$fun .= "uint8_t l = ·;\n"; |
152 |
$fun .= "uint16_t h = · >> 8;\n"; |
153 |
|
154 |
if ($h - $l < 200) { |
155 |
my $d = $h - $l + 1; |
156 |
|
157 |
my @enc; |
158 |
for $p ($min .. $max) { |
159 |
for $i ($l .. $h) { |
160 |
$enc[($p - $min) * $d + $i - $l] = $enc->[$p * 256 + $i]; |
161 |
} |
162 |
} |
163 |
$tab .= "static const uint16_t $base\_m[]" . gentab \@enc, 0, $#enc, "%04x"; |
164 |
|
165 |
if ($last_tab_full) { |
166 |
$fun .= sprintf "if (0x%02x <= h && h <= 0x%02x\n" |
167 |
. " && 0x%02x <= l && l <= 0x%02x)\n" |
168 |
. " return $base\_m\[h * 0x%02x + l - 0x%04x]\n" |
169 |
. " ? $base\_m\[h * 0x%02x + l - 0x%04x]\n" |
170 |
. " : NOCHAR;\n", |
171 |
$min, $max, $l, $h, |
172 |
($d, $min * $d + $l) x 2; |
173 |
} else { |
174 |
$fun .= sprintf "if (0x%02x <= h && h <= 0x%02x\n" |
175 |
. " && 0x%02x <= l && l <= 0x%02x)\n" |
176 |
. " return $base\_m\[h * 0x%02x + l - 0x%04x];\n", |
177 |
$min, $max, $l, $h, |
178 |
$d, |
179 |
$min * $d + $l; |
180 |
} |
181 |
} else { |
182 |
my @tab = (0) x ($max - $min); |
183 |
for (@t2) { |
184 |
my ($p, undef, undef, $t, $f) = @$_; |
185 |
$tab .= "static const $t $base\_$p\[]" . gentab $enc, $p * 256 + $l, $p * 256 + $h, $f, 0; |
186 |
$tab[$p - $min] = "$base\_$p"; |
187 |
} |
188 |
|
189 |
$tab .= "const uint16_t *$base\_i[] = {\n" |
190 |
. (wrap join ", ", @tab) |
191 |
. "\n};\n\n"; |
192 |
|
193 |
$fun .= sprintf "if (0x%02x <= h && h <= 0x%02x\n" |
194 |
. " && 0x%02x <= l && l <= 0x%02x\n" |
195 |
. " && $base\_i[h - 0x%02x])\n" |
196 |
. " return $base\_i\[h - 0x%02x][l - 0x%02x]\n" |
197 |
. " ? $base\_i\[h - 0x%02x][l - 0x%02x]\n" |
198 |
. " : NOCHAR;\n", |
199 |
$min, $max, $l, $h, |
200 |
$min, ($min, $l) x 2; |
201 |
} |
202 |
} |
203 |
|
204 |
$fun .= "return NOCHAR;\n"; |
205 |
|
206 |
$fun =~ s/·/$ch/g; |
207 |
|
208 |
($tab, $fun); |
209 |
} |
210 |
|
211 |
while (<DATA>) { |
212 |
my ($group, $base, $cs, $type) = split /\s+/; |
213 |
|
214 |
my @enc1; |
215 |
my @enc2; |
216 |
|
217 |
for (0 .. 65535) { |
218 |
my $enc = encode $cs, (chr $_), Encode::FB_QUIET; |
219 |
|
220 |
if (length $enc) { |
221 |
my $code = hex unpack "H*", $enc; |
222 |
$enc1[$_] = $code; |
223 |
$enc2[$code] = $_ unless defined $enc2[$code]; |
224 |
} |
225 |
} |
226 |
|
227 |
my ($tab1, $fun1); |
228 |
my ($tab2, $fun2); |
229 |
|
230 |
($tab1, $fun1) = gen \@enc1, "$base\_f", "unicode"; |
231 |
($tab2, $fun2) = gen \@enc2, "$base\_t", "enc" if $TO_UNICODE; |
232 |
|
233 |
$fun1 =~ s/^/ /gm; |
234 |
$fun2 =~ s/^/ /gm; |
235 |
|
236 |
print "$base\n"; |
237 |
open OUT, ">table/$base.h" or die; |
238 |
|
239 |
print OUT <<EOF; |
240 |
// |
241 |
// AUTOMATICALLLY GENERATED by gentables |
242 |
// |
243 |
#ifdef ENCODING_$group |
244 |
|
245 |
$tab1$tab2 |
246 |
static uint32_t cs_$base\_from_unicode (unicode_t unicode) { |
247 |
$fun1} |
248 |
EOF |
249 |
|
250 |
if ($TO_UNICODE) { |
251 |
print OUT <<EOF |
252 |
#if ENCODING_TO_UNICODE |
253 |
static unicode_t cs_$base\_to_unicode (uint32_t enc) { |
254 |
$fun2} |
255 |
#endif |
256 |
EOF |
257 |
} |
258 |
|
259 |
print OUT <<EOF; |
260 |
|
261 |
#else |
262 |
|
263 |
#define cs_$base\_from_unicode cs_unknown_from_unicode |
264 |
#define cs_$base\_to_unicode cs_unknown_to_unicode |
265 |
|
266 |
#endif |
267 |
EOF |
268 |
|
269 |
close OUT; |
270 |
} |
271 |
|
272 |
__DATA__ |
273 |
VN viscii viscii |
274 |
KR ksc5601_1987_0 ksc5601-raw |
275 |
ZH gb2312_1980_0 gb2312-raw |
276 |
ZH gbk_0 gbk |
277 |
ZH_EXT cns11643_1992_1 cns11643-1 |
278 |
ZH_EXT cns11643_1992_2 cns11643-2 |
279 |
ZH_EXT cns11643_1992_3 cns11643-3 |
280 |
ZH_EXT cns11643_1992_4 cns11643-4 |
281 |
ZH_EXT cns11643_1992_5 cns11643-5 |
282 |
ZH_EXT cns11643_1992_6 cns11643-6 |
283 |
ZH_EXT cns11643_1992_7 cns11643-7 |
284 |
ZH_EXT cns11643_1992_f cns11643-f |
285 |
ZH big5 big5 |
286 |
ZH_EXT big5_ext big5ext |
287 |
ZH_EXT big5_plus big5plus |
288 |
EU koi8_r koi8-r |
289 |
EU koi8_u koi8-u |
290 |
DEFAULT iso8859_1 iso-8859-1 |
291 |
EU iso8859_2 iso-8859-2 |
292 |
EU iso8859_3 iso-8859-3 |
293 |
EU iso8859_4 iso-8859-4 |
294 |
EU iso8859_5 iso-8859-5 |
295 |
EU iso8859_6 iso-8859-6 |
296 |
EU iso8859_7 iso-8859-7 |
297 |
EU iso8859_8 iso-8859-8 |
298 |
EU iso8859_9 iso-8859-9 |
299 |
EU iso8859_10 iso-8859-10 |
300 |
EU iso8859_11 iso-8859-11 |
301 |
EU iso8859_13 iso-8859-13 |
302 |
EU iso8859_14 iso-8859-14 |
303 |
DEFAULT iso8859_15 iso-8859-15 |
304 |
EU iso8859_16 iso-8859-16 |
305 |
JP jis0201_1976_0 jis0201-raw |
306 |
JP jis0208_1990_0 jis0208-raw |
307 |
JP jis0212_1990_0 jis0212-raw |
308 |
JP_EXT jis0213_1 jis0213-1-raw |
309 |
JP_EXT jis0213_2 jis0213-2-raw |