1 |
pcg |
1.1 |
#!/opt/bin/perl |
2 |
|
|
|
3 |
pcg |
1.5 |
# the generated tables mostly have NOT been checked so far! |
4 |
pcg |
1.1 |
|
5 |
|
|
use v5.8.2; |
6 |
|
|
use utf8; |
7 |
|
|
use Encode; |
8 |
root |
1.8 |
use Encode::JP; |
9 |
|
|
use Encode::CN; |
10 |
|
|
use Encode::KR; |
11 |
|
|
use Encode::HanExtra; |
12 |
|
|
use Encode::JIS2K; |
13 |
pcg |
1.1 |
|
14 |
|
|
my $gen; |
15 |
|
|
|
16 |
pcg |
1.5 |
$TO_UNICODE = 0; # also generate to_unicode tables |
17 |
|
|
|
18 |
pcg |
1.1 |
sub linear { |
19 |
|
|
my ($a, $l, $h, $b) = @_; |
20 |
|
|
for ($l .. $h) { |
21 |
|
|
return unless defined $a->[$_] && $a->[$_] == $_ + ($b - $l); |
22 |
|
|
} |
23 |
|
|
1; |
24 |
|
|
} |
25 |
|
|
|
26 |
|
|
sub wrap { |
27 |
|
|
my $res = " "; |
28 |
|
|
$res .= "$1\n " while $_[0] =~ /\G(.{90}\S*?)\s+/gc; |
29 |
|
|
$res .= "$1" if $_[0] =~ /\G(.*)$/; |
30 |
|
|
$res; |
31 |
|
|
} |
32 |
|
|
|
33 |
|
|
my $last_tab_full; # hack |
34 |
|
|
|
35 |
|
|
sub gentab { |
36 |
|
|
my ($enc, $l, $h, $f, $rep) = @_; |
37 |
|
|
$last_tab_full = 0; |
38 |
|
|
|
39 |
|
|
" = {\n" |
40 |
|
|
. (wrap join ", ", |
41 |
|
|
map +(sprintf "0x$f", |
42 |
|
|
defined $enc->[$_] ? $enc->[$_] : $last_tab_full++ * 0 + $rep |
43 |
|
|
), $l..$h) |
44 |
|
|
. "\n};\n"; |
45 |
|
|
} |
46 |
|
|
|
47 |
|
|
sub gen { |
48 |
|
|
my ($enc, $base, $ch) = @_; |
49 |
|
|
|
50 |
|
|
my ($fun, $tab); |
51 |
|
|
my (@t1, @t2); |
52 |
|
|
|
53 |
|
|
for (255, 159, 127, 126, 125) { |
54 |
|
|
if (linear $enc, 0, $_, 0) { |
55 |
|
|
undef $enc->[$_] for 0..$_; |
56 |
|
|
$fun .= sprintf "if (· <= 0x%04x) return ·;\n", $_; |
57 |
|
|
} |
58 |
|
|
} |
59 |
|
|
|
60 |
|
|
for (126, 127, 128) { |
61 |
|
|
if (linear $enc, $_, 159, $_) { |
62 |
|
|
undef $enc->[$_] for $_..159; |
63 |
|
|
$fun .= sprintf "if (0x%04x <= · && · <= 0x%04x) return ·;\n", $_, 159; |
64 |
|
|
} |
65 |
|
|
} |
66 |
|
|
|
67 |
|
|
for (126, 127) { |
68 |
|
|
next unless defined $enc->[$_]; |
69 |
|
|
$fun .= sprintf "if (· == 0x%04x) return 0x%04x;\n", $_, delete $enc->[$_]; |
70 |
|
|
} |
71 |
|
|
|
72 |
|
|
my @map; |
73 |
|
|
my @map2; |
74 |
|
|
|
75 |
|
|
for (0 .. $#$enc) { |
76 |
|
|
if (defined $enc->[$_]) { |
77 |
|
|
$map[$_] = $enc->[$_]; |
78 |
|
|
$map2[$_ >> 8]++; |
79 |
|
|
} |
80 |
|
|
} |
81 |
|
|
|
82 |
|
|
for my $p (0..255) { |
83 |
|
|
if ($map2[$p]) { |
84 |
|
|
my $b = $p << 8; |
85 |
|
|
|
86 |
|
|
my ($l, $h); |
87 |
|
|
for my $i (0..255) { $l = $i, last if defined $map[$b + $i]; } |
88 |
|
|
for my $i (0..255) { $h = 255 - $i, last if defined $map[$b + 255 - $i]; } |
89 |
|
|
|
90 |
|
|
if ($map2[$p] <= 5) { |
91 |
|
|
for ($l .. $h) { |
92 |
|
|
next unless defined $enc->[$b + $_]; |
93 |
|
|
$fun .= sprintf "if (· == 0x%04x) return 0x%04x;\n", $b + $_, $enc->[$b + $_]; |
94 |
|
|
} |
95 |
|
|
} elsif (linear $enc, $b + $l, $b + $h, $enc->[$b + $l]) { |
96 |
|
|
my $dif = $enc->[$b + $l] - ($b + $l); |
97 |
|
|
$dif = $dif < 0 ? sprintf "- 0x%04x", -$dif : sprintf "+ 0x%04x", $dif; |
98 |
|
|
$fun .= sprintf "if (· <= 0x%04x && 0x%04x <= ·) return · %s;\n", $b + $l, $b + $h, $dif; |
99 |
|
|
} elsif ($map2[$p] <= 5) { # defunct |
100 |
|
|
$fun .= "switch (·)\n {\n"; |
101 |
|
|
for ($l .. $h) { |
102 |
|
|
next unless defined $enc->[$b + $_]; |
103 |
|
|
$fun .= sprintf " case 0x%04x: return 0x%04x;\n", $b + $_, $enc->[$b + $_]; |
104 |
|
|
} |
105 |
|
|
$fun .= " }\n"; |
106 |
|
|
} else { |
107 |
|
|
my ($i, $t, $f) = (0, "uint8_t", "%02x"); |
108 |
|
|
for ($l .. $h) { |
109 |
|
|
if ($enc->[$b + $_] > 255) { |
110 |
|
|
($i, $t, $f) = (1, "uint16_t", "%04x"); |
111 |
|
|
last; |
112 |
|
|
} |
113 |
|
|
} |
114 |
|
|
|
115 |
|
|
$i ? push @t2, [$p, $l, $h, $t, $f] |
116 |
|
|
: push @t1, [$b, $l, $h, $t, $f]; |
117 |
|
|
} |
118 |
|
|
|
119 |
|
|
} |
120 |
|
|
} |
121 |
|
|
|
122 |
|
|
if (@t2 <= 3) { |
123 |
|
|
push @t1, @t2; |
124 |
|
|
@t2 = (); |
125 |
|
|
} |
126 |
|
|
|
127 |
|
|
for (@t1) { |
128 |
|
|
my ($b, $l, $h, $t, $f) = @$_; |
129 |
|
|
my $rep = $b + $l == 0 ? 128 : 0; |
130 |
|
|
$tab .= "static const $t $base\_$b\[]" . gentab $enc, $b + $l, $b + $h, $f; |
131 |
|
|
$fun .= sprintf "if (0x%04x <= · && · <= 0x%04x)\n" |
132 |
|
|
. " return %s$base\_$b\[· - 0x%04x];\n", |
133 |
|
|
$b + $l, $b + $h, |
134 |
|
|
($last_tab_full ? sprintf "$base\_$b\[· - 0x%04x] == $rep ? NOCHAR : ", $b + $l : ""), |
135 |
|
|
$b + $l; |
136 |
|
|
} |
137 |
|
|
|
138 |
|
|
if (@t2) { |
139 |
|
|
my ($min, $max) = (255, 0); |
140 |
|
|
my ($l, $h) = (255, 0); |
141 |
|
|
for (@t2) { |
142 |
|
|
my ($p, $L, $H, $t, $f) = @$_; |
143 |
|
|
$max = $p if $p > $max; |
144 |
|
|
$min = $p if $p < $min; |
145 |
|
|
$l = $L if $L < $l; |
146 |
|
|
$h = $H if $H > $h; |
147 |
|
|
} |
148 |
|
|
|
149 |
|
|
$fun .= "uint8_t l = ·;\n"; |
150 |
|
|
$fun .= "uint16_t h = · >> 8;\n"; |
151 |
|
|
|
152 |
|
|
if ($h - $l < 200) { |
153 |
|
|
my $d = $h - $l + 1; |
154 |
|
|
|
155 |
|
|
my @enc; |
156 |
|
|
for $p ($min .. $max) { |
157 |
|
|
for $i ($l .. $h) { |
158 |
|
|
$enc[($p - $min) * $d + $i - $l] = $enc->[$p * 256 + $i]; |
159 |
|
|
} |
160 |
|
|
} |
161 |
|
|
$tab .= "static const uint16_t $base\_m[]" . gentab \@enc, 0, $#enc, "%04x"; |
162 |
|
|
|
163 |
|
|
if ($last_tab_full) { |
164 |
|
|
$fun .= sprintf "if (0x%02x <= h && h <= 0x%02x\n" |
165 |
|
|
. " && 0x%02x <= l && l <= 0x%02x)\n" |
166 |
|
|
. " return $base\_m\[h * 0x%02x + l - 0x%04x]\n" |
167 |
|
|
. " ? $base\_m\[h * 0x%02x + l - 0x%04x]\n" |
168 |
|
|
. " : NOCHAR;\n", |
169 |
|
|
$min, $max, $l, $h, |
170 |
|
|
($d, $min * $d + $l) x 2; |
171 |
|
|
} else { |
172 |
|
|
$fun .= sprintf "if (0x%02x <= h && h <= 0x%02x\n" |
173 |
|
|
. " && 0x%02x <= l && l <= 0x%02x)\n" |
174 |
|
|
. " return $base\_m\[h * 0x%02x + l - 0x%04x];\n", |
175 |
|
|
$min, $max, $l, $h, |
176 |
|
|
$d, |
177 |
|
|
$min * $d + $l; |
178 |
|
|
} |
179 |
|
|
} else { |
180 |
|
|
my @tab = (0) x ($max - $min); |
181 |
|
|
for (@t2) { |
182 |
|
|
my ($p, undef, undef, $t, $f) = @$_; |
183 |
|
|
$tab .= "static const $t $base\_$p\[]" . gentab $enc, $p * 256 + $l, $p * 256 + $h, $f, 0; |
184 |
|
|
$tab[$p - $min] = "$base\_$p"; |
185 |
|
|
} |
186 |
|
|
|
187 |
|
|
$tab .= "const uint16_t *$base\_i[] = {\n" |
188 |
|
|
. (wrap join ", ", @tab) |
189 |
|
|
. "\n};\n\n"; |
190 |
|
|
|
191 |
|
|
$fun .= sprintf "if (0x%02x <= h && h <= 0x%02x\n" |
192 |
|
|
. " && 0x%02x <= l && l <= 0x%02x\n" |
193 |
|
|
. " && $base\_i[h - 0x%02x])\n" |
194 |
|
|
. " return $base\_i\[h - 0x%02x][l - 0x%02x]\n" |
195 |
|
|
. " ? $base\_i\[h - 0x%02x][l - 0x%02x]\n" |
196 |
|
|
. " : NOCHAR;\n", |
197 |
|
|
$min, $max, $l, $h, |
198 |
|
|
$min, ($min, $l) x 2; |
199 |
|
|
} |
200 |
|
|
} |
201 |
|
|
|
202 |
|
|
$fun .= "return NOCHAR;\n"; |
203 |
|
|
|
204 |
|
|
$fun =~ s/·/$ch/g; |
205 |
|
|
|
206 |
|
|
($tab, $fun); |
207 |
|
|
} |
208 |
|
|
|
209 |
|
|
while (<DATA>) { |
210 |
|
|
my ($group, $base, $cs, $type) = split /\s+/; |
211 |
|
|
|
212 |
|
|
my @enc1; |
213 |
|
|
my @enc2; |
214 |
|
|
|
215 |
|
|
for (0 .. 65535) { |
216 |
|
|
my $enc = encode $cs, (chr $_), Encode::FB_QUIET; |
217 |
|
|
|
218 |
|
|
if (length $enc) { |
219 |
|
|
my $code = hex unpack "H*", $enc; |
220 |
|
|
$enc1[$_] = $code; |
221 |
|
|
$enc2[$code] = $_ unless defined $enc2[$code]; |
222 |
|
|
} |
223 |
|
|
} |
224 |
|
|
|
225 |
pcg |
1.5 |
my ($tab1, $fun1); |
226 |
|
|
my ($tab2, $fun2); |
227 |
|
|
|
228 |
|
|
($tab1, $fun1) = gen \@enc1, "$base\_f", "unicode"; |
229 |
|
|
($tab2, $fun2) = gen \@enc2, "$base\_t", "enc" if $TO_UNICODE; |
230 |
pcg |
1.1 |
|
231 |
|
|
$fun1 =~ s/^/ /gm; |
232 |
|
|
$fun2 =~ s/^/ /gm; |
233 |
|
|
|
234 |
|
|
print "$base\n"; |
235 |
|
|
open OUT, ">table/$base.h" or die; |
236 |
|
|
|
237 |
|
|
print OUT <<EOF; |
238 |
|
|
// |
239 |
|
|
// AUTOMATICALLLY GENERATED by gentables |
240 |
|
|
// |
241 |
|
|
#ifdef ENCODING_$group |
242 |
|
|
|
243 |
|
|
$tab1$tab2 |
244 |
root |
1.6 |
static uint32_t cs_$base\_from_unicode (unicode_t unicode) { |
245 |
|
|
$fun1} |
246 |
pcg |
1.5 |
EOF |
247 |
|
|
|
248 |
|
|
if ($TO_UNICODE) { |
249 |
|
|
print OUT <<EOF |
250 |
|
|
#if ENCODING_TO_UNICODE |
251 |
root |
1.6 |
static unicode_t cs_$base\_to_unicode (uint32_t enc) { |
252 |
|
|
$fun2} |
253 |
pcg |
1.5 |
#endif |
254 |
|
|
EOF |
255 |
|
|
} |
256 |
|
|
|
257 |
|
|
print OUT <<EOF; |
258 |
pcg |
1.1 |
|
259 |
|
|
#else |
260 |
|
|
|
261 |
root |
1.6 |
#define cs_$base\_from_unicode cs_unknown_from_unicode |
262 |
|
|
#define cs_$base\_to_unicode cs_unknown_to_unicode |
263 |
pcg |
1.1 |
|
264 |
|
|
#endif |
265 |
|
|
EOF |
266 |
|
|
|
267 |
|
|
close OUT; |
268 |
|
|
} |
269 |
|
|
|
270 |
|
|
__DATA__ |
271 |
|
|
VN viscii viscii |
272 |
|
|
KR ksc5601_1987_0 ksc5601-raw |
273 |
root |
1.7 |
ZH gb2312_1980_0 gb2312-raw |
274 |
root |
1.9 |
ZH gbk_0 gbk |
275 |
root |
1.7 |
ZH_EXT cns11643_1992_1 cns11643-1 |
276 |
|
|
ZH_EXT cns11643_1992_2 cns11643-2 |
277 |
|
|
ZH_EXT cns11643_1992_3 cns11643-3 |
278 |
|
|
ZH_EXT cns11643_1992_4 cns11643-4 |
279 |
|
|
ZH_EXT cns11643_1992_5 cns11643-5 |
280 |
|
|
ZH_EXT cns11643_1992_6 cns11643-6 |
281 |
|
|
ZH_EXT cns11643_1992_7 cns11643-7 |
282 |
|
|
ZH_EXT cns11643_1992_f cns11643-f |
283 |
|
|
ZH big5 big5 |
284 |
|
|
ZH_EXT big5_ext big5ext |
285 |
|
|
ZH_EXT big5_plus big5plus |
286 |
pcg |
1.1 |
EU koi8_r koi8-r |
287 |
|
|
EU koi8_u koi8-u |
288 |
|
|
DEFAULT iso8859_1 iso-8859-1 |
289 |
|
|
EU iso8859_2 iso-8859-2 |
290 |
|
|
EU iso8859_3 iso-8859-3 |
291 |
|
|
EU iso8859_4 iso-8859-4 |
292 |
|
|
EU iso8859_5 iso-8859-5 |
293 |
|
|
EU iso8859_6 iso-8859-6 |
294 |
|
|
EU iso8859_7 iso-8859-7 |
295 |
|
|
EU iso8859_8 iso-8859-8 |
296 |
|
|
EU iso8859_9 iso-8859-9 |
297 |
|
|
EU iso8859_10 iso-8859-10 |
298 |
|
|
EU iso8859_11 iso-8859-11 |
299 |
|
|
EU iso8859_13 iso-8859-13 |
300 |
ayin |
1.10 |
EU iso8859_14 iso-8859-14 |
301 |
pcg |
1.1 |
DEFAULT iso8859_15 iso-8859-15 |
302 |
|
|
EU iso8859_16 iso-8859-16 |
303 |
|
|
JP jis0201_1976_0 jis0201-raw |
304 |
pcg |
1.5 |
JP jis0208_1990_0 jis0208-raw |
305 |
pcg |
1.1 |
JP jis0212_1990_0 jis0212-raw |
306 |
|
|
JP_EXT jis0213_1 jis0213-1-raw |
307 |
|
|
JP_EXT jis0213_2 jis0213-2-raw |