rxvt-unicode/src/encoding.C

/*--------------------------------*-C-*---------------------------------*
 * File:        encoding.C
 *----------------------------------------------------------------------*
 *
 * All portions of code are copyright by their respective author/s.
 * Copyright (c) 2003-2004 Marc Lehmann <pcg@goof.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *----------------------------------------------------------------------*/

#include "../config.h"

#include "encoding.h"

#include <cstdlib>
#include <cstring>

/*
 * Lookup table mapping charset names to codeset ids, used by
 * codeset_from_name ().
 *
 * Names are stored in the form produced by normalize_name (): only
 * upper-case ASCII letters and digits, everything else removed. Since
 * normalize_name () emits at most 15 characters, an exact-match entry
 * longer than that could never be hit.
 *
 * A trailing '*' turns an entry into a prefix pattern: only the characters
 * before the '*' are compared. Entries without '*' must match exactly.
 *
 * The table is scanned front to back and the first hit wins, so a more
 * specific pattern (e.g. "BIG5P*") has to precede a more general one
 * (e.g. "BIG5*"). The entry with a null name terminates the table.
 */
const struct n2cs {
  const char *name; // normalized name, optionally ending in '*' (prefix match)
  codeset cs;       // codeset reported when the name matches
} n2cs[] = {
  /* first one found is the normalized one */
  { "ISO88591",         CS_ISO8859_1        },
  { "ISO8859PRIMARY",   CS_ISO8859_1        }, // some stupid fonts use this (hi tigert)
  { "ISO88592",         CS_ISO8859_2        },
  { "ISO88593",         CS_ISO8859_3        },
  { "ISO88594",         CS_ISO8859_4        },
  { "ISO88595",         CS_ISO8859_5        },
  { "ISO88596",         CS_ISO8859_6        },
  { "ISO88597",         CS_ISO8859_7        },
  { "ISO88598",         CS_ISO8859_8        },
  { "ISO88599",         CS_ISO8859_9        },
  { "ISO885910",        CS_ISO8859_10       },
  { "ISO885911",        CS_ISO8859_11       },
  { "ISO885913",        CS_ISO8859_13       },
  { "ISO885914",        CS_ISO8859_14       },
  { "ISO885915",        CS_ISO8859_15       },
  { "FCD885915",        CS_ISO8859_15       },
  { "ISO885916",        CS_ISO8859_16       },

  { "TIS620*",          CS_ISO8859_11       }, // close enough

  // everything below up to the ASCII group is handled as plain unicode
  { "ISO10646*",        CS_UNICODE          },
  { "UNICODE",          CS_UNICODE          },
  { "UTF8",             CS_UNICODE          },

  { "ASCII",            CS_US_ASCII         },
  { "USASCII",          CS_US_ASCII         },
  { "ANSIX341968",      CS_US_ASCII         },

  // exact matches only, so "KOI8R" and "KOI8RU" do not shadow each other
  { "KOI8R",            CS_KOI8_R           },
  { "GOST1976874*",     CS_KOI8_R           },
  { "KOI8RU",           CS_KOI8_U           },
  { "KOI8U",            CS_KOI8_U           },

  { "VISCII*",          CS_VISCII           },

  { "JISX0201*",        CS_JIS0201_1976_0   },
  { "JISX0208*",        CS_JIS0208_1990_0   }, // also wrongly matches -1987-0? (check Encode::JP)
  { "JISX0212*",        CS_JIS0212_1990_0   },
  { "JISX021320001",    CS_JIS0213_1        },
  { "JISX021320002",    CS_JIS0213_2        },
  { "JISX0221*",        CS_UNICODE          }, // _very_ close

  { "KSC5601*",         CS_KSC5601_1987_0   },
  { "KSX1001*",         CS_KSC5601_1987_0   },
  { "KSC5700*",         CS_UNICODE          }, // unicode plus extensions

  // the two specific BIG5 patterns must stay ahead of the generic "BIG5*"
  { "BIG5P*",           CS_BIG5_PLUS        },
  { "BIG5ETEN*",        CS_BIG5_EXT         },
  { "BIG5*",            CS_BIG5             },
  { "GB2312*",          CS_GB2312_1980_0    },
  { "GB6345*",          CS_GB2312_1980_0    }, // slightly different to gb2312??
  { "GB8565*",          CS_GB2312_1980_0    }, // a superset of gb2312??
  { "GB13000*",         CS_UNICODE          },
  { "CNS1164319921",    CS_CNS11643_1992_1  },
  { "CNS1164319922",    CS_CNS11643_1992_2  },
  { "CNS1164319923",    CS_CNS11643_1992_3  },
  { "CNS1164319924",    CS_CNS11643_1992_4  },
  { "CNS1164319925",    CS_CNS11643_1992_5  },
  { "CNS1164319926",    CS_CNS11643_1992_6  },
  { "CNS1164319927",    CS_CNS11643_1992_7  },
  { "CNS116431992F",    CS_CNS11643_1992_F  },

  // sentinel: a null name ends the scan in codeset_from_name ()
  { 0,                  CS_UNKNOWN      }
};

/*
 * Reduce a charset name to its canonical comparison form.
 *
 * ASCII digits and upper-case letters are copied unchanged, ASCII lower-case
 * letters are converted to upper case and every other character is dropped.
 * At most 15 characters are produced; anything beyond that is silently cut.
 *
 * The result lives in a function-local static buffer, so it is overwritten
 * by the next call and must not be freed by the caller.
 */
static const char *
normalize_name (const char *name)
{
  static char res[16];
  char *out = res;
  char *const limit = res + sizeof (res) - 1; // keep one byte for the terminator

  while (*name && out < limit)
    {
      char ch = *name++;

      if (ch >= 'a' && ch <= 'z')
        ch -= 'a' - 'A'; // fold to upper case
      else if (!((ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'Z')))
        continue; // punctuation, whitespace etc. carry no information

      *out++ = ch;
    }

  *out = 0;

  return res;
}

/*
 * Translate a charset name into a codeset id.
 *
 * The name is first passed through normalize_name () and then compared
 * against the n2cs table in order. An entry whose pattern ends in '*' matches
 * when the normalized name starts with the text before the '*'; any other
 * entry matches only on full equality. The first matching entry decides.
 *
 * Returns CS_UNKNOWN when NAME is a null pointer or no entry matches.
 */
codeset
codeset_from_name (const char *name)
{
  if (!name)
    return CS_UNKNOWN;

  const char *wanted = normalize_name (name);

  for (const struct n2cs *entry = n2cs; entry->name; ++entry)
    {
      const char *pattern = entry->name;
      size_t len = strlen (pattern);

      // trailing '*': compare only the part in front of it
      if (pattern[len - 1] == '*' && strncmp (wanted, pattern, len - 1) == 0)
        return entry->cs;

      if (strcmp (wanted, pattern) == 0)
        return entry->cs;
    }

  return CS_UNKNOWN;
}

// Converter that maps nothing: both directions always answer NOCHAR.
struct rxvt_codeset_conv_unknown : rxvt_codeset_conv {
  // ENC is ignored
  unicode_t to_unicode (uint32_t enc) const
  {
    return NOCHAR;
  }

  // UNICODE is ignored
  uint32_t from_unicode (unicode_t unicode) const
  {
    return NOCHAR;
  }
} rxvt_codeset_conv_unknown;

// US-ASCII converter: only from_unicode is overridden here, to_unicode is
// whatever rxvt_codeset_conv provides.
struct rxvt_codeset_conv_us_ascii : rxvt_codeset_conv {
  // Code points 0..127 map to themselves, everything else is NOCHAR.
  uint32_t from_unicode (unicode_t unicode) const
  {
    if (unicode > 0x7f)
      return NOCHAR;

    return unicode;
  }
} rxvt_codeset_conv_us_ascii;

// Unicode converter: overrides nothing, so both conversion directions are
// exactly those inherited from rxvt_codeset_conv.
// NOTE(review): presumably the base class maps values to themselves, which is
// what "transparent" refers to -- confirm in encoding.h.
struct rxvt_codeset_conv_unicode : rxvt_codeset_conv {
  /* transparent */
} rxvt_codeset_conv_unicode;

// Converter restricted to code points that fit into 16 bits.
struct rxvt_codeset_conv_unicode_16 : rxvt_codeset_conv {
  // The encoded value is returned unchanged.
  unicode_t to_unicode (uint32_t enc) const
  {
    return enc;
  }

  // Code points up to 65535 map to themselves, larger ones to NOCHAR.
  uint32_t from_unicode (unicode_t unicode) const
  {
    if (unicode > 0xffff)
      return NOCHAR;

    return unicode;
  }
} rxvt_codeset_conv_unicode_16;

#define ENCODING_DEFAULT

#include "table/iso8859_1.h"
#include "table/iso8859_15.h"

//#define ENCODING_EU

#include "table/iso8859_2.h"
#include "table/iso8859_3.h"
#include "table/iso8859_4.h"
#include "table/iso8859_5.h"
#include "table/iso8859_6.h"
#include "table/iso8859_7.h"
#include "table/iso8859_8.h"
#include "table/iso8859_9.h"
#include "table/iso8859_10.h"
#include "table/iso8859_11.h"
#include "table/iso8859_13.h"
#include "table/iso8859_14.h"
#include "table/iso8859_16.h"

#include "table/koi8_r.h"
#include "table/koi8_u.h"

//#define ENCODING_KR

#include "table/ksc5601_1987_0.h"

//#define ENCODING_CN

#include "table/gb2312_1980_0.h"
#include "table/big5.h"

//#define ENCODING_CN_EXT

#include "table/cns11643_1992_1.h"
#include "table/cns11643_1992_2.h"
#include "table/cns11643_1992_3.h"
#include "table/cns11643_1992_4.h"
#include "table/cns11643_1992_5.h"
#include "table/cns11643_1992_6.h"
#include "table/cns11643_1992_7.h"
#include "table/cns11643_1992_f.h"
#include "table/big5_ext.h"
#include "table/big5_plus.h"

//#define ENCODING_VN

#include "table/viscii.h"

//#define ENCODING_JP

#include "table/jis0201_1976_0.h"
#include "table/jis0208_1990_0.h"
#include "table/jis0212_1990_0.h"

//#define ENCODING_JP_EXT

#include "table/jis0213_1.h"
#include "table/jis0213_2.h"

// Table of converter objects, one pointer per codeset (NUM_CODESETS entries).
// The first two converters are defined in this file, the last two as well;
// the remaining ones come from the table/*.h headers included above.
//
// order must match table in encoding.h(!)
// NOTE(review): presumably this array is indexed by the codeset enum value,
// which is why the order matters -- confirm against encoding.h before adding,
// removing or moving an entry.
const rxvt_codeset_conv *rxvt_codeset[NUM_CODESETS] = {
  &rxvt_codeset_conv_unknown,

  &rxvt_codeset_conv_us_ascii,

  &rxvt_codeset_conv_iso8859_1,
  &rxvt_codeset_conv_iso8859_2,
  &rxvt_codeset_conv_iso8859_3,
  &rxvt_codeset_conv_iso8859_4,
  &rxvt_codeset_conv_iso8859_5,
  &rxvt_codeset_conv_iso8859_6,
  &rxvt_codeset_conv_iso8859_7,
  &rxvt_codeset_conv_iso8859_8,
  &rxvt_codeset_conv_iso8859_9,
  &rxvt_codeset_conv_iso8859_10,
  &rxvt_codeset_conv_iso8859_11,
  &rxvt_codeset_conv_iso8859_13,
  &rxvt_codeset_conv_iso8859_14,
  &rxvt_codeset_conv_iso8859_15,
  &rxvt_codeset_conv_iso8859_16,

  &rxvt_codeset_conv_koi8_r,
  &rxvt_codeset_conv_koi8_u,

  &rxvt_codeset_conv_jis0201_1976_0,
  &rxvt_codeset_conv_jis0208_1990_0,
  &rxvt_codeset_conv_jis0212_1990_0,

  &rxvt_codeset_conv_jis0213_1,
  &rxvt_codeset_conv_jis0213_2,

  &rxvt_codeset_conv_ksc5601_1987_0,

  &rxvt_codeset_conv_gb2312_1980_0,

  &rxvt_codeset_conv_cns11643_1992_1,
  &rxvt_codeset_conv_cns11643_1992_2,
  &rxvt_codeset_conv_cns11643_1992_3,
  &rxvt_codeset_conv_cns11643_1992_4,
  &rxvt_codeset_conv_cns11643_1992_5,
  &rxvt_codeset_conv_cns11643_1992_6,
  &rxvt_codeset_conv_cns11643_1992_7,
  &rxvt_codeset_conv_cns11643_1992_f,
  &rxvt_codeset_conv_big5,
  &rxvt_codeset_conv_big5_ext,
  &rxvt_codeset_conv_big5_plus,

  &rxvt_codeset_conv_viscii,

  // the 16-bit limited converter comes before the unrestricted one
  &rxvt_codeset_conv_unicode_16,
  &rxvt_codeset_conv_unicode
};

#if ENABLE_COMBINING
# define ENCODING_COMPOSE
#endif

#include "table/compose.h"

/*
 * Look up the composition of the character pair (C1, C2).
 *
 * Binary-searches rxvt_compose_table, comparing first on the c1 field and
 * then on the c2 field; the search therefore relies on the table being
 * sorted ascending by (c1, c2).
 *
 * Returns the r field of the matching entry, or NOCHAR when the pair is not
 * in the table.
 */
unicode_t
rxvt_compose (unicode_t c1, unicode_t c2)
{
  int l = 0;
  int r = sizeof (rxvt_compose_table) / sizeof (rxvt_compose_entry) - 1;

  // l and r are both inclusive bounds, so one candidate is still left when
  // they are equal; stopping at r > l would skip it and miss existing pairs.
  while (r >= l)
    {
      int m = l + (r - l) / 2; // same midpoint as (l + r) / 2, cannot overflow
      rxvt_compose_entry &c = rxvt_compose_table[m];

      if (c.c1 < c1 || (c.c1 == c1 && c.c2 < c2))
        l = m + 1; // entry sorts before the pair: continue in the upper half
      else if (c.c1 > c1 || (c.c1 == c1 && c.c2 > c2))
        r = m - 1; // entry sorts after the pair: continue in the lower half
      else
        return c.r;
    }

  return NOCHAR;
}

#include "table/category.h"

// Report whether C counts as whitespace according to the IS_SPACE macro.
// NOTE(review): IS_SPACE is presumably supplied by table/category.h, which is
// included right above -- confirm.
bool unicode::is_space (unicode_t c)
{
  const bool space = IS_SPACE (c);

  return space;
}
Revision:	1.16
Committed:	Thu Apr 8 20:31:45 2004 UTC (20 years, 3 months ago) by pcg
Content type:	text/plain
Branch:	MAIN
CVS Tags:	rel-2_8, rel-3_0
Changes since 1.15:	+22 -0 lines
Log Message:	* empty log message *
#	Content
1	/*--------------------------------*-C-*---------------------------------*
2	 * File:        encoding.C
3	 *----------------------------------------------------------------------*
4	*
5	* All portions of code are copyright by their respective author/s.
6	* Copyright (c) 2003-2004 Marc Lehmann <pcg@goof.com>
7	*
8	* This program is free software; you can redistribute it and/or modify
9	* it under the terms of the GNU General Public License as published by
10	* the Free Software Foundation; either version 2 of the License, or
11	* (at your option) any later version.
12	*
13	* This program is distributed in the hope that it will be useful,
14	* but WITHOUT ANY WARRANTY; without even the implied warranty of
15	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16	* GNU General Public License for more details.
17	*
18	* You should have received a copy of the GNU General Public License
19	* along with this program; if not, write to the Free Software
20	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21	 *----------------------------------------------------------------------*/
22
23	#include "../config.h"
24
25	#include "encoding.h"
26
27	#include <cstdlib>
28	#include <cstring>
29
30	const struct n2cs {
31	const char *name;
32	codeset cs;
33	} n2cs[] = {
34	/* first one found is the normalized one */
35	{ "ISO88591", CS_ISO8859_1 },
36	{ "ISO8859PRIMARY", CS_ISO8859_1 }, // some stupid fonts use this (hi tigert)
37	{ "ISO88592", CS_ISO8859_2 },
38	{ "ISO88593", CS_ISO8859_3 },
39	{ "ISO88594", CS_ISO8859_4 },
40	{ "ISO88595", CS_ISO8859_5 },
41	{ "ISO88596", CS_ISO8859_6 },
42	{ "ISO88597", CS_ISO8859_7 },
43	{ "ISO88598", CS_ISO8859_8 },
44	{ "ISO88599", CS_ISO8859_9 },
45	{ "ISO885910", CS_ISO8859_10 },
46	{ "ISO885911", CS_ISO8859_11 },
47	{ "ISO885913", CS_ISO8859_13 },
48	{ "ISO885914", CS_ISO8859_14 },
49	{ "ISO885915", CS_ISO8859_15 },
50	{ "FCD885915", CS_ISO8859_15 },
51	{ "ISO885916", CS_ISO8859_16 },
52
53	{ "TIS620*", CS_ISO8859_11 }, // close enough
54
55	{ "ISO10646*", CS_UNICODE },
56	{ "UNICODE", CS_UNICODE },
57	{ "UTF8", CS_UNICODE },
58
59	{ "ASCII", CS_US_ASCII },
60	{ "USASCII", CS_US_ASCII },
61	{ "ANSIX341968", CS_US_ASCII },
62
63	{ "KOI8R", CS_KOI8_R },
64	{ "GOST1976874*", CS_KOI8_R },
65	{ "KOI8RU", CS_KOI8_U },
66	{ "KOI8U", CS_KOI8_U },
67
68	{ "VISCII*", CS_VISCII },
69
70	{ "JISX0201*", CS_JIS0201_1976_0 },
71	{ "JISX0208*", CS_JIS0208_1990_0 }, // also wrongly matches -1987-0? (check Encode::JP)
72	{ "JISX0212*", CS_JIS0212_1990_0 },
73	{ "JISX021320001", CS_JIS0213_1 },
74	{ "JISX021320002", CS_JIS0213_2 },
75	{ "JISX0221*", CS_UNICODE }, // _very_ close
76
77	{ "KSC5601*", CS_KSC5601_1987_0 },
78	{ "KSX1001*", CS_KSC5601_1987_0 },
79	{ "KSC5700*", CS_UNICODE }, // unicode plus extensions
80
81	{ "BIG5P*", CS_BIG5_PLUS },
82	{ "BIG5ETEN*", CS_BIG5_EXT },
83	{ "BIG5*", CS_BIG5 },
84	{ "GB2312*", CS_GB2312_1980_0 },
85	{ "GB6345*", CS_GB2312_1980_0 }, // slightly different to gb2312??
86	{ "GB8565*", CS_GB2312_1980_0 }, // a superset of gb2312??
87	{ "GB13000*", CS_UNICODE },
88	{ "CNS1164319921", CS_CNS11643_1992_1 },
89	{ "CNS1164319922", CS_CNS11643_1992_2 },
90	{ "CNS1164319923", CS_CNS11643_1992_3 },
91	{ "CNS1164319924", CS_CNS11643_1992_4 },
92	{ "CNS1164319925", CS_CNS11643_1992_5 },
93	{ "CNS1164319926", CS_CNS11643_1992_6 },
94	{ "CNS1164319927", CS_CNS11643_1992_7 },
95	{ "CNS116431992F", CS_CNS11643_1992_F },
96
97	{ 0, CS_UNKNOWN }
98	};
99
100	static const char *
101	normalize_name (const char *name)
102	{
103	static char res[16];
104	char *r;
105
106	for (r = res; *name && r < res + 15; name++)
107	if ((*name >= '0' && *name <= '9')
108	|| (*name >= 'A' && *name <= 'Z'))
109	*r++ = *name;
110	else if (*name >= 'a' && *name <= 'z')
111	*r++ = *name - ('a' - 'A');
112
113	*r = 0;
114
115	return res;
116	}
117
118	codeset
119	codeset_from_name (const char *name)
120	{
121	if (!name)
122	return CS_UNKNOWN;
123
124	name = normalize_name (name);
125
126	const struct n2cs *i = n2cs;
127
128	do {
129	int len = strlen (i->name);
130
131	if ((i->name[len - 1] == '*'
132	&& !strncmp (name, i->name, len - 1))
133	|| !strcmp (name, i->name))
134	return i->cs;
135
136	} while ((++i)->name);
137
138	return CS_UNKNOWN;
139	}
140
141	struct rxvt_codeset_conv_unknown : rxvt_codeset_conv {
142	unicode_t to_unicode (uint32_t enc) const { return NOCHAR; }
143	uint32_t from_unicode (unicode_t unicode) const { return NOCHAR; }
144	} rxvt_codeset_conv_unknown;
145
146	struct rxvt_codeset_conv_us_ascii : rxvt_codeset_conv {
147	uint32_t from_unicode (unicode_t unicode) const { return unicode <= 127 ? unicode : NOCHAR; }
148	} rxvt_codeset_conv_us_ascii;
149
150	struct rxvt_codeset_conv_unicode : rxvt_codeset_conv {
151	/* transparent */
152	} rxvt_codeset_conv_unicode;
153
154	struct rxvt_codeset_conv_unicode_16 : rxvt_codeset_conv {
155	unicode_t to_unicode (uint32_t enc) const { return enc; }
156	uint32_t from_unicode (unicode_t unicode) const { return unicode <= 65535 ? unicode : NOCHAR; }
157	} rxvt_codeset_conv_unicode_16;
158
159	#define ENCODING_DEFAULT
160
161	#include "table/iso8859_1.h"
162	#include "table/iso8859_15.h"
163
164	//#define ENCODING_EU
165
166	#include "table/iso8859_2.h"
167	#include "table/iso8859_3.h"
168	#include "table/iso8859_4.h"
169	#include "table/iso8859_5.h"
170	#include "table/iso8859_6.h"
171	#include "table/iso8859_7.h"
172	#include "table/iso8859_8.h"
173	#include "table/iso8859_9.h"
174	#include "table/iso8859_10.h"
175	#include "table/iso8859_11.h"
176	#include "table/iso8859_13.h"
177	#include "table/iso8859_14.h"
178	#include "table/iso8859_16.h"
179
180	#include "table/koi8_r.h"
181	#include "table/koi8_u.h"
182
183	//#define ENCODING_KR
184
185	#include "table/ksc5601_1987_0.h"
186
187	//#define ENCODING_CN
188
189	#include "table/gb2312_1980_0.h"
190	#include "table/big5.h"
191
192	//#define ENCODING_CN_EXT
193
194	#include "table/cns11643_1992_1.h"
195	#include "table/cns11643_1992_2.h"
196	#include "table/cns11643_1992_3.h"
197	#include "table/cns11643_1992_4.h"
198	#include "table/cns11643_1992_5.h"
199	#include "table/cns11643_1992_6.h"
200	#include "table/cns11643_1992_7.h"
201	#include "table/cns11643_1992_f.h"
202	#include "table/big5_ext.h"
203	#include "table/big5_plus.h"
204
205	//#define ENCODING_VN
206
207	#include "table/viscii.h"
208
209	//#define ENCODING_JP
210
211	#include "table/jis0201_1976_0.h"
212	#include "table/jis0208_1990_0.h"
213	#include "table/jis0212_1990_0.h"
214
215	//#define ENCODING_JP_EXT
216
217	#include "table/jis0213_1.h"
218	#include "table/jis0213_2.h"
219
220	// order must match table in encoding.h(!)
221	const rxvt_codeset_conv *rxvt_codeset[NUM_CODESETS] = {
222	&rxvt_codeset_conv_unknown,
223
224	&rxvt_codeset_conv_us_ascii,
225
226	&rxvt_codeset_conv_iso8859_1,
227	&rxvt_codeset_conv_iso8859_2,
228	&rxvt_codeset_conv_iso8859_3,
229	&rxvt_codeset_conv_iso8859_4,
230	&rxvt_codeset_conv_iso8859_5,
231	&rxvt_codeset_conv_iso8859_6,
232	&rxvt_codeset_conv_iso8859_7,
233	&rxvt_codeset_conv_iso8859_8,
234	&rxvt_codeset_conv_iso8859_9,
235	&rxvt_codeset_conv_iso8859_10,
236	&rxvt_codeset_conv_iso8859_11,
237	&rxvt_codeset_conv_iso8859_13,
238	&rxvt_codeset_conv_iso8859_14,
239	&rxvt_codeset_conv_iso8859_15,
240	&rxvt_codeset_conv_iso8859_16,
241
242	&rxvt_codeset_conv_koi8_r,
243	&rxvt_codeset_conv_koi8_u,
244
245	&rxvt_codeset_conv_jis0201_1976_0,
246	&rxvt_codeset_conv_jis0208_1990_0,
247	&rxvt_codeset_conv_jis0212_1990_0,
248
249	&rxvt_codeset_conv_jis0213_1,
250	&rxvt_codeset_conv_jis0213_2,
251
252	&rxvt_codeset_conv_ksc5601_1987_0,
253
254	&rxvt_codeset_conv_gb2312_1980_0,
255
256	&rxvt_codeset_conv_cns11643_1992_1,
257	&rxvt_codeset_conv_cns11643_1992_2,
258	&rxvt_codeset_conv_cns11643_1992_3,
259	&rxvt_codeset_conv_cns11643_1992_4,
260	&rxvt_codeset_conv_cns11643_1992_5,
261	&rxvt_codeset_conv_cns11643_1992_6,
262	&rxvt_codeset_conv_cns11643_1992_7,
263	&rxvt_codeset_conv_cns11643_1992_f,
264	&rxvt_codeset_conv_big5,
265	&rxvt_codeset_conv_big5_ext,
266	&rxvt_codeset_conv_big5_plus,
267
268	&rxvt_codeset_conv_viscii,
269
270	&rxvt_codeset_conv_unicode_16,
271	&rxvt_codeset_conv_unicode
272	};
273
274	#if ENABLE_COMBINING
275	# define ENCODING_COMPOSE
276	#endif
277
278	#include "table/compose.h"
279
280	unicode_t
281	rxvt_compose (unicode_t c1, unicode_t c2)
282	{
283	int l = 0;
284	int r = sizeof (rxvt_compose_table) / sizeof (rxvt_compose_entry) - 1;
285	int m;
286
287	while (r > l)
288	{
289	m = (l + r) / 2;
290	rxvt_compose_entry &c = rxvt_compose_table[m];
291
292	if (c.c1 < c1 || (c.c1 == c1 && c.c2 < c2))
293	l = m + 1;
294	else if (c.c1 > c1 || (c.c1 == c1 && c.c2 > c2))
295	r = m - 1;
295	r = m - 1;
296	else
297	return c.r;
298	}
299
300	return NOCHAR;
301	}
302
303	#include "table/category.h"
304
305	bool unicode::is_space (unicode_t c)
306	{
307	return IS_SPACE (c);
308	}