rxvt-unicode/src/encoding.C

/*----------------------------------------------------------------------*
 * File:        encoding.C
 *----------------------------------------------------------------------*
 *
 * All portions of code are copyright by their respective author/s.
 * Copyright (c) 2003-2006 Marc Lehmann <pcg@goof.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *----------------------------------------------------------------------*/

#include "../config.h"

#include "encoding.h"

#include <cstdlib>
#include <cstring>

/*
 * Maps encoding names to codesets.
 *
 * Names are stored in the form produced by normalize_name (upper case
 * letters and digits only). A trailing '*' turns an entry into a prefix
 * pattern. codeset_from_name scans the table from the top and returns the
 * first entry that matches, so an exact name must always be listed BEFORE
 * any wildcard pattern that would also match it.
 *
 * The table is terminated by an entry whose name is 0.
 */
const struct n2cs {
  const char *name;
  codeset cs;
} n2cs[] = {
  /* first one found is the normalized one */
  { "ISO88591",         CS_ISO8859_1        },
  { "ISO8859PRIMARY",   CS_ISO8859_1        }, // some stupid fonts use this (hi tigert)
  { "ISO88592",         CS_ISO8859_2        },
  { "ISO88593",         CS_ISO8859_3        },
  { "ISO88594",         CS_ISO8859_4        },
  { "ISO88595",         CS_ISO8859_5        },
  { "ISO88596",         CS_ISO8859_6        },
  { "ISO88597",         CS_ISO8859_7        },
  { "ISO88598",         CS_ISO8859_8        },
  { "ISO88599",         CS_ISO8859_9        },
  { "ISO885910",        CS_ISO8859_10       },
  { "ISO885911",        CS_ISO8859_11       },
  { "ISO885913",        CS_ISO8859_13       },
  { "ISO885914",        CS_ISO8859_14       },
  { "ISO885915",        CS_ISO8859_15       },
  { "FCD885915",        CS_ISO8859_15       },
  { "ISO885916",        CS_ISO8859_16       },

  { "TIS620*",          CS_ISO8859_11       }, // close enough

  { "ISO10646*",        CS_UNICODE          },
  { "UNICODE",          CS_UNICODE          },
  { "UTF8",             CS_UNICODE          },

  { "ASCII",            CS_US_ASCII         },
  { "USASCII",          CS_US_ASCII         },
  { "ANSIX341968",      CS_US_ASCII         },
  { "ISO6461991IRV",    CS_US_ASCII         }, // older versions used the currency sign

  // the KOI8-U names must precede "KOI8R*": that pattern also matches
  // "KOI8RU" and would otherwise shadow the exact entry below
  { "KOI8RU",           CS_KOI8_U           },
  { "KOI8U",            CS_KOI8_U           },
  { "KOI8R*",           CS_KOI8_R           },
  { "GOST1976874*",     CS_KOI8_R           },

  { "VISCII*",          CS_VISCII           },

  { "JISX0201*",        CS_JIS0201_1976_0   },
  { "JISC6226*",        CS_JIS0208_1990_0   }, // also wrongly matches -1987-0? (check Encode::JP)
  { "JISX0208*",        CS_JIS0208_1990_0   }, // also wrongly matches -1987-0? (check Encode::JP)
  { "JISX0212*",        CS_JIS0212_1990_0   },
  { "JISX021320001",    CS_JIS0213_1        },
  { "JISX021320002",    CS_JIS0213_2        },
  { "JISX0221*",        CS_UNICODE          }, // _very_ close

  { "KSC5601*",         CS_KSC5601_1987_0   },
  { "KSX1001*",         CS_KSC5601_1987_0   },
  { "KSC5700*",         CS_UNICODE          }, // unicode plus extensions

  // more specific BIG5 patterns come first, the generic "BIG5*" last
  { "BIG5P*",           CS_BIG5_PLUS        },
  { "BIG5ETEN*",        CS_BIG5_EXT         },
  { "BIG5*",            CS_BIG5             },
  { "GB2312*",          CS_GB2312_1980_0    },
  { "GBK*",             CS_GBK_0            },
  { "GB6345*",          CS_GB2312_1980_0    }, // slightly different to gb2312??
  { "GB8565*",          CS_GB2312_1980_0    }, // a superset of gb2312??
  { "GB13000*",         CS_UNICODE          },
  { "CNS1164319921",    CS_CNS11643_1992_1  },
  { "CNS1164319922",    CS_CNS11643_1992_2  },
  { "CNS1164319923",    CS_CNS11643_1992_3  },
  { "CNS1164319924",    CS_CNS11643_1992_4  },
  { "CNS1164319925",    CS_CNS11643_1992_5  },
  { "CNS1164319926",    CS_CNS11643_1992_6  },
  { "CNS1164319927",    CS_CNS11643_1992_7  },
  { "CNS116431992F",    CS_CNS11643_1992_F  },

  { 0,                  CS_UNKNOWN      }   // end-of-table sentinel
};

/*
 * Reduce an encoding name to its canonical lookup form: ASCII digits and
 * upper case letters are copied, lower case letters are upper-cased and
 * every other character is dropped. At most 15 characters are produced;
 * anything beyond that is silently cut off.
 *
 * Returns a pointer to a static buffer that is overwritten by the next call.
 */
static const char *
normalize_name (const char *name)
{
  static char res[16];
  char *out = res;
  char *const limit = res + 15; // leave room for the terminating NUL

  while (*name && out < limit)
    {
      const char ch = *name++;

      if ((ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'Z'))
        *out++ = ch;
      else if (ch >= 'a' && ch <= 'z')
        *out++ = ch - ('a' - 'A'); // fold to upper case
    }

  *out = 0;

  return res;
}

/*
 * Look up the codeset for an encoding name.
 *
 * The name is normalized with normalize_name and then compared against the
 * n2cs table from top to bottom. An entry matches when it equals the
 * normalized name, or when it ends in '*' and everything before the '*' is
 * a prefix of the normalized name. The first matching entry wins.
 *
 * Returns CS_UNKNOWN when name is a null pointer or nothing matches.
 */
codeset
codeset_from_name (const char *name)
{
  if (!name)
    return CS_UNKNOWN;

  const char *wanted = normalize_name (name);

  for (const struct n2cs *entry = n2cs; entry->name; ++entry)
    {
      const char *pattern = entry->name;
      int len = strlen (pattern);

      // exact match
      if (!strcmp (wanted, pattern))
        return entry->cs;

      // wildcard entry: compare only the part in front of the '*'
      if (pattern[len - 1] == '*' && !strncmp (wanted, pattern, len - 1))
        return entry->cs;
    }

  return CS_UNKNOWN;
}

// Trivial converters for the codesets that need no lookup table. The
// ENC (<base>) macro further down in this file refers to them by the pasted
// names cs_<base>_from_unicode and cs_<base>_to_unicode.

// unknown codeset: nothing converts, in either direction
static unicode_t cs_unknown_to_unicode (uint32_t enc)          { return NOCHAR; }
static uint32_t cs_unknown_from_unicode (unicode_t unicode)    { return NOCHAR; }

// unicode: identity mapping in both directions
static unicode_t cs_unicode_to_unicode (uint32_t enc)          { return enc; }
static uint32_t cs_unicode_from_unicode (unicode_t unicode)    { return unicode; }

// us-ascii: identity towards unicode, only values 0..127 convert back
#define cs_us_ascii_to_unicode cs_unicode_to_unicode
static uint32_t cs_us_ascii_from_unicode (unicode_t unicode)   { return unicode <= 127 ? unicode : NOCHAR; }

// 16 bit unicode: identity towards unicode, only values 0..65535 convert back.
// ENC (unicode_16) expands to cs_unicode_16_to_unicode, so the alias must
// carry exactly that name or the table fails to compile with
// ENCODING_TO_UNICODE enabled.
#define cs_unicode_16_to_unicode cs_unicode_to_unicode
// misspelled historical alias -- presumably unused; kept in case an included
// table header refers to it (TODO confirm and remove)
#define cs_us_ascii_to_unicode_16 cs_unicode_to_unicode
static uint32_t cs_unicode_16_from_unicode (unicode_t unicode) { return unicode <= 65535 ? unicode : NOCHAR; }

#define ENCODING_DEFAULT

#include "table/iso8859_1.h"
#include "table/iso8859_15.h"

//#define ENCODING_EU

#include "table/iso8859_2.h"
#include "table/iso8859_3.h"
#include "table/iso8859_4.h"
#include "table/iso8859_5.h"
#include "table/iso8859_6.h"
#include "table/iso8859_7.h"
#include "table/iso8859_8.h"
#include "table/iso8859_9.h"
#include "table/iso8859_10.h"
#include "table/iso8859_11.h"
#include "table/iso8859_13.h"
#include "table/iso8859_14.h"
#include "table/iso8859_16.h"

#include "table/koi8_r.h"
#include "table/koi8_u.h"

//#define ENCODING_KR

#include "table/ksc5601_1987_0.h"

//#define ENCODING_ZH

#include "table/big5.h"
#include "table/gbk_0.h"
#include "table/gb2312_1980_0.h"

//#define ENCODING_ZH_EXT

#include "table/cns11643_1992_1.h"
#include "table/cns11643_1992_2.h"
#include "table/cns11643_1992_3.h"
#include "table/cns11643_1992_4.h"
#include "table/cns11643_1992_5.h"
#include "table/cns11643_1992_6.h"
#include "table/cns11643_1992_7.h"
#include "table/cns11643_1992_f.h"
#include "table/big5_ext.h"
#include "table/big5_plus.h"

//#define ENCODING_VN

#include "table/viscii.h"

//#define ENCODING_JP

#include "table/jis0201_1976_0.h"
#include "table/jis0208_1990_0.h"
#include "table/jis0212_1990_0.h"

//#define ENCODING_JP_EXT

#include "table/jis0213_1.h"
#include "table/jis0213_2.h"

// ENC (base) expands to the brace initializer for one rxvt_codeset entry.
// It always names the converter cs_<base>_from_unicode; when
// ENCODING_TO_UNICODE is non-zero it additionally names
// cs_<base>_to_unicode, so both identifiers must then exist for every base.
#if ENCODING_TO_UNICODE
# define ENC(base) { cs_ ## base ## _from_unicode, cs_ ## base ## _to_unicode }
#else
# define ENC(base) { cs_ ## base ## _from_unicode }
#endif


// Table of converter functions, one ENC () entry per codeset and
// NUM_CODESETS entries in total.
// NOTE(review): presumably indexed by the codeset enum value, which is why
// the sequence below has to mirror the declaration order -- confirm against
// encoding.h.
// order must match table in encoding.h(!)
const rxvt_codeset_conv rxvt_codeset[NUM_CODESETS] = {
  ENC (unknown),

  ENC (us_ascii),

  ENC (iso8859_1),
  ENC (iso8859_2),
  ENC (iso8859_3),
  ENC (iso8859_4),
  ENC (iso8859_5),
  ENC (iso8859_6),
  ENC (iso8859_7),
  ENC (iso8859_8),
  ENC (iso8859_9),
  ENC (iso8859_10),
  ENC (iso8859_11),
  ENC (iso8859_13),
  ENC (iso8859_14),
  ENC (iso8859_15),
  ENC (iso8859_16),

  ENC (koi8_r),
  ENC (koi8_u),

  ENC (jis0201_1976_0),
  ENC (jis0208_1990_0),
  ENC (jis0212_1990_0),

  ENC (jis0213_1),
  ENC (jis0213_2),

  ENC (ksc5601_1987_0),

  ENC (gb2312_1980_0),
  ENC (gbk_0),

  ENC (cns11643_1992_1),
  ENC (cns11643_1992_2),
  ENC (cns11643_1992_3),
  ENC (cns11643_1992_4),
  ENC (cns11643_1992_5),
  ENC (cns11643_1992_6),
  ENC (cns11643_1992_7),
  ENC (cns11643_1992_f),
  ENC (big5),
  ENC (big5_ext),
  ENC (big5_plus),

  ENC (viscii),

  // the two table-less unicode variants come last
  ENC (unicode_16),
  ENC (unicode),
};

#if ENABLE_COMBINING
# define ENCODING_COMPOSE
#endif

#include "table/compose.h"

/*
 * Look up the composition of the character pair (c1, c2).
 *
 * Performs a binary search over rxvt_compose_table, which therefore has to
 * be sorted in ascending order by c1 and, for equal c1, by c2.
 *
 * Returns the r member of the matching entry, or NOCHAR when the pair is
 * not in the table.
 */
unicode_t
rxvt_compose (unicode_t c1, unicode_t c2)
{
  int lo = 0;
  int hi = sizeof (rxvt_compose_table) / sizeof (rxvt_compose_entry) - 1;

  while (lo <= hi)
    {
      int mid = (lo + hi) / 2;
      rxvt_compose_entry &entry = rxvt_compose_table[mid];

      if (entry.c1 == c1 && entry.c2 == c2)
        return entry.r;

      // not equal: the entry sorts either before or after the wanted pair
      if (entry.c1 < c1 || (entry.c1 == c1 && entry.c2 < c2))
        lo = mid + 1;
      else
        hi = mid - 1;
    }

  return NOCHAR;
}

#include "table/category.h"

// Returns true when c satisfies IS_SPACE or is a horizontal tab (0x09).
// Tabs are counted as space as well because they are stored in the buffer.
bool unicode::is_space (unicode_t c)
{
  if (IS_SPACE (c))
    return true;

  return c == 0x09;
}
Revision:	1.28
Committed:	Sat Apr 4 21:27:53 2009 UTC (15 years, 3 months ago) by root
Content type:	text/plain
Branch:	MAIN
CVS Tags:	before_dynamic_fontidx, rel-9_10, dynamic_fontidx, rel-9_09, rel-9_07
Changes since 1.27:	+1 -1 lines
Log Message:	*** empty log message ***
#	User	Rev	Content
1	root	1.25	/*----------------------------------------------------------------------*
2	pcg	1.16	* File: encoding.C
3			*----------------------------------------------------------------------*
4			*
5			* All portions of code are copyright by their respective author/s.
6	root	1.24	* Copyright (c) 2003-2006 Marc Lehmann <pcg@goof.com>
7	pcg	1.16	*
8			* This program is free software; you can redistribute it and/or modify
9			* it under the terms of the GNU General Public License as published by
10			* the Free Software Foundation; either version 2 of the License, or
11			* (at your option) any later version.
12			*
13			* This program is distributed in the hope that it will be useful,
14			* but WITHOUT ANY WARRANTY; without even the implied warranty of
15			* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16			* GNU General Public License for more details.
17			*
18			* You should have received a copy of the GNU General Public License
19			* along with this program; if not, write to the Free Software
20			* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21			*----------------------------------------------------------------------*/
22
23	pcg	1.1	#include "../config.h"
24
25			#include "encoding.h"
26
27			#include <cstdlib>
28			#include <cstring>
29
30			const struct n2cs {
31			const char *name;
32			codeset cs;
33			} n2cs[] = {
34			/* first one found is the normalized one */
35	pcg	1.3	{ "ISO88591", CS_ISO8859_1 },
36			{ "ISO8859PRIMARY", CS_ISO8859_1 }, // some stupid fonts use this (hi tigert)
37			{ "ISO88592", CS_ISO8859_2 },
38			{ "ISO88593", CS_ISO8859_3 },
39			{ "ISO88594", CS_ISO8859_4 },
40			{ "ISO88595", CS_ISO8859_5 },
41			{ "ISO88596", CS_ISO8859_6 },
42			{ "ISO88597", CS_ISO8859_7 },
43			{ "ISO88598", CS_ISO8859_8 },
44			{ "ISO88599", CS_ISO8859_9 },
45			{ "ISO885910", CS_ISO8859_10 },
46			{ "ISO885911", CS_ISO8859_11 },
47			{ "ISO885913", CS_ISO8859_13 },
48			{ "ISO885914", CS_ISO8859_14 },
49			{ "ISO885915", CS_ISO8859_15 },
50			{ "FCD885915", CS_ISO8859_15 },
51			{ "ISO885916", CS_ISO8859_16 },
52	ayin	1.26
53	pcg	1.9	{ "TIS620*", CS_ISO8859_11 }, // close enough
54
55	pcg	1.3	{ "ISO10646*", CS_UNICODE },
56			{ "UNICODE", CS_UNICODE },
57			{ "UTF8", CS_UNICODE },
58	ayin	1.26
59	pcg	1.3	{ "ASCII", CS_US_ASCII },
60			{ "USASCII", CS_US_ASCII },
61			{ "ANSIX341968", CS_US_ASCII },
62	root	1.28	{ "ISO6461991IRV", CS_US_ASCII }, // older versions used the currency sign
63	ayin	1.26
64	root	1.22	{ "KOI8R*", CS_KOI8_R },
65	pcg	1.3	{ "GOST1976874*", CS_KOI8_R },
66			{ "KOI8RU", CS_KOI8_U },
67			{ "KOI8U", CS_KOI8_U },
68
69			{ "VISCII*", CS_VISCII },
70	ayin	1.26
71	pcg	1.3	{ "JISX0201*", CS_JIS0201_1976_0 },
72	root	1.17	{ "JISC6226*", CS_JIS0208_1990_0 }, // also wrongly matches -1987-0? (check Encode::JP)
73	pcg	1.15	{ "JISX0208*", CS_JIS0208_1990_0 }, // also wrongly matches -1987-0? (check Encode::JP)
74	pcg	1.3	{ "JISX0212*", CS_JIS0212_1990_0 },
75	pcg	1.15	{ "JISX021320001", CS_JIS0213_1 },
76			{ "JISX021320002", CS_JIS0213_2 },
77			{ "JISX0221*", CS_UNICODE }, // _very_ close
78	ayin	1.26
79	pcg	1.3	{ "KSC5601*", CS_KSC5601_1987_0 },
80			{ "KSX1001*", CS_KSC5601_1987_0 },
81			{ "KSC5700*", CS_UNICODE }, // unicode plus extensions
82	ayin	1.26
83	pcg	1.3	{ "BIG5P*", CS_BIG5_PLUS },
84			{ "BIG5ETEN*", CS_BIG5_EXT },
85			{ "BIG5*", CS_BIG5 },
86			{ "GB2312*", CS_GB2312_1980_0 },
87	root	1.21	{ "GBK*", CS_GBK_0 },
88	pcg	1.3	{ "GB6345*", CS_GB2312_1980_0 }, // slightly different to gb2312??
89			{ "GB8565*", CS_GB2312_1980_0 }, // a superset of gb2312??
90			{ "GB13000*", CS_UNICODE },
91			{ "CNS1164319921", CS_CNS11643_1992_1 },
92			{ "CNS1164319922", CS_CNS11643_1992_2 },
93			{ "CNS1164319923", CS_CNS11643_1992_3 },
94			{ "CNS1164319924", CS_CNS11643_1992_4 },
95			{ "CNS1164319925", CS_CNS11643_1992_5 },
96			{ "CNS1164319926", CS_CNS11643_1992_6 },
97			{ "CNS1164319927", CS_CNS11643_1992_7 },
98			{ "CNS116431992F", CS_CNS11643_1992_F },
99	pcg	1.1
100			{ 0, CS_UNKNOWN }
101			};
102
103			static const char *
104			normalize_name (const char *name)
105			{
106			static char res[16];
107			char *r;
108
109			for (r = res; *name && r < res + 15; name++)
110			if ((*name >= '0' && *name <= '9')
111			|| (*name >= 'A' && *name <= 'Z'))
112			*r++ = *name;
113			else if (*name >= 'a' && *name <= 'z')
114			*r++ = *name - ('a' - 'A');
115
116			*r = 0;
117
118			return res;
119			}
120
121			codeset
122			codeset_from_name (const char *name)
123			{
124			if (!name)
125			return CS_UNKNOWN;
126
127			name = normalize_name (name);
128
129			const struct n2cs *i = n2cs;
130
131			do {
132	pcg	1.3	int len = strlen (i->name);
133
134			if ((i->name[len - 1] == '*'
135			&& !strncmp (name, i->name, len - 1))
136			|| !strcmp (name, i->name))
137			return i->cs;
138
139	pcg	1.1	} while ((++i)->name);
140
141			return CS_UNKNOWN;
142			}
143
144	root	1.18	static unicode_t cs_unknown_to_unicode (uint32_t enc) { return NOCHAR; }
145			static uint32_t cs_unknown_from_unicode (unicode_t unicode) { return NOCHAR; }
146
147			static unicode_t cs_unicode_to_unicode (uint32_t enc) { return enc; }
148			static uint32_t cs_unicode_from_unicode (unicode_t unicode) { return unicode; }
149
150			#define cs_us_ascii_to_unicode cs_unicode_to_unicode
151			static uint32_t cs_us_ascii_from_unicode (unicode_t unicode) { return unicode <= 127 ? unicode : NOCHAR; }
152
153			#define cs_us_ascii_to_unicode_16 cs_unicode_to_unicode
154			static uint32_t cs_unicode_16_from_unicode (unicode_t unicode) { return unicode <= 65535 ? unicode : NOCHAR; }
155	pcg	1.1
156			#define ENCODING_DEFAULT
157
158			#include "table/iso8859_1.h"
159			#include "table/iso8859_15.h"
160
161			//#define ENCODING_EU
162
163			#include "table/iso8859_2.h"
164			#include "table/iso8859_3.h"
165			#include "table/iso8859_4.h"
166			#include "table/iso8859_5.h"
167			#include "table/iso8859_6.h"
168			#include "table/iso8859_7.h"
169			#include "table/iso8859_8.h"
170			#include "table/iso8859_9.h"
171			#include "table/iso8859_10.h"
172			#include "table/iso8859_11.h"
173			#include "table/iso8859_13.h"
174			#include "table/iso8859_14.h"
175			#include "table/iso8859_16.h"
176
177			#include "table/koi8_r.h"
178			#include "table/koi8_u.h"
179
180			//#define ENCODING_KR
181
182			#include "table/ksc5601_1987_0.h"
183
184	root	1.20	//#define ENCODING_ZH
185	pcg	1.1
186	root	1.21	#include "table/big5.h"
187			#include "table/gbk_0.h"
188	pcg	1.1	#include "table/gb2312_1980_0.h"
189
190	root	1.20	//#define ENCODING_ZH_EXT
191	pcg	1.1
192			#include "table/cns11643_1992_1.h"
193			#include "table/cns11643_1992_2.h"
194			#include "table/cns11643_1992_3.h"
195			#include "table/cns11643_1992_4.h"
196			#include "table/cns11643_1992_5.h"
197			#include "table/cns11643_1992_6.h"
198			#include "table/cns11643_1992_7.h"
199			#include "table/cns11643_1992_f.h"
200			#include "table/big5_ext.h"
201			#include "table/big5_plus.h"
202
203			//#define ENCODING_VN
204
205			#include "table/viscii.h"
206
207			//#define ENCODING_JP
208
209			#include "table/jis0201_1976_0.h"
210	pcg	1.15	#include "table/jis0208_1990_0.h"
211	pcg	1.1	#include "table/jis0212_1990_0.h"
212
213			//#define ENCODING_JP_EXT
214
215			#include "table/jis0213_1.h"
216			#include "table/jis0213_2.h"
217
218	root	1.18	#if ENCODING_TO_UNICODE
219			# define ENC(base) { cs_ ## base ## _from_unicode, cs_ ## base ## _to_unicode }
220			#else
221			# define ENC(base) { cs_ ## base ## _from_unicode }
222			#endif
223	ayin	1.26
224	root	1.18
225	pcg	1.3	// order must match table in encoding.h(!)
226	root	1.18	const rxvt_codeset_conv rxvt_codeset[NUM_CODESETS] = {
227			ENC (unknown),
228	pcg	1.1
229	root	1.18	ENC (us_ascii),
230	pcg	1.1
231	root	1.18	ENC (iso8859_1),
232			ENC (iso8859_2),
233			ENC (iso8859_3),
234			ENC (iso8859_4),
235			ENC (iso8859_5),
236			ENC (iso8859_6),
237			ENC (iso8859_7),
238			ENC (iso8859_8),
239			ENC (iso8859_9),
240			ENC (iso8859_10),
241			ENC (iso8859_11),
242			ENC (iso8859_13),
243			ENC (iso8859_14),
244			ENC (iso8859_15),
245			ENC (iso8859_16),
246
247			ENC (koi8_r),
248			ENC (koi8_u),
249
250			ENC (jis0201_1976_0),
251			ENC (jis0208_1990_0),
252			ENC (jis0212_1990_0),
253
254			ENC (jis0213_1),
255			ENC (jis0213_2),
256
257			ENC (ksc5601_1987_0),
258
259			ENC (gb2312_1980_0),
260	root	1.21	ENC (gbk_0),
261	root	1.18
262			ENC (cns11643_1992_1),
263			ENC (cns11643_1992_2),
264			ENC (cns11643_1992_3),
265			ENC (cns11643_1992_4),
266			ENC (cns11643_1992_5),
267			ENC (cns11643_1992_6),
268			ENC (cns11643_1992_7),
269			ENC (cns11643_1992_f),
270			ENC (big5),
271			ENC (big5_ext),
272			ENC (big5_plus),
273	pcg	1.1
274	root	1.18	ENC (viscii),
275	pcg	1.1
276	root	1.18	ENC (unicode_16),
277			ENC (unicode),
278	pcg	1.1	};
279
280	pcg	1.10	#if ENABLE_COMBINING
281			# define ENCODING_COMPOSE
282			#endif
283	pcg	1.9
284			#include "table/compose.h"
285
286	pcg	1.15	unicode_t
287			rxvt_compose (unicode_t c1, unicode_t c2)
288	pcg	1.9	{
289			int l = 0;
290			int r = sizeof (rxvt_compose_table) / sizeof (rxvt_compose_entry) - 1;
291			int m;
292
293	root	1.27	while (r >= l)
294	pcg	1.9	{
295			m = (l + r) / 2;
296			rxvt_compose_entry &c = rxvt_compose_table[m];
297
298			if (c.c1 < c1 || (c.c1 == c1 && c.c2 < c2))
299			l = m + 1;
300			else if (c.c1 > c1 || (c.c1 == c1 && c.c2 > c2))
301			r = m - 1;
302			else
303			return c.r;
304			}
305
306			return NOCHAR;
307			}
308	pcg	1.15
309			#include "table/category.h"
310
311			bool unicode::is_space (unicode_t c)
312			{
313	root	1.19	return IS_SPACE (c)
314			|| c == 0x09; // exclude tabs, too, as we store them in the buffer
315	pcg	1.15	}