1 |
/* utf8.h |
2 |
* |
3 |
* Copyright (C) 2000, 2001, 2002, by Larry Wall and others |
4 |
* |
5 |
* You may distribute under the terms of either the GNU General Public |
6 |
* License or the Artistic License, as specified in the README file. |
7 |
* |
8 |
*/ |
9 |
|
/* Use UTF-8 as the default script encoding?
 * Turning this on will break scripts having non-UTF-8 binary
 * data (such as Latin-1) in string literals.
 *
 * USE_UTF8_IN_NAMES expands to (!IN_BYTES) when USE_UTF8_SCRIPTS is
 * defined, and otherwise to a test of the HINT_UTF8 bit in PL_hints. */
#ifdef USE_UTF8_SCRIPTS
#   define USE_UTF8_IN_NAMES (!IN_BYTES)
#else
#   define USE_UTF8_IN_NAMES (PL_hints & HINT_UTF8)
#endif
18 |
|
19 |
#ifdef EBCDIC |
20 |
/* The equivalent of these macros but implementing UTF-EBCDIC |
21 |
are in the following header file: |
22 |
*/ |
23 |
|
24 |
#include "utfebcdic.h" |
25 |
#else |
26 |
START_EXTERN_C |
27 |
|
28 |
#ifdef DOINIT |
29 |
EXTCONST unsigned char PL_utf8skip[] = { |
30 |
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* ascii */ |
31 |
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* ascii */ |
32 |
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* ascii */ |
33 |
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* ascii */ |
34 |
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* bogus */ |
35 |
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* bogus */ |
36 |
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* scripts */ |
37 |
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6, /* cjk etc. */ |
38 |
7,13, /* Perl extended (not UTF-8). Up to 72bit allowed (64-bit + reserved). */ |
39 |
}; |
40 |
#else |
41 |
EXTCONST unsigned char PL_utf8skip[]; |
42 |
#endif |
43 |
|
44 |
END_EXTERN_C |
/* Look up the PL_utf8skip entry for the byte that s points at.
 * The argument is parenthesized so that a pointer expression such as
 * UTF8SKIP(p + 1) indexes by the byte at p + 1 rather than by (*p) + 1. */
#define UTF8SKIP(s) PL_utf8skip[*(const U8 *)(s)]
46 |
|
/* In this (non-EBCDIC) branch every native/Unicode/UTF transform is the
 * identity: each macro below expands to its character argument unchanged. */

/* Native character to iso-8859-1 */
#define NATIVE_TO_ASCII(ch)      (ch)
#define ASCII_TO_NATIVE(ch)      (ch)
/* Transform after encoding */
#define NATIVE_TO_UTF(ch)        (ch)
#define UTF_TO_NATIVE(ch)        (ch)
/* Transforms in wide UV chars */
#define UNI_TO_NATIVE(ch)        (ch)
#define NATIVE_TO_UNI(ch)        (ch)
/* Transforms in invariant space; the enc argument is ignored here */
#define NATIVE_TO_NEED(enc,ch)   (ch)
#define ASCII_TO_NEED(enc,ch)    (ch)
59 |
|
/* As there are no translations avoid the function wrapper: the uvchr
 * names are plain aliases for the uvuni names. */
#define Perl_utf8n_to_uvchr Perl_utf8n_to_uvuni
#define Perl_uvchr_to_utf8  Perl_uvuni_to_utf8
63 |
|
/*

 The following table is from Unicode 3.2.

 Code Points            1st Byte  2nd Byte  3rd Byte  4th Byte

   U+0000..U+007F       00..7F
   U+0080..U+07FF       C2..DF    80..BF
   U+0800..U+0FFF       E0        A0..BF    80..BF
   U+1000..U+CFFF       E1..EC    80..BF    80..BF
   U+D000..U+D7FF       ED        80..9F    80..BF
   U+D800..U+DFFF       ******* ill-formed *******
   U+E000..U+FFFF       EE..EF    80..BF    80..BF
  U+10000..U+3FFFF      F0        90..BF    80..BF    80..BF
  U+40000..U+FFFFF      F1..F3    80..BF    80..BF    80..BF
 U+100000..U+10FFFF     F4        80..8F    80..BF    80..BF

Note the A0..BF in U+0800..U+0FFF, the 80..9F in U+D000...U+D7FF,
the 90..BF in U+10000..U+3FFFF, and the 80...8F in U+100000..U+10FFFF.
The "gaps" are caused by legal UTF-8 avoiding non-shortest encodings:
it is technically possible to UTF-8-encode a single code point in different
ways, but that is explicitly forbidden, and the shortest possible encoding
should always be used (and that is what Perl does).

 */
89 |
|
/*
 Another way to look at it, as bits:

 Code Points                    1st Byte   2nd Byte  3rd Byte  4th Byte

                    0aaaaaaa     0aaaaaaa
            00000bbbbbaaaaaa     110bbbbb  10aaaaaa
            ccccbbbbbbaaaaaa     1110cccc  10bbbbbb  10aaaaaa
  00000dddccccccbbbbbbaaaaaa     11110ddd  10cccccc  10bbbbbb  10aaaaaa

As you can see, the continuation bytes all begin with C<10>, and the
leading bits of the start byte tell how many bytes there are in the
encoded character.

*/
105 |
|
106 |
|
/* Byte / code point classification.
 *
 * Every cast is applied to the fully parenthesized argument so that an
 * expression argument (e.g. a | b) is converted as a whole instead of
 * having the cast bind to its first operand only.  UTF8_IS_START and
 * UTF8_IS_CONTINUATION evaluate their argument twice. */
#define UNI_IS_INVARIANT(c)             (((UV)(c)) < 0x80)
#define UTF8_IS_INVARIANT(c)            UNI_IS_INVARIANT(NATIVE_TO_UTF(c))
#define NATIVE_IS_INVARIANT(c)          UNI_IS_INVARIANT(NATIVE_TO_ASCII(c))
/* 0xC0..0xFD */
#define UTF8_IS_START(c)                (((U8)(c)) >= 0xc0 && (((U8)(c)) <= 0xfd))
/* 0x80..0xBF */
#define UTF8_IS_CONTINUATION(c)         (((U8)(c)) >= 0x80 && (((U8)(c)) <= 0xbf))
/* High bit set; yields 0x80 or 0, not 1 or 0 */
#define UTF8_IS_CONTINUED(c)            (((U8)(c)) & 0x80)
/* 0xC0..0xC3 */
#define UTF8_IS_DOWNGRADEABLE_START(c)  ((((U8)(c)) & 0xfc) == 0xc0)
114 |
|
/* Start-byte marker for a len-byte sequence: 0xC0 for 2, 0xE0 for 3, ...
 * 0xFE for 7, 0xFF beyond.  The shifted value is masked to 8 bits so the
 * result is always a byte value, and len is parenthesized so expression
 * arguments are handled correctly. */
#define UTF_START_MARK(len) (((len) > 7) ? 0xFF : (0xFF & (0xFE << (7 - (len)))))
/* Mask of the payload bits carried by the start byte of a len-byte
 * sequence (0x1F for 2, 0x0F for 3, ... 0x00 for 7 and up).
 * len is expected to be at least 2. */
#define UTF_START_MASK(len) (((len) >= 7) ? 0x00 : (0x1F >> ((len) - 2)))

#define UTF_CONTINUATION_MARK           0x80
#define UTF_ACCUMULATION_SHIFT          6
#define UTF_CONTINUATION_MASK           ((U8)0x3f)
/* Shift the value accumulated so far and append the 6 payload bits of
 * the continuation byte new. */
#define UTF8_ACCUMULATE(old, new)       (((old) << UTF_ACCUMULATION_SHIFT) | (((U8)(new)) & UTF_CONTINUATION_MASK))

/* The two bytes of the 2-byte encoding of an 8-bit value c. */
#define UTF8_EIGHT_BIT_HI(c)    ((((U8)(c)) >> UTF_ACCUMULATION_SHIFT) | UTF_START_MARK(2))
#define UTF8_EIGHT_BIT_LO(c)    ((((U8)(c)) & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK)
125 |
|
/* UNISKIP(uv): number of bytes needed to encode the value uv, chosen by
 * comparing uv against successive thresholds.  With HAS_QUAD, values at
 * or above UTF8_QUAD_MAX take 13 bytes; without it the answer tops out
 * at 7.  uv is evaluated more than once. */
#ifdef HAS_QUAD
#define UNISKIP(uv) ( (uv) < 0x80           ? 1 : \
		      (uv) < 0x800          ? 2 : \
		      (uv) < 0x10000        ? 3 : \
		      (uv) < 0x200000       ? 4 : \
		      (uv) < 0x4000000      ? 5 : \
		      (uv) < 0x80000000     ? 6 : \
		      (uv) < UTF8_QUAD_MAX  ? 7 : 13 )
#else
/* No, I'm not even going to *TRY* putting #ifdef inside a #define */
#define UNISKIP(uv) ( (uv) < 0x80           ? 1 : \
		      (uv) < 0x800          ? 2 : \
		      (uv) < 0x10000        ? 3 : \
		      (uv) < 0x200000       ? 4 : \
		      (uv) < 0x4000000      ? 5 : \
		      (uv) < 0x80000000     ? 6 : 7 )
#endif
143 |
|
/*
 * Note: we try to be careful never to call the isXXX_utf8() functions
 * unless we're pretty sure we've seen the beginning of a UTF-8 character
 * (that is, the two high bits are set).  Otherwise we risk loading in the
 * heavy-duty SWASHINIT and SWASHGET routines unnecessarily.
 *
 * Both macros take the byte path (isXXX on *p) when IN_BYTES is true,
 * when c is false, or when the byte at p is below 0xC0; only otherwise
 * do they call the _utf8 variant.  p and c are parenthesized so that
 * expression arguments (p + 1, a & b) are negated/cast as a whole.
 */
#define isIDFIRST_lazy_if(p,c) ((IN_BYTES || (!(c) || (*((const U8*)(p)) < 0xc0))) \
				? isIDFIRST(*(p)) \
				: isIDFIRST_utf8((U8*)(p)))
#define isALNUM_lazy_if(p,c)   ((IN_BYTES || (!(c) || (*((const U8*)(p)) < 0xc0))) \
				? isALNUM(*(p)) \
				: isALNUM_utf8((U8*)(p)))
156 |
|
157 |
|
158 |
#endif /* EBCDIC vs ASCII */ |
159 |
|
160 |
/* Rest of these are attributes of Unicode and perl's internals rather than the encoding */ |
161 |
|
/* Convenience forms of the _lazy_if macros with the condition fixed to 1. */
#define isIDFIRST_lazy(p)	isIDFIRST_lazy_if(p,1)
#define isALNUM_lazy(p)		isALNUM_lazy_if(p,1)
164 |
|
/* How wide can a single UTF-8 encoded character become in bytes.
 * NOTE: Strictly speaking Perl's UTF-8 should not be called UTF-8
 * since UTF-8 is an encoding of Unicode and given Unicode's current
 * upper limit only four bytes is possible.  Perl thinks of UTF-8
 * as a way to encode non-negative integers in a binary format. */
#define UTF8_MAXBYTES 13
/* Alternative name for the same limit. */
#define UTF8_MAXLEN UTF8_MAXBYTES
172 |
|
/* Obsolete limits; do not use in new code. */
#define UTF8_MAXLEN_UCLC	3	/* Obsolete, do not use. */
#define UTF8_MAXLEN_UCLC_MULT	39	/* Obsolete, do not use. */
#define UTF8_MAXLEN_FOLD	3	/* Obsolete, do not use. */
#define UTF8_MAXLEN_FOLD_MULT	39	/* Obsolete, do not use. */
177 |
|
/* The maximum number of UTF-8 bytes a single Unicode character can
 * uppercase/lowercase/fold into; this number depends on the Unicode
 * version.  An example of maximal expansion is the U+03B0 which
 * uppercases to U+03C5 U+0308 U+0301.  The Unicode databases that
 * tell these things are UnicodeData.txt, CaseFolding.txt, and
 * SpecialCasing.txt. */
#define UTF8_MAXBYTES_CASE	6
185 |
|
/* IN_BYTES: non-zero when the HINT_BYTES bit is set in
 * PL_curcop->op_private. */
#define IN_BYTES (PL_curcop->op_private & HINT_BYTES)
/* DO_UTF8(sv): true when SvUTF8(sv) is true and IN_BYTES is false. */
#define DO_UTF8(sv) (SvUTF8(sv) && !IN_BYTES)
188 |
|
/* UTF8_ALLOW_* / UTF8_CHECK_ONLY: single-bit flags plus two
 * combinations.  UTF8_ALLOW_ANY covers every ALLOW bit (0x01..0x80);
 * UTF8_CHECK_ONLY lies outside that range. */
#define UTF8_ALLOW_EMPTY		0x0001
#define UTF8_ALLOW_CONTINUATION		0x0002
#define UTF8_ALLOW_NON_CONTINUATION	0x0004
#define UTF8_ALLOW_FE_FF		0x0008
#define UTF8_ALLOW_SHORT		0x0010
#define UTF8_ALLOW_SURROGATE		0x0020
#define UTF8_ALLOW_FFFF			0x0040 /* Allows also FFFE. */
#define UTF8_ALLOW_LONG			0x0080
#define UTF8_ALLOW_ANYUV		(UTF8_ALLOW_EMPTY|UTF8_ALLOW_FE_FF|\
					 UTF8_ALLOW_SURROGATE|UTF8_ALLOW_FFFF)
#define UTF8_ALLOW_ANY			0x00FF
#define UTF8_CHECK_ONLY			0x0200
201 |
|
/* Notable code points. */
#define UNICODE_SURROGATE_FIRST		0xD800
#define UNICODE_SURROGATE_LAST		0xDFFF
#define UNICODE_REPLACEMENT		0xFFFD
#define UNICODE_BYTE_ORDER_MARK		0xFEFF
#define UNICODE_ILLEGAL			0xFFFF

/* Though our UTF-8 encoding can go beyond this,
 * let's be conservative and do as Unicode 3.2 says. */
#define PERL_UNICODE_MAX	0x10FFFF
211 |
|
/* UNICODE_ALLOW_*: single-bit flags; UNICODE_ALLOW_ANY is all four. */
#define UNICODE_ALLOW_SURROGATE 0x0001	/* Allow UTF-16 surrogates (EVIL) */
#define UNICODE_ALLOW_FDD0	0x0002	/* Allow the U+FDD0...U+FDEF */
#define UNICODE_ALLOW_FFFF	0x0004	/* Allow 0xFFF[EF], 0x1FFF[EF], ... */
#define UNICODE_ALLOW_SUPER	0x0008	/* Allow past 0x10FFFF */
#define UNICODE_ALLOW_ANY	0x000F
217 |
|
/* Code point predicates built on the UNICODE_* constants.
 * UNICODE_IS_SURROGATE evaluates c twice. */
#define UNICODE_IS_SURROGATE(c)		((c) >= UNICODE_SURROGATE_FIRST && \
					 (c) <= UNICODE_SURROGATE_LAST)
#define UNICODE_IS_REPLACEMENT(c)	((c) == UNICODE_REPLACEMENT)
#define UNICODE_IS_BYTE_ORDER_MARK(c)	((c) == UNICODE_BYTE_ORDER_MARK)
#define UNICODE_IS_ILLEGAL(c)		((c) == UNICODE_ILLEGAL)
223 |
|
/* 2**36; only defined when HAS_QUAD is.  The HAS_QUAD form of UNISKIP
 * compares against it to choose between 7 and 13 bytes. */
#ifdef HAS_QUAD
#    define UTF8_QUAD_MAX	UINT64_C(0x1000000000)
#endif
227 |
|
/* Alias for UTF8_IS_INVARIANT. */
#define UTF8_IS_ASCII(c) UTF8_IS_INVARIANT(c)
229 |
|
/* Code points of individual letters that get special treatment. */
#define UNICODE_LATIN_SMALL_LETTER_SHARP_S	0x00DF
#define UNICODE_GREEK_CAPITAL_LETTER_SIGMA	0x03A3
#define UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA	0x03C2
#define UNICODE_GREEK_SMALL_LETTER_SIGMA	0x03C3

/* The sharp s in the EBCDIC build (used by ANYOF_FOLD_SHARP_S there). */
#define EBCDIC_LATIN_SMALL_LETTER_SHARP_S	0x0059
236 |
|
/* UNI_DISPLAY_* flags.  QQ and REGEX are both the combination of the
 * two single-bit flags. */
#define UNI_DISPLAY_ISPRINT	0x0001
#define UNI_DISPLAY_BACKSLASH	0x0002
#define UNI_DISPLAY_QQ		(UNI_DISPLAY_ISPRINT|UNI_DISPLAY_BACKSLASH)
#define UNI_DISPLAY_REGEX	(UNI_DISPLAY_ISPRINT|UNI_DISPLAY_BACKSLASH)
241 |
|
/* ANYOF_FOLD_SHARP_S(node, input, end): true when
 *   - the node's bitmap contains the sharp s (the EBCDIC or the Unicode
 *     code point, depending on the build),
 *   - the node has both the ANYOF_UNICODE and ANYOF_FOLD flags,
 *   - at least two bytes are available in [input, end), and
 *   - those two bytes lowercase to "ss".
 * The two branches differ only in which sharp s constant they test. */
#ifdef EBCDIC
#   define ANYOF_FOLD_SHARP_S(node, input, end)	\
	(ANYOF_BITMAP_TEST(node, EBCDIC_LATIN_SMALL_LETTER_SHARP_S) && \
	 (ANYOF_FLAGS(node) & ANYOF_UNICODE) && \
	 (ANYOF_FLAGS(node) & ANYOF_FOLD) && \
	 ((end) > (input) + 1) && \
	 toLOWER((input)[0]) == 's' && \
	 toLOWER((input)[1]) == 's')
#else
#   define ANYOF_FOLD_SHARP_S(node, input, end)	\
	(ANYOF_BITMAP_TEST(node, UNICODE_LATIN_SMALL_LETTER_SHARP_S) && \
	 (ANYOF_FLAGS(node) & ANYOF_UNICODE) && \
	 (ANYOF_FLAGS(node) & ANYOF_FOLD) && \
	 ((end) > (input) + 1) && \
	 toLOWER((input)[0]) == 's' && \
	 toLOWER((input)[1]) == 's')
#endif
/* Length of the "ss" sequence tested above. */
#define SHARP_S_SKIP 2