blob: cd25f10a2e69fae2f969a6e2fc0a54d37547a2c4 [file] [log] [blame]
Neil Booth1613e522003-04-20 07:29:23 +00001/* CPP Library - charsets
Kazu Hiratad9221e012004-01-21 20:40:04 +00002 Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004
Neil Booth1613e522003-04-20 07:29:23 +00003 Free Software Foundation, Inc.
4
5 Broken out of c-lex.c Apr 2003, adding valid C99 UCN ranges.
6
7This program is free software; you can redistribute it and/or modify it
8under the terms of the GNU General Public License as published by the
9Free Software Foundation; either version 2, or (at your option) any
10later version.
11
12This program is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15GNU General Public License for more details.
16
17You should have received a copy of the GNU General Public License
18along with this program; if not, write to the Free Software
19Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
20
21#include "config.h"
22#include "system.h"
Neil Booth1613e522003-04-20 07:29:23 +000023#include "cpplib.h"
Paolo Bonzini4f4e53dd2004-05-24 10:50:45 +000024#include "internal.h"
25#include "ucnid.h"
Neil Booth1613e522003-04-20 07:29:23 +000026
Zack Weinberge6cc3a22003-07-05 00:24:00 +000027/* Character set handling for C-family languages.
28
29 Terminological note: In what follows, "charset" or "character set"
30 will be taken to mean both an abstract set of characters and an
31 encoding for that set.
32
33 The C99 standard discusses two character sets: source and execution.
34 The source character set is used for internal processing in translation
35 phases 1 through 4; the execution character set is used thereafter.
36 Both are required by 5.2.1.2p1 to be multibyte encodings, not wide
37 character encodings (see 3.7.2, 3.7.3 for the standardese meanings
38 of these terms). Furthermore, the "basic character set" (listed in
39 5.2.1p3) is to be encoded in each with values one byte wide, and is
40 to appear in the initial shift state.
41
42 It is not explicitly mentioned, but there is also a "wide execution
43 character set" used to encode wide character constants and wide
44 string literals; this is supposed to be the result of applying the
45 standard library function mbstowcs() to an equivalent narrow string
46 (6.4.5p5). However, the behavior of hexadecimal and octal
47 \-escapes is at odds with this; they are supposed to be translated
48 directly to wchar_t values (6.4.4.4p5,6).
49
50 The source character set is not necessarily the character set used
51 to encode physical source files on disk; translation phase 1 converts
52 from whatever that encoding is to the source character set.
53
54 The presence of universal character names in C99 (6.4.3 et seq.)
55 forces the source character set to be isomorphic to ISO 10646,
56 that is, Unicode. There is no such constraint on the execution
57 character set; note also that the conversion from source to
58 execution character set does not occur for identifiers (5.1.1.2p1#5).
59
60 For convenience of implementation, the source character set's
61 encoding of the basic character set should be identical to the
62 execution character set OF THE HOST SYSTEM's encoding of the basic
63 character set, and it should not be a state-dependent encoding.
64
65 cpplib uses UTF-8 or UTF-EBCDIC for the source character set,
66 depending on whether the host is based on ASCII or EBCDIC (see
67 respectively Unicode section 2.3/ISO10646 Amendment 2, and Unicode
Zack Weinbergdea55da2003-07-11 05:49:47 +000068 Technical Report #16). With limited exceptions, it relies on the
69 system library's iconv() primitive to do charset conversion
70 (specified in SUSv2). */
Zack Weinberge6cc3a22003-07-05 00:24:00 +000071
#if !HAVE_ICONV
/* The calls to iconv(), iconv_open() and iconv_close() further down
   are guarded only by `if' statements whose conditions are
   compile-time constants.  When the host has no iconv, supply stubs
   that always fail so those calls still compile and do not cause
   link errors.  */
#define iconv_open(x, y) (errno = EINVAL, (iconv_t)-1)
#define iconv(a,b,c,d,e) (errno = EINVAL, (size_t)-1)
#define iconv_close(x) (void)0
#define ICONV_CONST
#endif

/* Pick the name of the source character set, and the largest value
   that can possibly be a member of the basic source character set,
   from the host's basic character set.  */
#if HOST_CHARSET == HOST_CHARSET_ASCII
#define SOURCE_CHARSET "UTF-8"
#define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0x7e
#elif HOST_CHARSET == HOST_CHARSET_EBCDIC
#define SOURCE_CHARSET "UTF-EBCDIC"
#define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0xFF
#else
#error "Unrecognized basic host character set"
#endif

/* Hosts that do not define EILSEQ report ill-formed input as EINVAL.  */
#ifndef EILSEQ
#define EILSEQ EINVAL
#endif
95
Zack Weinberg6b883142003-07-10 23:16:31 +000096/* This structure is used for a resizable string buffer throughout. */
Nathanael Nerodea8016862003-09-26 05:52:43 +000097/* Don't call it strbuf, as that conflicts with unistd.h on systems
Kazu Hirata0ee55ad2003-10-05 13:09:48 +000098 such as DYNIX/ptx where unistd.h includes stropts.h. */
Nathanael Nerodea8016862003-09-26 05:52:43 +000099struct _cpp_strbuf
Zack Weinberge6cc3a22003-07-05 00:24:00 +0000100{
101 uchar *text;
102 size_t asize;
103 size_t len;
104};
105
106/* This is enough to hold any string that fits on a single 80-column
107 line, even if iconv quadruples its size (e.g. conversion from
Zack Weinberg6b883142003-07-10 23:16:31 +0000108 ASCII to UTF-32) rounded up to a power of two. */
Zack Weinberge6cc3a22003-07-05 00:24:00 +0000109#define OUTBUF_BLOCK_SIZE 256
110
Zack Weinberg6b883142003-07-10 23:16:31 +0000111/* Conversions between UTF-8 and UTF-16/32 are implemented by custom
112 logic. This is because a depressing number of systems lack iconv,
113 or have iconv libraries that do not do these conversions, so
114 we need a fallback implementation for them. To ensure the fallback
115 doesn't break due to neglect, it is used on all systems.
116
117 UTF-32 encoding is nice and simple: a four-byte binary number,
118 constrained to the range 00000000-7FFFFFFF to avoid questions of
119 signedness. We do have to cope with big- and little-endian
120 variants.
121
122 UTF-16 encoding uses two-byte binary numbers, again in big- and
123 little-endian variants, for all values in the 00000000-0000FFFF
124 range. Values in the 00010000-0010FFFF range are encoded as pairs
125 of two-byte numbers, called "surrogate pairs": given a number S in
126 this range, it is mapped to a pair (H, L) as follows:
127
128 H = (S - 0x10000) / 0x400 + 0xD800
129 L = (S - 0x10000) % 0x400 + 0xDC00
130
131 Two-byte values in the D800...DFFF range are ill-formed except as a
132 component of a surrogate pair. Even if the encoding within a
133 two-byte value is little-endian, the H member of the surrogate pair
134 comes first.
135
136 There is no way to encode values in the 00110000-7FFFFFFF range,
137 which is not currently a problem as there are no assigned code
138 points in that range; however, the author expects that it will
139 eventually become necessary to abandon UTF-16 due to this
140 limitation. Note also that, because of these pairs, UTF-16 does
141 not meet the requirements of the C standard for a wide character
142 encoding (see 3.7.3 and 6.4.4.4p11).
143
144 UTF-8 encoding looks like this:
145
146 value range encoded as
147 00000000-0000007F 0xxxxxxx
148 00000080-000007FF 110xxxxx 10xxxxxx
149 00000800-0000FFFF 1110xxxx 10xxxxxx 10xxxxxx
150 00010000-001FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
151 00200000-03FFFFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
152 04000000-7FFFFFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
153
154 Values in the 0000D800 ... 0000DFFF range (surrogates) are invalid,
155 which means that three-byte sequences ED xx yy, with A0 <= xx <= BF,
156 never occur. Note also that any value that can be encoded by a
157 given row of the table can also be encoded by all successive rows,
158 but this is not done; only the shortest possible encoding for any
159 given value is valid. For instance, the character 07C0 could be
160 encoded as any of DF 80, E0 9F 80, F0 80 9F 80, F8 80 80 9F 80, or
161 FC 80 80 80 9F 80. Only the first is valid.
162
163 An implementation note: the transformation from UTF-16 to UTF-8, or
164 vice versa, is easiest done by using UTF-32 as an intermediary. */
165
166/* Internal primitives which go from an UTF-8 byte stream to native-endian
167 UTF-32 in a cppchar_t, or vice versa; this avoids an extra marshal/unmarshal
168 operation in several places below. */
169static inline int
170one_utf8_to_cppchar (const uchar **inbufp, size_t *inbytesleftp,
171 cppchar_t *cp)
172{
173 static const uchar masks[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x02, 0x01 };
174 static const uchar patns[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
Eric Christophercf551fb2004-01-16 22:37:49 +0000175
Zack Weinberg6b883142003-07-10 23:16:31 +0000176 cppchar_t c;
177 const uchar *inbuf = *inbufp;
178 size_t nbytes, i;
179
180 if (*inbytesleftp < 1)
181 return EINVAL;
182
183 c = *inbuf;
184 if (c < 0x80)
185 {
186 *cp = c;
187 *inbytesleftp -= 1;
188 *inbufp += 1;
189 return 0;
190 }
191
192 /* The number of leading 1-bits in the first byte indicates how many
193 bytes follow. */
194 for (nbytes = 2; nbytes < 7; nbytes++)
195 if ((c & ~masks[nbytes-1]) == patns[nbytes-1])
196 goto found;
197 return EILSEQ;
198 found:
199
200 if (*inbytesleftp < nbytes)
201 return EINVAL;
202
203 c = (c & masks[nbytes-1]);
204 inbuf++;
205 for (i = 1; i < nbytes; i++)
206 {
207 cppchar_t n = *inbuf++;
208 if ((n & 0xC0) != 0x80)
209 return EILSEQ;
210 c = ((c << 6) + (n & 0x3F));
211 }
212
213 /* Make sure the shortest possible encoding was used. */
214 if (c <= 0x7F && nbytes > 1) return EILSEQ;
215 if (c <= 0x7FF && nbytes > 2) return EILSEQ;
216 if (c <= 0xFFFF && nbytes > 3) return EILSEQ;
217 if (c <= 0x1FFFFF && nbytes > 4) return EILSEQ;
218 if (c <= 0x3FFFFFF && nbytes > 5) return EILSEQ;
219
220 /* Make sure the character is valid. */
221 if (c > 0x7FFFFFFF || (c >= 0xD800 && c <= 0xDFFF)) return EILSEQ;
222
223 *cp = c;
224 *inbufp = inbuf;
225 *inbytesleftp -= nbytes;
226 return 0;
227}
228
229static inline int
230one_cppchar_to_utf8 (cppchar_t c, uchar **outbufp, size_t *outbytesleftp)
231{
232 static const uchar masks[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
233 static const uchar limits[6] = { 0x80, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE };
234 size_t nbytes;
235 uchar buf[6], *p = &buf[6];
236 uchar *outbuf = *outbufp;
237
238 nbytes = 1;
239 if (c < 0x80)
240 *--p = c;
241 else
242 {
243 do
244 {
245 *--p = ((c & 0x3F) | 0x80);
246 c >>= 6;
247 nbytes++;
248 }
249 while (c >= 0x3F || (c & limits[nbytes-1]));
250 *--p = (c | masks[nbytes-1]);
251 }
252
253 if (*outbytesleftp < nbytes)
254 return E2BIG;
255
256 while (p < &buf[6])
257 *outbuf++ = *p++;
258 *outbytesleftp -= nbytes;
259 *outbufp = outbuf;
260 return 0;
261}
262
263/* The following four functions transform one character between the two
264 encodings named in the function name. All have the signature
265 int (*)(iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
266 uchar **outbufp, size_t *outbytesleftp)
267
268 BIGEND must have the value 0 or 1, coerced to (iconv_t); it is
269 interpreted as a boolean indicating whether big-endian or
270 little-endian encoding is to be used for the member of the pair
271 that is not UTF-8.
272
273 INBUFP, INBYTESLEFTP, OUTBUFP, OUTBYTESLEFTP work exactly as they
274 do for iconv.
275
276 The return value is either 0 for success, or an errno value for
277 failure, which may be E2BIG (need more space), EILSEQ (ill-formed
278 input sequence), or EINVAL (incomplete input sequence). */
Eric Christophercf551fb2004-01-16 22:37:49 +0000279
Zack Weinberg6b883142003-07-10 23:16:31 +0000280static inline int
281one_utf8_to_utf32 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
282 uchar **outbufp, size_t *outbytesleftp)
283{
284 uchar *outbuf;
Jan Hubickabd184962003-10-25 17:12:01 +0200285 cppchar_t s = 0;
Zack Weinberg6b883142003-07-10 23:16:31 +0000286 int rval;
287
288 /* Check for space first, since we know exactly how much we need. */
289 if (*outbytesleftp < 4)
290 return E2BIG;
291
292 rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
293 if (rval)
294 return rval;
295
296 outbuf = *outbufp;
297 outbuf[bigend ? 3 : 0] = (s & 0x000000FF);
298 outbuf[bigend ? 2 : 1] = (s & 0x0000FF00) >> 8;
299 outbuf[bigend ? 1 : 2] = (s & 0x00FF0000) >> 16;
300 outbuf[bigend ? 0 : 3] = (s & 0xFF000000) >> 24;
301
302 *outbufp += 4;
303 *outbytesleftp -= 4;
304 return 0;
305}
306
307static inline int
308one_utf32_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
309 uchar **outbufp, size_t *outbytesleftp)
310{
311 cppchar_t s;
312 int rval;
313 const uchar *inbuf;
314
315 if (*inbytesleftp < 4)
316 return EINVAL;
317
318 inbuf = *inbufp;
319
320 s = inbuf[bigend ? 0 : 3] << 24;
321 s += inbuf[bigend ? 1 : 2] << 16;
322 s += inbuf[bigend ? 2 : 1] << 8;
323 s += inbuf[bigend ? 3 : 0];
324
325 if (s >= 0x7FFFFFFF || (s >= 0xD800 && s <= 0xDFFF))
326 return EILSEQ;
327
328 rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
329 if (rval)
330 return rval;
331
332 *inbufp += 4;
333 *inbytesleftp -= 4;
334 return 0;
335}
336
337static inline int
338one_utf8_to_utf16 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
339 uchar **outbufp, size_t *outbytesleftp)
340{
341 int rval;
Richard Henderson671ca9e2003-10-30 08:36:27 -0800342 cppchar_t s = 0;
Zack Weinberg6b883142003-07-10 23:16:31 +0000343 const uchar *save_inbuf = *inbufp;
344 size_t save_inbytesleft = *inbytesleftp;
345 uchar *outbuf = *outbufp;
346
347 rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
348 if (rval)
349 return rval;
350
351 if (s > 0x0010FFFF)
352 {
353 *inbufp = save_inbuf;
354 *inbytesleftp = save_inbytesleft;
355 return EILSEQ;
356 }
357
358 if (s < 0xFFFF)
359 {
360 if (*outbytesleftp < 2)
361 {
362 *inbufp = save_inbuf;
363 *inbytesleftp = save_inbytesleft;
364 return E2BIG;
365 }
366 outbuf[bigend ? 1 : 0] = (s & 0x00FF);
367 outbuf[bigend ? 0 : 1] = (s & 0xFF00) >> 8;
368
369 *outbufp += 2;
370 *outbytesleftp -= 2;
371 return 0;
372 }
373 else
374 {
375 cppchar_t hi, lo;
376
377 if (*outbytesleftp < 4)
378 {
379 *inbufp = save_inbuf;
380 *inbytesleftp = save_inbytesleft;
381 return E2BIG;
382 }
383
384 hi = (s - 0x10000) / 0x400 + 0xD800;
385 lo = (s - 0x10000) % 0x400 + 0xDC00;
386
387 /* Even if we are little-endian, put the high surrogate first.
388 ??? Matches practice? */
389 outbuf[bigend ? 1 : 0] = (hi & 0x00FF);
390 outbuf[bigend ? 0 : 1] = (hi & 0xFF00) >> 8;
391 outbuf[bigend ? 3 : 2] = (lo & 0x00FF);
392 outbuf[bigend ? 2 : 3] = (lo & 0xFF00) >> 8;
393
394 *outbufp += 4;
395 *outbytesleftp -= 4;
396 return 0;
397 }
398}
399
400static inline int
401one_utf16_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
402 uchar **outbufp, size_t *outbytesleftp)
403{
404 cppchar_t s;
405 const uchar *inbuf = *inbufp;
406 int rval;
407
408 if (*inbytesleftp < 2)
409 return EINVAL;
410 s = inbuf[bigend ? 0 : 1] << 8;
411 s += inbuf[bigend ? 1 : 0];
412
413 /* Low surrogate without immediately preceding high surrogate is invalid. */
414 if (s >= 0xDC00 && s <= 0xDFFF)
415 return EILSEQ;
416 /* High surrogate must have a following low surrogate. */
417 else if (s >= 0xD800 && s <= 0xDBFF)
418 {
419 cppchar_t hi = s, lo;
420 if (*inbytesleftp < 4)
421 return EINVAL;
422
423 lo = inbuf[bigend ? 2 : 3] << 8;
424 lo += inbuf[bigend ? 3 : 2];
425
426 if (lo < 0xDC00 || lo > 0xDFFF)
427 return EILSEQ;
428
429 s = (hi - 0xD800) * 0x400 + (lo - 0xDC00) + 0x10000;
430 }
431
432 rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
433 if (rval)
434 return rval;
435
436 /* Success - update the input pointers (one_cppchar_to_utf8 has done
437 the output pointers for us). */
438 if (s <= 0xFFFF)
439 {
440 *inbufp += 2;
441 *inbytesleftp -= 2;
442 }
443 else
444 {
445 *inbufp += 4;
446 *inbytesleftp -= 4;
447 }
448 return 0;
449}
450
451/* Helper routine for the next few functions. The 'const' on
452 one_conversion means that we promise not to modify what function is
Kazu Hirata4ed43212003-07-12 22:49:48 +0000453 pointed to, which lets the inliner see through it. */
Zack Weinberg6b883142003-07-10 23:16:31 +0000454
455static inline bool
456conversion_loop (int (*const one_conversion)(iconv_t, const uchar **, size_t *,
457 uchar **, size_t *),
Nathanael Nerodea8016862003-09-26 05:52:43 +0000458 iconv_t cd, const uchar *from, size_t flen, struct _cpp_strbuf *to)
Zack Weinberg6b883142003-07-10 23:16:31 +0000459{
460 const uchar *inbuf;
461 uchar *outbuf;
462 size_t inbytesleft, outbytesleft;
463 int rval;
464
465 inbuf = from;
466 inbytesleft = flen;
467 outbuf = to->text + to->len;
468 outbytesleft = to->asize - to->len;
469
470 for (;;)
471 {
472 do
473 rval = one_conversion (cd, &inbuf, &inbytesleft,
474 &outbuf, &outbytesleft);
475 while (inbytesleft && !rval);
476
477 if (__builtin_expect (inbytesleft == 0, 1))
478 {
479 to->len = to->asize - outbytesleft;
480 return true;
481 }
482 if (rval != E2BIG)
483 {
484 errno = rval;
485 return false;
486 }
487
488 outbytesleft += OUTBUF_BLOCK_SIZE;
489 to->asize += OUTBUF_BLOCK_SIZE;
490 to->text = xrealloc (to->text, to->asize);
491 outbuf = to->text + to->asize - outbytesleft;
492 }
493}
Eric Christophercf551fb2004-01-16 22:37:49 +0000494
Zack Weinberg6b883142003-07-10 23:16:31 +0000495
496/* These functions convert entire strings between character sets.
497 They all have the signature
498
Nathanael Nerodea8016862003-09-26 05:52:43 +0000499 bool (*)(iconv_t cd, const uchar *from, size_t flen, struct _cpp_strbuf *to);
Zack Weinberg6b883142003-07-10 23:16:31 +0000500
501 The input string FROM is converted as specified by the function
502 name plus the iconv descriptor CD (which may be fake), and the
503 result appended to TO. On any error, false is returned, otherwise true. */
504
505/* These four use the custom conversion code above. */
506static bool
507convert_utf8_utf16 (iconv_t cd, const uchar *from, size_t flen,
Nathanael Nerodea8016862003-09-26 05:52:43 +0000508 struct _cpp_strbuf *to)
Zack Weinberg6b883142003-07-10 23:16:31 +0000509{
510 return conversion_loop (one_utf8_to_utf16, cd, from, flen, to);
511}
512
513static bool
514convert_utf8_utf32 (iconv_t cd, const uchar *from, size_t flen,
Nathanael Nerodea8016862003-09-26 05:52:43 +0000515 struct _cpp_strbuf *to)
Zack Weinberg6b883142003-07-10 23:16:31 +0000516{
517 return conversion_loop (one_utf8_to_utf32, cd, from, flen, to);
518}
519
520static bool
521convert_utf16_utf8 (iconv_t cd, const uchar *from, size_t flen,
Nathanael Nerodea8016862003-09-26 05:52:43 +0000522 struct _cpp_strbuf *to)
Zack Weinberg6b883142003-07-10 23:16:31 +0000523{
524 return conversion_loop (one_utf16_to_utf8, cd, from, flen, to);
525}
526
527static bool
528convert_utf32_utf8 (iconv_t cd, const uchar *from, size_t flen,
Nathanael Nerodea8016862003-09-26 05:52:43 +0000529 struct _cpp_strbuf *to)
Zack Weinberg6b883142003-07-10 23:16:31 +0000530{
531 return conversion_loop (one_utf32_to_utf8, cd, from, flen, to);
532}
533
534/* Identity conversion, used when we have no alternative. */
535static bool
536convert_no_conversion (iconv_t cd ATTRIBUTE_UNUSED,
Nathanael Nerodea8016862003-09-26 05:52:43 +0000537 const uchar *from, size_t flen, struct _cpp_strbuf *to)
Zack Weinberg6b883142003-07-10 23:16:31 +0000538{
539 if (to->len + flen > to->asize)
540 {
541 to->asize = to->len + flen;
542 to->text = xrealloc (to->text, to->asize);
543 }
544 memcpy (to->text + to->len, from, flen);
545 to->len += flen;
546 return true;
547}
548
/* Converter backed by the system iconv primitive.  It is structured a
   little differently from the custom converters because iconv's
   interface is a little different: iconv consumes as much of the
   input as it can per call, and reports failure through errno.

   Appends the conversion of the FLEN bytes at FROM to TO, enlarging
   TO by OUTBUF_BLOCK_SIZE each time iconv reports E2BIG.  Returns
   false if CD cannot be reset or iconv fails for any other reason.  */
#if HAVE_ICONV
static bool
convert_using_iconv (iconv_t cd, const uchar *from, size_t flen,
                     struct _cpp_strbuf *to)
{
  ICONV_CONST char *inbuf = (ICONV_CONST char *)from;
  size_t inbytesleft = flen;
  char *outbuf;
  size_t outbytesleft;

  /* Reset conversion descriptor and check that it is valid.  */
  if (iconv (cd, 0, 0, 0, 0) == (size_t)-1)
    return false;

  outbuf = (char *)to->text + to->len;
  outbytesleft = to->asize - to->len;

  for (;;)
    {
      iconv (cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
      if (__builtin_expect (inbytesleft == 0, 1))
        break;
      if (errno != E2BIG)
        return false;

      /* Out of room: enlarge the buffer and recompute the write
         position, since xrealloc may have moved the storage.  */
      to->asize += OUTBUF_BLOCK_SIZE;
      outbytesleft += OUTBUF_BLOCK_SIZE;
      to->text = xrealloc (to->text, to->asize);
      outbuf = (char *)to->text + to->asize - outbytesleft;
    }

  to->len = to->asize - outbytesleft;
  return true;
}
#else
#define convert_using_iconv 0 /* prevent undefined symbol error below */
#endif
Zack Weinberg6b883142003-07-10 23:16:31 +0000589
/* Arrange for the above custom conversion logic to be used automatically
   when conversion between a suitable pair of character sets is requested.  */

/* Run the converter CONVERTER (an lvalue with `func' and `cd'
   members) on the FLEN bytes at FROM, appending the result to TO.
   The arguments are parenthesized so that arbitrary expressions can
   be passed safely; note that CONVERTER is evaluated twice.  */
#define APPLY_CONVERSION(CONVERTER, FROM, FLEN, TO) \
   (CONVERTER).func ((CONVERTER).cd, (FROM), (FLEN), (TO))
595
596struct conversion
597{
598 const char *pair;
599 convert_f func;
600 iconv_t fake_cd;
601};
602static const struct conversion conversion_tab[] = {
603 { "UTF-8/UTF-32LE", convert_utf8_utf32, (iconv_t)0 },
604 { "UTF-8/UTF-32BE", convert_utf8_utf32, (iconv_t)1 },
605 { "UTF-8/UTF-16LE", convert_utf8_utf16, (iconv_t)0 },
606 { "UTF-8/UTF-16BE", convert_utf8_utf16, (iconv_t)1 },
607 { "UTF-32LE/UTF-8", convert_utf32_utf8, (iconv_t)0 },
608 { "UTF-32BE/UTF-8", convert_utf32_utf8, (iconv_t)1 },
609 { "UTF-16LE/UTF-8", convert_utf16_utf8, (iconv_t)0 },
610 { "UTF-16BE/UTF-8", convert_utf16_utf8, (iconv_t)1 },
611};
612
613/* Subroutine of cpp_init_iconv: initialize and return a
614 cset_converter structure for conversion from FROM to TO. If
615 iconv_open() fails, issue an error and return an identity
616 converter. Silently return an identity converter if FROM and TO
617 are identical. */
618static struct cset_converter
Zack Weinberge6cc3a22003-07-05 00:24:00 +0000619init_iconv_desc (cpp_reader *pfile, const char *to, const char *from)
620{
Zack Weinberg6b883142003-07-10 23:16:31 +0000621 struct cset_converter ret;
622 char *pair;
623 size_t i;
Eric Christophercf551fb2004-01-16 22:37:49 +0000624
Zack Weinberg6b883142003-07-10 23:16:31 +0000625 if (!strcasecmp (to, from))
626 {
627 ret.func = convert_no_conversion;
628 ret.cd = (iconv_t) -1;
629 return ret;
630 }
Zack Weinberge6cc3a22003-07-05 00:24:00 +0000631
Zack Weinberg6b883142003-07-10 23:16:31 +0000632 pair = alloca(strlen(to) + strlen(from) + 2);
Zack Weinberge6cc3a22003-07-05 00:24:00 +0000633
Zack Weinberg6b883142003-07-10 23:16:31 +0000634 strcpy(pair, from);
635 strcat(pair, "/");
636 strcat(pair, to);
637 for (i = 0; i < ARRAY_SIZE (conversion_tab); i++)
638 if (!strcasecmp (pair, conversion_tab[i].pair))
639 {
640 ret.func = conversion_tab[i].func;
641 ret.cd = conversion_tab[i].fake_cd;
642 return ret;
643 }
644
645 /* No custom converter - try iconv. */
Zack Weinbergdea55da2003-07-11 05:49:47 +0000646 if (HAVE_ICONV)
Zack Weinberge6cc3a22003-07-05 00:24:00 +0000647 {
Zack Weinbergdea55da2003-07-11 05:49:47 +0000648 ret.func = convert_using_iconv;
649 ret.cd = iconv_open (to, from);
Zack Weinberg6b883142003-07-10 23:16:31 +0000650
Zack Weinbergdea55da2003-07-11 05:49:47 +0000651 if (ret.cd == (iconv_t) -1)
652 {
653 if (errno == EINVAL)
Eric Christophercf551fb2004-01-16 22:37:49 +0000654 cpp_error (pfile, CPP_DL_ERROR, /* FIXME should be DL_SORRY */
Zack Weinbergdea55da2003-07-11 05:49:47 +0000655 "conversion from %s to %s not supported by iconv",
656 from, to);
657 else
John David Anglin0527bc42003-11-01 22:56:54 +0000658 cpp_errno (pfile, CPP_DL_ERROR, "iconv_open");
Zack Weinbergdea55da2003-07-11 05:49:47 +0000659
660 ret.func = convert_no_conversion;
661 }
662 }
663 else
664 {
Eric Christophercf551fb2004-01-16 22:37:49 +0000665 cpp_error (pfile, CPP_DL_ERROR, /* FIXME: should be DL_SORRY */
Zack Weinbergdea55da2003-07-11 05:49:47 +0000666 "no iconv implementation, cannot convert from %s to %s",
667 from, to);
Zack Weinberg6b883142003-07-10 23:16:31 +0000668 ret.func = convert_no_conversion;
Zack Weinbergdea55da2003-07-11 05:49:47 +0000669 ret.cd = (iconv_t) -1;
Zack Weinberge6cc3a22003-07-05 00:24:00 +0000670 }
Zack Weinberg6b883142003-07-10 23:16:31 +0000671 return ret;
Zack Weinberge6cc3a22003-07-05 00:24:00 +0000672}
673
674/* If charset conversion is requested, initialize iconv(3) descriptors
675 for conversion from the source character set to the execution
676 character sets. If iconv is not present in the C library, and
677 conversion is requested, issue an error. */
678
679void
680cpp_init_iconv (cpp_reader *pfile)
681{
682 const char *ncset = CPP_OPTION (pfile, narrow_charset);
683 const char *wcset = CPP_OPTION (pfile, wide_charset);
684 const char *default_wcset;
685
686 bool be = CPP_OPTION (pfile, bytes_big_endian);
687
688 if (CPP_OPTION (pfile, wchar_precision) >= 32)
Zack Weinberg6b883142003-07-10 23:16:31 +0000689 default_wcset = be ? "UTF-32BE" : "UTF-32LE";
Zack Weinberge6cc3a22003-07-05 00:24:00 +0000690 else if (CPP_OPTION (pfile, wchar_precision) >= 16)
Zack Weinberg6b883142003-07-10 23:16:31 +0000691 default_wcset = be ? "UTF-16BE" : "UTF-16LE";
Zack Weinberge6cc3a22003-07-05 00:24:00 +0000692 else
693 /* This effectively means that wide strings are not supported,
694 so don't do any conversion at all. */
695 default_wcset = SOURCE_CHARSET;
696
Zack Weinbergdea55da2003-07-11 05:49:47 +0000697 if (!ncset)
698 ncset = SOURCE_CHARSET;
699 if (!wcset)
700 wcset = default_wcset;
Zack Weinberge6cc3a22003-07-05 00:24:00 +0000701
Zack Weinbergdea55da2003-07-11 05:49:47 +0000702 pfile->narrow_cset_desc = init_iconv_desc (pfile, ncset, SOURCE_CHARSET);
703 pfile->wide_cset_desc = init_iconv_desc (pfile, wcset, SOURCE_CHARSET);
Zack Weinberge6cc3a22003-07-05 00:24:00 +0000704}
705
Zack Weinberga29f62d2004-09-18 00:56:19 +0000706/* Destroy iconv(3) descriptors set up by cpp_init_iconv, if necessary. */
Zack Weinberge6cc3a22003-07-05 00:24:00 +0000707void
708_cpp_destroy_iconv (cpp_reader *pfile)
709{
710 if (HAVE_ICONV)
711 {
Zack Weinberg6b883142003-07-10 23:16:31 +0000712 if (pfile->narrow_cset_desc.func == convert_using_iconv)
713 iconv_close (pfile->narrow_cset_desc.cd);
714 if (pfile->wide_cset_desc.func == convert_using_iconv)
715 iconv_close (pfile->wide_cset_desc.cd);
Zack Weinberge6cc3a22003-07-05 00:24:00 +0000716 }
717}
718
Zack Weinbergc5ff0692005-02-20 17:01:32 +0000719/* Utility routine for use by a full compiler. C is a character taken
720 from the *basic* source character set, encoded in the host's
721 execution encoding. Convert it to (the target's) execution
722 encoding, and return that value.
723
724 Issues an internal error if C's representation in the narrow
725 execution character set fails to be a single-byte value (C99
726 5.2.1p3: "The representation of each member of the source and
727 execution character sets shall fit in a byte.") May also issue an
728 internal error if C fails to be a member of the basic source
729 character set (testing this exactly is too hard, especially when
730 the host character set is EBCDIC). */
731cppchar_t
732cpp_host_to_exec_charset (cpp_reader *pfile, cppchar_t c)
733{
734 uchar sbuf[1];
735 struct _cpp_strbuf tbuf;
736
737 /* This test is merely an approximation, but it suffices to catch
738 the most important thing, which is that we don't get handed a
739 character outside the unibyte range of the host character set. */
740 if (c > LAST_POSSIBLY_BASIC_SOURCE_CHAR)
741 {
742 cpp_error (pfile, CPP_DL_ICE,
743 "character 0x%lx is not in the basic source character set\n",
744 (unsigned long)c);
745 return 0;
746 }
747
748 /* Being a character in the unibyte range of the host character set,
749 we can safely splat it into a one-byte buffer and trust that that
750 is a well-formed string. */
751 sbuf[0] = c;
752
753 /* This should never need to reallocate, but just in case... */
754 tbuf.asize = 1;
755 tbuf.text = xmalloc (tbuf.asize);
756 tbuf.len = 0;
757
758 if (!APPLY_CONVERSION (pfile->narrow_cset_desc, sbuf, 1, &tbuf))
759 {
760 cpp_errno (pfile, CPP_DL_ICE, "converting to execution character set");
761 return 0;
762 }
763 if (tbuf.len != 1)
764 {
765 cpp_error (pfile, CPP_DL_ICE,
766 "character 0x%lx is not unibyte in execution character set",
767 (unsigned long)c);
768 return 0;
769 }
770 c = tbuf.text[0];
771 free(tbuf.text);
772 return c;
773}
774
775
Zack Weinberge6cc3a22003-07-05 00:24:00 +0000776
777/* Utility routine that computes a mask of the form 0000...111... with
778 WIDTH 1-bits. */
779static inline size_t
780width_to_mask (size_t width)
781{
782 width = MIN (width, BITS_PER_CPPCHAR_T);
783 if (width >= CHAR_BIT * sizeof (size_t))
784 return ~(size_t) 0;
785 else
786 return ((size_t) 1 << width) - 1;
787}
788
Zack Weinberge6cc3a22003-07-05 00:24:00 +0000789/* Returns 1 if C is valid in an identifier, 2 if C is valid except at
790 the start of an identifier, and 0 if C is not valid in an
791 identifier. We assume C has already gone through the checks of
792 _cpp_valid_ucn. The algorithm is a simple binary search on the
793 table defined in cppucnid.h. */
794
795static int
796ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c)
797{
798 int mn, mx, md;
799
800 mn = -1;
801 mx = ARRAY_SIZE (ucnranges);
802 while (mx - mn > 1)
803 {
804 md = (mn + mx) / 2;
805 if (c < ucnranges[md].lo)
806 mx = md;
807 else if (c > ucnranges[md].hi)
808 mn = md;
809 else
810 goto found;
811 }
812 return 0;
813
814 found:
815 /* When -pedantic, we require the character to have been listed by
816 the standard for the current language. Otherwise, we accept the
817 union of the acceptable sets for C++98 and C99. */
818 if (CPP_PEDANTIC (pfile)
819 && ((CPP_OPTION (pfile, c99) && !(ucnranges[md].flags & C99))
820 || (CPP_OPTION (pfile, cplusplus)
821 && !(ucnranges[md].flags & CXX))))
822 return 0;
823
824 /* In C99, UCN digits may not begin identifiers. */
825 if (CPP_OPTION (pfile, c99) && (ucnranges[md].flags & DIG))
826 return 2;
827
828 return 1;
829}
Neil Booth1613e522003-04-20 07:29:23 +0000830
831/* [lex.charset]: The character designated by the universal character
832 name \UNNNNNNNN is that character whose character short name in
833 ISO/IEC 10646 is NNNNNNNN; the character designated by the
834 universal character name \uNNNN is that character whose character
835 short name in ISO/IEC 10646 is 0000NNNN. If the hexadecimal value
836 for a universal character name is less than 0x20 or in the range
837 0x7F-0x9F (inclusive), or if the universal character name
838 designates a character in the basic source character set, then the
839 program is ill-formed.
840
841 *PSTR must be preceded by "\u" or "\U"; it is assumed that the
842 buffer end is delimited by a non-hex digit. Returns zero if UCNs
843 are not part of the relevant standard, or if the string beginning
844 at *PSTR doesn't syntactically match the form 'NNNN' or 'NNNNNNNN'.
845
Kazu Hirata6356f892003-06-12 19:01:08 +0000846 Otherwise the nonzero value of the UCN, whether valid or invalid,
Neil Booth1613e522003-04-20 07:29:23 +0000847 is returned. Diagnostics are emitted for invalid values. PSTR
848 is updated to point one beyond the UCN, or to the syntactically
849 invalid character.
850
851 IDENTIFIER_POS is 0 when not in an identifier, 1 for the start of
Zack Weinberga29f62d2004-09-18 00:56:19 +0000852 an identifier, or 2 otherwise. */
Neil Booth1613e522003-04-20 07:29:23 +0000853
854cppchar_t
Zack Weinberge6cc3a22003-07-05 00:24:00 +0000855_cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr,
856 const uchar *limit, int identifier_pos)
Neil Booth1613e522003-04-20 07:29:23 +0000857{
858 cppchar_t result, c;
859 unsigned int length;
860 const uchar *str = *pstr;
861 const uchar *base = str - 2;
862
Neil Booth1613e522003-04-20 07:29:23 +0000863 if (!CPP_OPTION (pfile, cplusplus) && !CPP_OPTION (pfile, c99))
John David Anglin0527bc42003-11-01 22:56:54 +0000864 cpp_error (pfile, CPP_DL_WARNING,
Zack Weinberge6cc3a22003-07-05 00:24:00 +0000865 "universal character names are only valid in C++ and C99");
866 else if (CPP_WTRADITIONAL (pfile) && identifier_pos == 0)
John David Anglin0527bc42003-11-01 22:56:54 +0000867 cpp_error (pfile, CPP_DL_WARNING,
Zack Weinberge6cc3a22003-07-05 00:24:00 +0000868 "the meaning of '\\%c' is different in traditional C",
869 (int) str[-1]);
Neil Booth1613e522003-04-20 07:29:23 +0000870
871 if (str[-1] == 'u')
872 length = 4;
873 else if (str[-1] == 'U')
874 length = 8;
875 else
876 abort();
877
878 result = 0;
879 do
880 {
881 c = *str;
882 if (!ISXDIGIT (c))
883 break;
884 str++;
885 result = (result << 4) + hex_value (c);
886 }
Zack Weinberge6cc3a22003-07-05 00:24:00 +0000887 while (--length && str < limit);
Neil Booth1613e522003-04-20 07:29:23 +0000888
889 *pstr = str;
890 if (length)
Zack Weinberge6cc3a22003-07-05 00:24:00 +0000891 {
892 /* We'll error when we try it out as the start of an identifier. */
John David Anglin0527bc42003-11-01 22:56:54 +0000893 cpp_error (pfile, CPP_DL_ERROR,
894 "incomplete universal character name %.*s",
Zack Weinberge6cc3a22003-07-05 00:24:00 +0000895 (int) (str - base), base);
896 result = 1;
897 }
Neil Booth1613e522003-04-20 07:29:23 +0000898 /* The standard permits $, @ and ` to be specified as UCNs. We use
899 hex escapes so that this also works with EBCDIC hosts. */
900 else if ((result < 0xa0
901 && (result != 0x24 && result != 0x40 && result != 0x60))
902 || (result & 0x80000000)
903 || (result >= 0xD800 && result <= 0xDFFF))
904 {
John David Anglin0527bc42003-11-01 22:56:54 +0000905 cpp_error (pfile, CPP_DL_ERROR,
906 "%.*s is not a valid universal character",
Andreas Jaeger0e7a8c42003-04-21 14:06:12 +0200907 (int) (str - base), base);
Zack Weinberge6cc3a22003-07-05 00:24:00 +0000908 result = 1;
Neil Booth1613e522003-04-20 07:29:23 +0000909 }
Geoffrey Keating47e20492005-03-12 10:44:06 +0000910 else if (identifier_pos && result == 0x24
911 && CPP_OPTION (pfile, dollars_in_ident))
912 {
913 if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
914 {
915 CPP_OPTION (pfile, warn_dollars) = 0;
916 cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
917 }
918 }
Neil Booth1613e522003-04-20 07:29:23 +0000919 else if (identifier_pos)
920 {
921 int validity = ucn_valid_in_identifier (pfile, result);
922
923 if (validity == 0)
John David Anglin0527bc42003-11-01 22:56:54 +0000924 cpp_error (pfile, CPP_DL_ERROR,
Neil Booth1613e522003-04-20 07:29:23 +0000925 "universal character %.*s is not valid in an identifier",
Andreas Jaeger0e7a8c42003-04-21 14:06:12 +0200926 (int) (str - base), base);
Neil Booth1613e522003-04-20 07:29:23 +0000927 else if (validity == 2 && identifier_pos == 1)
John David Anglin0527bc42003-11-01 22:56:54 +0000928 cpp_error (pfile, CPP_DL_ERROR,
Neil Booth1613e522003-04-20 07:29:23 +0000929 "universal character %.*s is not valid at the start of an identifier",
Andreas Jaeger0e7a8c42003-04-21 14:06:12 +0200930 (int) (str - base), base);
Neil Booth1613e522003-04-20 07:29:23 +0000931 }
932
933 if (result == 0)
934 result = 1;
935
936 return result;
937}
938
Zack Weinberge6cc3a22003-07-05 00:24:00 +0000939/* Convert an UCN, pointed to by FROM, to UTF-8 encoding, then translate
940 it to the execution character set and write the result into TBUF.
Zack Weinberg6b883142003-07-10 23:16:31 +0000941 An advanced pointer is returned. Issues all relevant diagnostics. */
Zack Weinberge6cc3a22003-07-05 00:24:00 +0000942static const uchar *
943convert_ucn (cpp_reader *pfile, const uchar *from, const uchar *limit,
Nathanael Nerodea8016862003-09-26 05:52:43 +0000944 struct _cpp_strbuf *tbuf, bool wide)
Neil Booth1613e522003-04-20 07:29:23 +0000945{
Zack Weinberge6cc3a22003-07-05 00:24:00 +0000946 cppchar_t ucn;
Zack Weinberg6b883142003-07-10 23:16:31 +0000947 uchar buf[6];
948 uchar *bufp = buf;
949 size_t bytesleft = 6;
950 int rval;
951 struct cset_converter cvt
952 = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
Zack Weinberge6cc3a22003-07-05 00:24:00 +0000953
Kazu Hiratae0a21ab2004-01-16 01:44:06 +0000954 from++; /* Skip u/U. */
Zack Weinberge6cc3a22003-07-05 00:24:00 +0000955 ucn = _cpp_valid_ucn (pfile, &from, limit, 0);
Zack Weinberge6cc3a22003-07-05 00:24:00 +0000956
Zack Weinberg6b883142003-07-10 23:16:31 +0000957 rval = one_cppchar_to_utf8 (ucn, &bufp, &bytesleft);
958 if (rval)
Zack Weinberge6cc3a22003-07-05 00:24:00 +0000959 {
Zack Weinberg6b883142003-07-10 23:16:31 +0000960 errno = rval;
John David Anglin0527bc42003-11-01 22:56:54 +0000961 cpp_errno (pfile, CPP_DL_ERROR,
962 "converting UCN to source character set");
Zack Weinberge6cc3a22003-07-05 00:24:00 +0000963 }
Zack Weinberg6b883142003-07-10 23:16:31 +0000964 else if (!APPLY_CONVERSION (cvt, buf, 6 - bytesleft, tbuf))
John David Anglin0527bc42003-11-01 22:56:54 +0000965 cpp_errno (pfile, CPP_DL_ERROR,
966 "converting UCN to execution character set");
Zack Weinberge6cc3a22003-07-05 00:24:00 +0000967
968 return from;
969}
970
Zack Weinberga29f62d2004-09-18 00:56:19 +0000971/* Subroutine of convert_hex and convert_oct. N is the representation
972 in the execution character set of a numeric escape; write it into the
973 string buffer TBUF and update the end-of-string pointer therein. WIDE
974 is true if it's a wide string that's being assembled in TBUF. This
975 function issues no diagnostics and never fails. */
Zack Weinberge6cc3a22003-07-05 00:24:00 +0000976static void
977emit_numeric_escape (cpp_reader *pfile, cppchar_t n,
Nathanael Nerodea8016862003-09-26 05:52:43 +0000978 struct _cpp_strbuf *tbuf, bool wide)
Zack Weinberge6cc3a22003-07-05 00:24:00 +0000979{
980 if (wide)
981 {
982 /* We have to render this into the target byte order, which may not
983 be our byte order. */
984 bool bigend = CPP_OPTION (pfile, bytes_big_endian);
985 size_t width = CPP_OPTION (pfile, wchar_precision);
986 size_t cwidth = CPP_OPTION (pfile, char_precision);
987 size_t cmask = width_to_mask (cwidth);
988 size_t nbwc = width / cwidth;
989 size_t i;
990 size_t off = tbuf->len;
991 cppchar_t c;
992
993 if (tbuf->len + nbwc > tbuf->asize)
994 {
995 tbuf->asize += OUTBUF_BLOCK_SIZE;
996 tbuf->text = xrealloc (tbuf->text, tbuf->asize);
997 }
998
999 for (i = 0; i < nbwc; i++)
1000 {
1001 c = n & cmask;
1002 n >>= cwidth;
1003 tbuf->text[off + (bigend ? nbwc - i - 1 : i)] = c;
1004 }
1005 tbuf->len += nbwc;
1006 }
1007 else
1008 {
Zack Weinberga29f62d2004-09-18 00:56:19 +00001009 /* Note: this code does not handle the case where the target
1010 and host have a different number of bits in a byte. */
Zack Weinberge6cc3a22003-07-05 00:24:00 +00001011 if (tbuf->len + 1 > tbuf->asize)
1012 {
1013 tbuf->asize += OUTBUF_BLOCK_SIZE;
1014 tbuf->text = xrealloc (tbuf->text, tbuf->asize);
1015 }
1016 tbuf->text[tbuf->len++] = n;
1017 }
1018}
1019
1020/* Convert a hexadecimal escape, pointed to by FROM, to the execution
1021 character set and write it into the string buffer TBUF. Returns an
1022 advanced pointer, and issues diagnostics as necessary.
1023 No character set translation occurs; this routine always produces the
1024 execution-set character with numeric value equal to the given hex
1025 number. You can, e.g. generate surrogate pairs this way. */
1026static const uchar *
1027convert_hex (cpp_reader *pfile, const uchar *from, const uchar *limit,
Nathanael Nerodea8016862003-09-26 05:52:43 +00001028 struct _cpp_strbuf *tbuf, bool wide)
Zack Weinberge6cc3a22003-07-05 00:24:00 +00001029{
1030 cppchar_t c, n = 0, overflow = 0;
1031 int digits_found = 0;
1032 size_t width = (wide ? CPP_OPTION (pfile, wchar_precision)
1033 : CPP_OPTION (pfile, char_precision));
1034 size_t mask = width_to_mask (width);
1035
1036 if (CPP_WTRADITIONAL (pfile))
John David Anglin0527bc42003-11-01 22:56:54 +00001037 cpp_error (pfile, CPP_DL_WARNING,
Zack Weinberge6cc3a22003-07-05 00:24:00 +00001038 "the meaning of '\\x' is different in traditional C");
1039
Kazu Hiratae0a21ab2004-01-16 01:44:06 +00001040 from++; /* Skip 'x'. */
Zack Weinberge6cc3a22003-07-05 00:24:00 +00001041 while (from < limit)
1042 {
1043 c = *from;
1044 if (! hex_p (c))
1045 break;
1046 from++;
1047 overflow |= n ^ (n << 4 >> 4);
1048 n = (n << 4) + hex_value (c);
1049 digits_found = 1;
1050 }
1051
1052 if (!digits_found)
1053 {
John David Anglin0527bc42003-11-01 22:56:54 +00001054 cpp_error (pfile, CPP_DL_ERROR,
Zack Weinberge6cc3a22003-07-05 00:24:00 +00001055 "\\x used with no following hex digits");
1056 return from;
1057 }
1058
1059 if (overflow | (n != (n & mask)))
1060 {
John David Anglin0527bc42003-11-01 22:56:54 +00001061 cpp_error (pfile, CPP_DL_PEDWARN,
Zack Weinberge6cc3a22003-07-05 00:24:00 +00001062 "hex escape sequence out of range");
1063 n &= mask;
1064 }
1065
1066 emit_numeric_escape (pfile, n, tbuf, wide);
1067
1068 return from;
1069}
1070
1071/* Convert an octal escape, pointed to by FROM, to the execution
1072 character set and write it into the string buffer TBUF. Returns an
1073 advanced pointer, and issues diagnostics as necessary.
1074 No character set translation occurs; this routine always produces the
1075 execution-set character with numeric value equal to the given octal
1076 number. */
1077static const uchar *
1078convert_oct (cpp_reader *pfile, const uchar *from, const uchar *limit,
Nathanael Nerodea8016862003-09-26 05:52:43 +00001079 struct _cpp_strbuf *tbuf, bool wide)
Zack Weinberge6cc3a22003-07-05 00:24:00 +00001080{
1081 size_t count = 0;
1082 cppchar_t c, n = 0;
1083 size_t width = (wide ? CPP_OPTION (pfile, wchar_precision)
1084 : CPP_OPTION (pfile, char_precision));
1085 size_t mask = width_to_mask (width);
1086 bool overflow = false;
1087
1088 while (from < limit && count++ < 3)
1089 {
1090 c = *from;
1091 if (c < '0' || c > '7')
1092 break;
1093 from++;
1094 overflow |= n ^ (n << 3 >> 3);
1095 n = (n << 3) + c - '0';
1096 }
1097
1098 if (n != (n & mask))
1099 {
John David Anglin0527bc42003-11-01 22:56:54 +00001100 cpp_error (pfile, CPP_DL_PEDWARN,
Zack Weinberge6cc3a22003-07-05 00:24:00 +00001101 "octal escape sequence out of range");
1102 n &= mask;
1103 }
1104
1105 emit_numeric_escape (pfile, n, tbuf, wide);
1106
1107 return from;
1108}
1109
1110/* Convert an escape sequence (pointed to by FROM) to its value on
1111 the target, and to the execution character set. Do not scan past
1112 LIMIT. Write the converted value into TBUF. Returns an advanced
1113 pointer. Handles all relevant diagnostics. */
1114static const uchar *
1115convert_escape (cpp_reader *pfile, const uchar *from, const uchar *limit,
Nathanael Nerodea8016862003-09-26 05:52:43 +00001116 struct _cpp_strbuf *tbuf, bool wide)
Zack Weinberge6cc3a22003-07-05 00:24:00 +00001117{
1118 /* Values of \a \b \e \f \n \r \t \v respectively. */
1119#if HOST_CHARSET == HOST_CHARSET_ASCII
1120 static const uchar charconsts[] = { 7, 8, 27, 12, 10, 13, 9, 11 };
1121#elif HOST_CHARSET == HOST_CHARSET_EBCDIC
1122 static const uchar charconsts[] = { 47, 22, 39, 12, 21, 13, 5, 11 };
1123#else
1124#error "unknown host character set"
1125#endif
1126
1127 uchar c;
Zack Weinberg6b883142003-07-10 23:16:31 +00001128 struct cset_converter cvt
1129 = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
Zack Weinberge6cc3a22003-07-05 00:24:00 +00001130
1131 c = *from;
1132 switch (c)
1133 {
1134 /* UCNs, hex escapes, and octal escapes are processed separately. */
1135 case 'u': case 'U':
1136 return convert_ucn (pfile, from, limit, tbuf, wide);
1137
1138 case 'x':
1139 return convert_hex (pfile, from, limit, tbuf, wide);
1140 break;
1141
1142 case '0': case '1': case '2': case '3':
1143 case '4': case '5': case '6': case '7':
1144 return convert_oct (pfile, from, limit, tbuf, wide);
1145
1146 /* Various letter escapes. Get the appropriate host-charset
1147 value into C. */
1148 case '\\': case '\'': case '"': case '?': break;
1149
1150 case '(': case '{': case '[': case '%':
1151 /* '\(', etc, can be used at the beginning of a line in a long
1152 string split onto multiple lines with \-newline, to prevent
1153 Emacs or other text editors from getting confused. '\%' can
1154 be used to prevent SCCS from mangling printf format strings. */
1155 if (CPP_PEDANTIC (pfile))
1156 goto unknown;
1157 break;
1158
1159 case 'b': c = charconsts[1]; break;
1160 case 'f': c = charconsts[3]; break;
1161 case 'n': c = charconsts[4]; break;
1162 case 'r': c = charconsts[5]; break;
1163 case 't': c = charconsts[6]; break;
1164 case 'v': c = charconsts[7]; break;
1165
1166 case 'a':
1167 if (CPP_WTRADITIONAL (pfile))
John David Anglin0527bc42003-11-01 22:56:54 +00001168 cpp_error (pfile, CPP_DL_WARNING,
Zack Weinberge6cc3a22003-07-05 00:24:00 +00001169 "the meaning of '\\a' is different in traditional C");
1170 c = charconsts[0];
1171 break;
1172
1173 case 'e': case 'E':
1174 if (CPP_PEDANTIC (pfile))
John David Anglin0527bc42003-11-01 22:56:54 +00001175 cpp_error (pfile, CPP_DL_PEDWARN,
Zack Weinberge6cc3a22003-07-05 00:24:00 +00001176 "non-ISO-standard escape sequence, '\\%c'", (int) c);
1177 c = charconsts[2];
1178 break;
1179
1180 default:
1181 unknown:
1182 if (ISGRAPH (c))
John David Anglin0527bc42003-11-01 22:56:54 +00001183 cpp_error (pfile, CPP_DL_PEDWARN,
Zack Weinberge6cc3a22003-07-05 00:24:00 +00001184 "unknown escape sequence '\\%c'", (int) c);
1185 else
John David Anglin0527bc42003-11-01 22:56:54 +00001186 cpp_error (pfile, CPP_DL_PEDWARN,
Zack Weinberge6cc3a22003-07-05 00:24:00 +00001187 "unknown escape sequence: '\\%03o'", (int) c);
1188 }
1189
1190 /* Now convert what we have to the execution character set. */
Zack Weinberg6b883142003-07-10 23:16:31 +00001191 if (!APPLY_CONVERSION (cvt, &c, 1, tbuf))
John David Anglin0527bc42003-11-01 22:56:54 +00001192 cpp_errno (pfile, CPP_DL_ERROR,
Zack Weinberge6cc3a22003-07-05 00:24:00 +00001193 "converting escape sequence to execution character set");
1194
1195 return from + 1;
1196}
1197
1198/* FROM is an array of cpp_string structures of length COUNT. These
1199 are to be converted from the source to the execution character set,
1200 escape sequences translated, and finally all are to be
1201 concatenated. WIDE indicates whether or not to produce a wide
1202 string. The result is written into TO. Returns true for success,
1203 false for failure. */
1204bool
1205cpp_interpret_string (cpp_reader *pfile, const cpp_string *from, size_t count,
1206 cpp_string *to, bool wide)
1207{
Nathanael Nerodea8016862003-09-26 05:52:43 +00001208 struct _cpp_strbuf tbuf;
Zack Weinberge6cc3a22003-07-05 00:24:00 +00001209 const uchar *p, *base, *limit;
1210 size_t i;
Zack Weinberg6b883142003-07-10 23:16:31 +00001211 struct cset_converter cvt
1212 = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
Zack Weinberge6cc3a22003-07-05 00:24:00 +00001213
1214 tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, from->len);
1215 tbuf.text = xmalloc (tbuf.asize);
1216 tbuf.len = 0;
1217
1218 for (i = 0; i < count; i++)
1219 {
1220 p = from[i].text;
1221 if (*p == 'L') p++;
Kazu Hiratae0a21ab2004-01-16 01:44:06 +00001222 p++; /* Skip leading quote. */
1223 limit = from[i].text + from[i].len - 1; /* Skip trailing quote. */
Zack Weinberge6cc3a22003-07-05 00:24:00 +00001224
1225 for (;;)
1226 {
1227 base = p;
1228 while (p < limit && *p != '\\')
1229 p++;
1230 if (p > base)
1231 {
1232 /* We have a run of normal characters; these can be fed
1233 directly to convert_cset. */
Zack Weinberg6b883142003-07-10 23:16:31 +00001234 if (!APPLY_CONVERSION (cvt, base, p - base, &tbuf))
Zack Weinberge6cc3a22003-07-05 00:24:00 +00001235 goto fail;
1236 }
1237 if (p == limit)
1238 break;
1239
1240 p = convert_escape (pfile, p + 1, limit, &tbuf, wide);
1241 }
1242 }
1243 /* NUL-terminate the 'to' buffer and translate it to a cpp_string
1244 structure. */
1245 emit_numeric_escape (pfile, 0, &tbuf, wide);
1246 tbuf.text = xrealloc (tbuf.text, tbuf.len);
1247 to->text = tbuf.text;
1248 to->len = tbuf.len;
1249 return true;
1250
1251 fail:
John David Anglin0527bc42003-11-01 22:56:54 +00001252 cpp_errno (pfile, CPP_DL_ERROR, "converting to execution character set");
Zack Weinberge6cc3a22003-07-05 00:24:00 +00001253 free (tbuf.text);
1254 return false;
1255}
Zack Weinberg6b883142003-07-10 23:16:31 +00001256
1257/* Subroutine of do_line and do_linemarker. Convert escape sequences
1258 in a string, but do not perform character set conversion. */
1259bool
Eric Christopher423e95e2004-02-12 02:25:03 +00001260cpp_interpret_string_notranslate (cpp_reader *pfile, const cpp_string *from,
1261 size_t count, cpp_string *to, bool wide)
Zack Weinberg6b883142003-07-10 23:16:31 +00001262{
1263 struct cset_converter save_narrow_cset_desc = pfile->narrow_cset_desc;
1264 bool retval;
1265
1266 pfile->narrow_cset_desc.func = convert_no_conversion;
1267 pfile->narrow_cset_desc.cd = (iconv_t) -1;
1268
Eric Christopher423e95e2004-02-12 02:25:03 +00001269 retval = cpp_interpret_string (pfile, from, count, to, wide);
Zack Weinberg6b883142003-07-10 23:16:31 +00001270
1271 pfile->narrow_cset_desc = save_narrow_cset_desc;
1272 return retval;
1273}
1274
Zack Weinberge6cc3a22003-07-05 00:24:00 +00001275
1276/* Subroutine of cpp_interpret_charconst which performs the conversion
1277 to a number, for narrow strings. STR is the string structure returned
1278 by cpp_interpret_string. PCHARS_SEEN and UNSIGNEDP are as for
1279 cpp_interpret_charconst. */
1280static cppchar_t
1281narrow_str_to_charconst (cpp_reader *pfile, cpp_string str,
1282 unsigned int *pchars_seen, int *unsignedp)
1283{
1284 size_t width = CPP_OPTION (pfile, char_precision);
1285 size_t max_chars = CPP_OPTION (pfile, int_precision) / width;
1286 size_t mask = width_to_mask (width);
1287 size_t i;
1288 cppchar_t result, c;
1289 bool unsigned_p;
1290
1291 /* The value of a multi-character character constant, or a
1292 single-character character constant whose representation in the
1293 execution character set is more than one byte long, is
1294 implementation defined. This implementation defines it to be the
1295 number formed by interpreting the byte sequence in memory as a
1296 big-endian binary number. If overflow occurs, the high bytes are
1297 lost, and a warning is issued.
1298
1299 We don't want to process the NUL terminator handed back by
1300 cpp_interpret_string. */
1301 result = 0;
1302 for (i = 0; i < str.len - 1; i++)
1303 {
1304 c = str.text[i] & mask;
1305 if (width < BITS_PER_CPPCHAR_T)
1306 result = (result << width) | c;
1307 else
1308 result = c;
1309 }
1310
1311 if (i > max_chars)
1312 {
1313 i = max_chars;
John David Anglin0527bc42003-11-01 22:56:54 +00001314 cpp_error (pfile, CPP_DL_WARNING,
1315 "character constant too long for its type");
Zack Weinberge6cc3a22003-07-05 00:24:00 +00001316 }
1317 else if (i > 1 && CPP_OPTION (pfile, warn_multichar))
John David Anglin0527bc42003-11-01 22:56:54 +00001318 cpp_error (pfile, CPP_DL_WARNING, "multi-character character constant");
Zack Weinberge6cc3a22003-07-05 00:24:00 +00001319
1320 /* Multichar constants are of type int and therefore signed. */
1321 if (i > 1)
1322 unsigned_p = 0;
1323 else
1324 unsigned_p = CPP_OPTION (pfile, unsigned_char);
1325
1326 /* Truncate the constant to its natural width, and simultaneously
1327 sign- or zero-extend to the full width of cppchar_t.
1328 For single-character constants, the value is WIDTH bits wide.
1329 For multi-character constants, the value is INT_PRECISION bits wide. */
1330 if (i > 1)
1331 width = CPP_OPTION (pfile, int_precision);
1332 if (width < BITS_PER_CPPCHAR_T)
1333 {
1334 mask = ((cppchar_t) 1 << width) - 1;
1335 if (unsigned_p || !(result & (1 << (width - 1))))
1336 result &= mask;
1337 else
1338 result |= ~mask;
1339 }
1340 *pchars_seen = i;
1341 *unsignedp = unsigned_p;
1342 return result;
1343}
Eric Christophercf551fb2004-01-16 22:37:49 +00001344
Zack Weinberge6cc3a22003-07-05 00:24:00 +00001345/* Subroutine of cpp_interpret_charconst which performs the conversion
1346 to a number, for wide strings. STR is the string structure returned
1347 by cpp_interpret_string. PCHARS_SEEN and UNSIGNEDP are as for
1348 cpp_interpret_charconst. */
1349static cppchar_t
1350wide_str_to_charconst (cpp_reader *pfile, cpp_string str,
1351 unsigned int *pchars_seen, int *unsignedp)
1352{
1353 bool bigend = CPP_OPTION (pfile, bytes_big_endian);
1354 size_t width = CPP_OPTION (pfile, wchar_precision);
1355 size_t cwidth = CPP_OPTION (pfile, char_precision);
1356 size_t mask = width_to_mask (width);
1357 size_t cmask = width_to_mask (cwidth);
1358 size_t nbwc = width / cwidth;
1359 size_t off, i;
1360 cppchar_t result = 0, c;
1361
1362 /* This is finicky because the string is in the target's byte order,
1363 which may not be our byte order. Only the last character, ignoring
1364 the NUL terminator, is relevant. */
1365 off = str.len - (nbwc * 2);
1366 result = 0;
1367 for (i = 0; i < nbwc; i++)
1368 {
1369 c = bigend ? str.text[off + i] : str.text[off + nbwc - i - 1];
1370 result = (result << cwidth) | (c & cmask);
1371 }
1372
1373 /* Wide character constants have type wchar_t, and a single
1374 character exactly fills a wchar_t, so a multi-character wide
1375 character constant is guaranteed to overflow. */
1376 if (off > 0)
John David Anglin0527bc42003-11-01 22:56:54 +00001377 cpp_error (pfile, CPP_DL_WARNING,
1378 "character constant too long for its type");
Zack Weinberge6cc3a22003-07-05 00:24:00 +00001379
1380 /* Truncate the constant to its natural width, and simultaneously
1381 sign- or zero-extend to the full width of cppchar_t. */
1382 if (width < BITS_PER_CPPCHAR_T)
1383 {
1384 if (CPP_OPTION (pfile, unsigned_wchar) || !(result & (1 << (width - 1))))
1385 result &= mask;
1386 else
1387 result |= ~mask;
1388 }
1389
1390 *unsignedp = CPP_OPTION (pfile, unsigned_wchar);
1391 *pchars_seen = 1;
1392 return result;
1393}
1394
1395/* Interpret a (possibly wide) character constant in TOKEN.
1396 PCHARS_SEEN points to a variable that is filled in with the number
1397 of characters seen, and UNSIGNEDP to a variable that indicates
1398 whether the result has signed type. */
1399cppchar_t
1400cpp_interpret_charconst (cpp_reader *pfile, const cpp_token *token,
1401 unsigned int *pchars_seen, int *unsignedp)
1402{
1403 cpp_string str = { 0, 0 };
1404 bool wide = (token->type == CPP_WCHAR);
1405 cppchar_t result;
1406
1407 /* an empty constant will appear as L'' or '' */
1408 if (token->val.str.len == (size_t) (2 + wide))
1409 {
John David Anglin0527bc42003-11-01 22:56:54 +00001410 cpp_error (pfile, CPP_DL_ERROR, "empty character constant");
Zack Weinberge6cc3a22003-07-05 00:24:00 +00001411 return 0;
1412 }
1413 else if (!cpp_interpret_string (pfile, &token->val.str, 1, &str, wide))
Neil Booth1613e522003-04-20 07:29:23 +00001414 return 0;
1415
Zack Weinberge6cc3a22003-07-05 00:24:00 +00001416 if (wide)
1417 result = wide_str_to_charconst (pfile, str, pchars_seen, unsignedp);
1418 else
1419 result = narrow_str_to_charconst (pfile, str, pchars_seen, unsignedp);
Neil Booth1613e522003-04-20 07:29:23 +00001420
Zack Weinberge6cc3a22003-07-05 00:24:00 +00001421 if (str.text != token->val.str.text)
1422 free ((void *)str.text);
Neil Booth1613e522003-04-20 07:29:23 +00001423
Zack Weinberge6cc3a22003-07-05 00:24:00 +00001424 return result;
Neil Booth1613e522003-04-20 07:29:23 +00001425}
Geoffrey Keating47e20492005-03-12 10:44:06 +00001426
1427/* Convert an identifier denoted by ID and LEN, which might contain
1428 UCN escapes, to the source character set, either UTF-8 or
1429 UTF-EBCDIC. Assumes that the identifier is actually a valid identifier. */
1430cpp_hashnode *
1431_cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len)
1432{
1433 /* It turns out that a UCN escape always turns into fewer characters
1434 than the escape itself, so we can allocate a temporary in advance. */
1435 uchar * buf = alloca (len + 1);
1436 uchar * bufp = buf;
1437 size_t idp;
1438
1439 for (idp = 0; idp < len; idp++)
1440 if (id[idp] != '\\')
1441 *bufp++ = id[idp];
1442 else
1443 {
1444 unsigned length = id[idp+1] == 'u' ? 4 : 8;
1445 cppchar_t value = 0;
1446 size_t bufleft = len - (bufp - buf);
1447 int rval;
Eric Christophercf551fb2004-01-16 22:37:49 +00001448
Geoffrey Keating47e20492005-03-12 10:44:06 +00001449 idp += 2;
1450 while (length && idp < len && ISXDIGIT (id[idp]))
1451 {
1452 value = (value << 4) + hex_value (id[idp]);
1453 idp++;
1454 length--;
1455 }
1456 idp--;
1457
1458 /* Special case for EBCDIC: if the identifier contains
1459 a '$' specified using a UCN, translate it to EBCDIC. */
1460 if (value == 0x24)
1461 {
1462 *bufp++ = '$';
1463 continue;
1464 }
1465
1466 rval = one_cppchar_to_utf8 (value, &bufp, &bufleft);
1467 if (rval)
1468 {
1469 errno = rval;
1470 cpp_errno (pfile, CPP_DL_ERROR,
1471 "converting UCN to source character set");
1472 break;
1473 }
1474 }
1475
1476 return CPP_HASHNODE (ht_lookup (pfile->hash_table,
1477 buf, bufp - buf, HT_ALLOC));
1478}
1479
Zack Weinberga29f62d2004-09-18 00:56:19 +00001480/* Convert an input buffer (containing the complete contents of one
1481 source file) from INPUT_CHARSET to the source character set. INPUT
1482 points to the input buffer, SIZE is its allocated size, and LEN is
1483 the length of the meaningful data within the buffer. The
1484 translated buffer is returned, and *ST_SIZE is set to the length of
1485 the meaningful data within the translated buffer.
1486
1487 INPUT is expected to have been allocated with xmalloc. This function
1488 will either return INPUT, or free it and return a pointer to another
1489 xmalloc-allocated block of memory. */
1490uchar *
Eric Christopher16dd5cf2004-02-02 20:20:58 +00001491_cpp_convert_input (cpp_reader *pfile, const char *input_charset,
1492 uchar *input, size_t size, size_t len, off_t *st_size)
Eric Christophercf551fb2004-01-16 22:37:49 +00001493{
Eric Christopher16dd5cf2004-02-02 20:20:58 +00001494 struct cset_converter input_cset;
1495 struct _cpp_strbuf to;
Eric Christophercf551fb2004-01-16 22:37:49 +00001496
Eric Christopher16dd5cf2004-02-02 20:20:58 +00001497 input_cset = init_iconv_desc (pfile, SOURCE_CHARSET, input_charset);
1498 if (input_cset.func == convert_no_conversion)
1499 {
1500 to.text = input;
1501 to.asize = size;
1502 to.len = len;
1503 }
Eric Christophercf551fb2004-01-16 22:37:49 +00001504 else
Eric Christopher16dd5cf2004-02-02 20:20:58 +00001505 {
1506 to.asize = MAX (65536, len);
1507 to.text = xmalloc (to.asize);
1508 to.len = 0;
Eric Christophercf551fb2004-01-16 22:37:49 +00001509
Eric Christopher16dd5cf2004-02-02 20:20:58 +00001510 if (!APPLY_CONVERSION (input_cset, input, len, &to))
1511 cpp_error (pfile, CPP_DL_ERROR,
1512 "failure to convert %s to %s",
1513 CPP_OPTION (pfile, input_charset), SOURCE_CHARSET);
1514
1515 free (input);
1516 }
1517
1518 /* Clean up the mess. */
1519 if (input_cset.func == convert_using_iconv)
1520 iconv_close (input_cset.cd);
1521
1522 /* Resize buffer if we allocated substantially too much, or if we
1523 haven't enough space for the \n-terminator. */
1524 if (to.len + 4096 < to.asize || to.len >= to.asize)
1525 to.text = xrealloc (to.text, to.len + 1);
1526
Devang Patel04c90ee2005-02-19 11:48:02 -08001527 /* If the file is using old-school Mac line endings (\r only),
1528 terminate with another \r, not an \n, so that we do not mistake
1529 the \r\n sequence for a single DOS line ending and erroneously
1530 issue the "No newline at end of file" diagnostic. */
1531 if (to.text[to.len - 1] == '\r')
1532 to.text[to.len] = '\r';
1533 else
1534 to.text[to.len] = '\n';
1535
Eric Christopher16dd5cf2004-02-02 20:20:58 +00001536 *st_size = to.len;
1537 return to.text;
Eric Christophercf551fb2004-01-16 22:37:49 +00001538}
1539
Zack Weinberga29f62d2004-09-18 00:56:19 +00001540/* Decide on the default encoding to assume for input files. */
Eric Christopher16dd5cf2004-02-02 20:20:58 +00001541const char *
1542_cpp_default_encoding (void)
Eric Christophercf551fb2004-01-16 22:37:49 +00001543{
Eric Christopher16dd5cf2004-02-02 20:20:58 +00001544 const char *current_encoding = NULL;
Eric Christophercf551fb2004-01-16 22:37:49 +00001545
Paolo Bonzini4f4e53dd2004-05-24 10:50:45 +00001546 /* We disable this because the default codeset is 7-bit ASCII on
1547 most platforms, and this causes conversion failures on every
1548 file in GCC that happens to have one of the upper 128 characters
1549 in it -- most likely, as part of the name of a contributor.
1550 We should definitely recognize in-band markers of file encoding,
1551 like:
1552 - the appropriate Unicode byte-order mark (FE FF) to recognize
1553 UTF16 and UCS4 (in both big-endian and little-endian flavors)
1554 and UTF8
Zack Weinbergc6e83802004-06-05 20:58:06 +00001555 - a "#i", "#d", "/ *", "//", " #p" or "#p" (for #pragma) to
Paolo Bonzini4f4e53dd2004-05-24 10:50:45 +00001556 distinguish ASCII and EBCDIC.
1557 - now we can parse something like "#pragma GCC encoding <xyz>
1558 on the first line, or even Emacs/VIM's mode line tags (there's
1559 a problem here in that VIM uses the last line, and Emacs has
Zack Weinberga29f62d2004-09-18 00:56:19 +00001560 its more elaborate "local variables" convention).
Paolo Bonzini4f4e53dd2004-05-24 10:50:45 +00001561 - investigate whether Java has another common convention, which
1562 would be friendly to support.
1563 (Zack Weinberg and Paolo Bonzini, May 20th 2004) */
1564#if defined (HAVE_LOCALE_H) && defined (HAVE_LANGINFO_CODESET) && 0
Eric Christopher16dd5cf2004-02-02 20:20:58 +00001565 setlocale (LC_CTYPE, "");
1566 current_encoding = nl_langinfo (CODESET);
1567#endif
1568 if (current_encoding == NULL || *current_encoding == '\0')
1569 current_encoding = SOURCE_CHARSET;
1570
1571 return current_encoding;
Eric Christophercf551fb2004-01-16 22:37:49 +00001572}