Blame - libcpp/charset.c - toolchain/gcc

blob: 4de858a1b547f2b87d68f6ef609e90e1b93b8814 [file] [log] [blame]

Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1	/* CPP Library - charsets
Kazu Hirata	d9221e01	2004-01-21 20:40:04 +0000	[diff] [blame]	2	Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	3	Free Software Foundation, Inc.
				4
				5	Broken out of c-lex.c Apr 2003, adding valid C99 UCN ranges.
				6
				7	This program is free software; you can redistribute it and/or modify it
				8	under the terms of the GNU General Public License as published by the
				9	Free Software Foundation; either version 2, or (at your option) any
				10	later version.
				11
				12	This program is distributed in the hope that it will be useful,
				13	but WITHOUT ANY WARRANTY; without even the implied warranty of
				14	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
				15	GNU General Public License for more details.
				16
				17	You should have received a copy of the GNU General Public License
				18	along with this program; if not, write to the Free Software
				19	Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
				20
				21	#include "config.h"
				22	#include "system.h"
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	23	#include "cpplib.h"
Paolo Bonzini	4f4e53dd	2004-05-24 10:50:45 +0000	[diff] [blame]	24	#include "internal.h"
				25	#include "ucnid.h"
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	26
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	27	/* Character set handling for C-family languages.
				28
				29	Terminological note: In what follows, "charset" or "character set"
				30	will be taken to mean both an abstract set of characters and an
				31	encoding for that set.
				32
				33	The C99 standard discusses two character sets: source and execution.
				34	The source character set is used for internal processing in translation
				35	phases 1 through 4; the execution character set is used thereafter.
				36	Both are required by 5.2.1.2p1 to be multibyte encodings, not wide
				37	character encodings (see 3.7.2, 3.7.3 for the standardese meanings
				38	of these terms). Furthermore, the "basic character set" (listed in
				39	5.2.1p3) is to be encoded in each with values one byte wide, and is
				40	to appear in the initial shift state.
				41
				42	It is not explicitly mentioned, but there is also a "wide execution
				43	character set" used to encode wide character constants and wide
				44	string literals; this is supposed to be the result of applying the
				45	standard library function mbstowcs() to an equivalent narrow string
				46	(6.4.5p5). However, the behavior of hexadecimal and octal
				47	\-escapes is at odds with this; they are supposed to be translated
				48	directly to wchar_t values (6.4.4.4p5,6).
				49
				50	The source character set is not necessarily the character set used
				51	to encode physical source files on disk; translation phase 1 converts
				52	from whatever that encoding is to the source character set.
				53
				54	The presence of universal character names in C99 (6.4.3 et seq.)
				55	forces the source character set to be isomorphic to ISO 10646,
				56	that is, Unicode. There is no such constraint on the execution
				57	character set; note also that the conversion from source to
				58	execution character set does not occur for identifiers (5.1.1.2p1#5).
				59
				60	For convenience of implementation, the source character set's
				61	encoding of the basic character set should be identical to the
				62	execution character set OF THE HOST SYSTEM's encoding of the basic
				63	character set, and it should not be a state-dependent encoding.
				64
				65	cpplib uses UTF-8 or UTF-EBCDIC for the source character set,
				66	depending on whether the host is based on ASCII or EBCDIC (see
				67	respectively Unicode section 2.3/ISO10646 Amendment 2, and Unicode
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	68	Technical Report #16). With limited exceptions, it relies on the
				69	system library's iconv() primitive to do charset conversion
				70	(specified in SUSv2). */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	71
				72	#if !HAVE_ICONV
				73	/* Make certain that the uses of iconv(), iconv_open(), iconv_close()
				74	below, which are guarded only by if statements with compile-time
				75	constant conditions, do not cause link errors. */
				76	#define iconv_open(x, y) (errno = EINVAL, (iconv_t)-1)
Zack Weinberg	f1c4bc4	2003-07-05 16:44:29 +0200	[diff] [blame]	77	#define iconv(a,b,c,d,e) (errno = EINVAL, (size_t)-1)
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	78	#define iconv_close(x) (void)0
Andrew Pinski	5beadb3	2003-07-07 04:46:29 +0000	[diff] [blame]	79	#define ICONV_CONST
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	80	#endif
				81
				82	#if HOST_CHARSET == HOST_CHARSET_ASCII
				83	#define SOURCE_CHARSET "UTF-8"
				84	#elif HOST_CHARSET == HOST_CHARSET_EBCDIC
				85	#define SOURCE_CHARSET "UTF-EBCDIC"
				86	#else
				87	#error "Unrecognized basic host character set"
				88	#endif
				89
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	90	#ifndef EILSEQ
				91	#define EILSEQ EINVAL
				92	#endif
				93
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	94	/* This structure is used for a resizable string buffer throughout. */
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	95	/* Don't call it strbuf, as that conflicts with unistd.h on systems
Kazu Hirata	0ee55ad	2003-10-05 13:09:48 +0000	[diff] [blame]	96	such as DYNIX/ptx where unistd.h includes stropts.h. */
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	97	struct _cpp_strbuf
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	98	{
				99	uchar *text;
				100	size_t asize;
				101	size_t len;
				102	};
				103
				104	/* This is enough to hold any string that fits on a single 80-column
				105	line, even if iconv quadruples its size (e.g. conversion from
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	106	ASCII to UTF-32) rounded up to a power of two. */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	107	#define OUTBUF_BLOCK_SIZE 256
				108
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	109	/* Conversions between UTF-8 and UTF-16/32 are implemented by custom
				110	logic. This is because a depressing number of systems lack iconv,
				111	or have iconv libraries that do not do these conversions, so
				112	we need a fallback implementation for them. To ensure the fallback
				113	doesn't break due to neglect, it is used on all systems.
				114
				115	UTF-32 encoding is nice and simple: a four-byte binary number,
				116	constrained to the range 00000000-7FFFFFFF to avoid questions of
				117	signedness. We do have to cope with big- and little-endian
				118	variants.
				119
				120	UTF-16 encoding uses two-byte binary numbers, again in big- and
				121	little-endian variants, for all values in the 00000000-0000FFFF
				122	range. Values in the 00010000-0010FFFF range are encoded as pairs
				123	of two-byte numbers, called "surrogate pairs": given a number S in
				124	this range, it is mapped to a pair (H, L) as follows:
				125
				126	H = (S - 0x10000) / 0x400 + 0xD800
				127	L = (S - 0x10000) % 0x400 + 0xDC00
				128
				129	Two-byte values in the D800...DFFF range are ill-formed except as a
				130	component of a surrogate pair. Even if the encoding within a
				131	two-byte value is little-endian, the H member of the surrogate pair
				132	comes first.
				133
				134	There is no way to encode values in the 00110000-7FFFFFFF range,
				135	which is not currently a problem as there are no assigned code
				136	points in that range; however, the author expects that it will
				137	eventually become necessary to abandon UTF-16 due to this
				138	limitation. Note also that, because of these pairs, UTF-16 does
				139	not meet the requirements of the C standard for a wide character
				140	encoding (see 3.7.3 and 6.4.4.4p11).
				141
				142	UTF-8 encoding looks like this:
				143
				144	value range encoded as
				145	00000000-0000007F 0xxxxxxx
				146	00000080-000007FF 110xxxxx 10xxxxxx
				147	00000800-0000FFFF 1110xxxx 10xxxxxx 10xxxxxx
				148	00010000-001FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
				149	00200000-03FFFFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
				150	04000000-7FFFFFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
				151
				152	Values in the 0000D800 ... 0000DFFF range (surrogates) are invalid,
				153	which means that three-byte sequences ED xx yy, with A0 <= xx <= BF,
				154	never occur. Note also that any value that can be encoded by a
				155	given row of the table can also be encoded by all successive rows,
				156	but this is not done; only the shortest possible encoding for any
				157	given value is valid. For instance, the character 07C0 could be
				158	encoded as any of DF 80, E0 9F 80, F0 80 9F 80, F8 80 80 9F 80, or
				159	FC 80 80 80 9F 80. Only the first is valid.
				160
				161	An implementation note: the transformation from UTF-16 to UTF-8, or
				162	vice versa, is easiest done by using UTF-32 as an intermediary. */
				163
				164	/* Internal primitives which go from an UTF-8 byte stream to native-endian
				165	UTF-32 in a cppchar_t, or vice versa; this avoids an extra marshal/unmarshal
				166	operation in several places below. */
				167	static inline int
				168	one_utf8_to_cppchar (const uchar *inbufp, size_t inbytesleftp,
				169	cppchar_t *cp)
				170	{
				171	static const uchar masks[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x02, 0x01 };
				172	static const uchar patns[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	173
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	174	cppchar_t c;
				175	const uchar inbuf = inbufp;
				176	size_t nbytes, i;
				177
				178	if (*inbytesleftp < 1)
				179	return EINVAL;
				180
				181	c = *inbuf;
				182	if (c < 0x80)
				183	{
				184	*cp = c;
				185	*inbytesleftp -= 1;
				186	*inbufp += 1;
				187	return 0;
				188	}
				189
				190	/* The number of leading 1-bits in the first byte indicates how many
				191	bytes follow. */
				192	for (nbytes = 2; nbytes < 7; nbytes++)
				193	if ((c & ~masks[nbytes-1]) == patns[nbytes-1])
				194	goto found;
				195	return EILSEQ;
				196	found:
				197
				198	if (*inbytesleftp < nbytes)
				199	return EINVAL;
				200
				201	c = (c & masks[nbytes-1]);
				202	inbuf++;
				203	for (i = 1; i < nbytes; i++)
				204	{
				205	cppchar_t n = *inbuf++;
				206	if ((n & 0xC0) != 0x80)
				207	return EILSEQ;
				208	c = ((c << 6) + (n & 0x3F));
				209	}
				210
				211	/* Make sure the shortest possible encoding was used. */
				212	if (c <= 0x7F && nbytes > 1) return EILSEQ;
				213	if (c <= 0x7FF && nbytes > 2) return EILSEQ;
				214	if (c <= 0xFFFF && nbytes > 3) return EILSEQ;
				215	if (c <= 0x1FFFFF && nbytes > 4) return EILSEQ;
				216	if (c <= 0x3FFFFFF && nbytes > 5) return EILSEQ;
				217
				218	/* Make sure the character is valid. */
				219	if (c > 0x7FFFFFFF \|\| (c >= 0xD800 && c <= 0xDFFF)) return EILSEQ;
				220
				221	*cp = c;
				222	*inbufp = inbuf;
				223	*inbytesleftp -= nbytes;
				224	return 0;
				225	}
				226
				227	static inline int
				228	one_cppchar_to_utf8 (cppchar_t c, uchar *outbufp, size_t outbytesleftp)
				229	{
				230	static const uchar masks[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
				231	static const uchar limits[6] = { 0x80, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE };
				232	size_t nbytes;
				233	uchar buf[6], *p = &buf[6];
				234	uchar outbuf = outbufp;
				235
				236	nbytes = 1;
				237	if (c < 0x80)
				238	*--p = c;
				239	else
				240	{
				241	do
				242	{
				243	*--p = ((c & 0x3F) \| 0x80);
				244	c >>= 6;
				245	nbytes++;
				246	}
				247	while (c >= 0x3F \|\| (c & limits[nbytes-1]));
				248	*--p = (c \| masks[nbytes-1]);
				249	}
				250
				251	if (*outbytesleftp < nbytes)
				252	return E2BIG;
				253
				254	while (p < &buf[6])
				255	outbuf++ = p++;
				256	*outbytesleftp -= nbytes;
				257	*outbufp = outbuf;
				258	return 0;
				259	}
				260
				261	/* The following four functions transform one character between the two
				262	encodings named in the function name. All have the signature
				263	int (*)(iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
				264	uchar **outbufp, size_t *outbytesleftp)
				265
				266	BIGEND must have the value 0 or 1, coerced to (iconv_t); it is
				267	interpreted as a boolean indicating whether big-endian or
				268	little-endian encoding is to be used for the member of the pair
				269	that is not UTF-8.
				270
				271	INBUFP, INBYTESLEFTP, OUTBUFP, OUTBYTESLEFTP work exactly as they
				272	do for iconv.
				273
				274	The return value is either 0 for success, or an errno value for
				275	failure, which may be E2BIG (need more space), EILSEQ (ill-formed
				276	input sequence), or EINVAL (incomplete input sequence). */
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	277
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	278	static inline int
				279	one_utf8_to_utf32 (iconv_t bigend, const uchar *inbufp, size_t inbytesleftp,
				280	uchar *outbufp, size_t outbytesleftp)
				281	{
				282	uchar *outbuf;
Jan Hubicka	bd18496	2003-10-25 17:12:01 +0200	[diff] [blame]	283	cppchar_t s = 0;
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	284	int rval;
				285
				286	/* Check for space first, since we know exactly how much we need. */
				287	if (*outbytesleftp < 4)
				288	return E2BIG;
				289
				290	rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
				291	if (rval)
				292	return rval;
				293
				294	outbuf = *outbufp;
				295	outbuf[bigend ? 3 : 0] = (s & 0x000000FF);
				296	outbuf[bigend ? 2 : 1] = (s & 0x0000FF00) >> 8;
				297	outbuf[bigend ? 1 : 2] = (s & 0x00FF0000) >> 16;
				298	outbuf[bigend ? 0 : 3] = (s & 0xFF000000) >> 24;
				299
				300	*outbufp += 4;
				301	*outbytesleftp -= 4;
				302	return 0;
				303	}
				304
				305	static inline int
				306	one_utf32_to_utf8 (iconv_t bigend, const uchar *inbufp, size_t inbytesleftp,
				307	uchar *outbufp, size_t outbytesleftp)
				308	{
				309	cppchar_t s;
				310	int rval;
				311	const uchar *inbuf;
				312
				313	if (*inbytesleftp < 4)
				314	return EINVAL;
				315
				316	inbuf = *inbufp;
				317
				318	s = inbuf[bigend ? 0 : 3] << 24;
				319	s += inbuf[bigend ? 1 : 2] << 16;
				320	s += inbuf[bigend ? 2 : 1] << 8;
				321	s += inbuf[bigend ? 3 : 0];
				322
				323	if (s >= 0x7FFFFFFF \|\| (s >= 0xD800 && s <= 0xDFFF))
				324	return EILSEQ;
				325
				326	rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
				327	if (rval)
				328	return rval;
				329
				330	*inbufp += 4;
				331	*inbytesleftp -= 4;
				332	return 0;
				333	}
				334
				335	static inline int
				336	one_utf8_to_utf16 (iconv_t bigend, const uchar *inbufp, size_t inbytesleftp,
				337	uchar *outbufp, size_t outbytesleftp)
				338	{
				339	int rval;
Richard Henderson	671ca9e	2003-10-30 08:36:27 -0800	[diff] [blame]	340	cppchar_t s = 0;
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	341	const uchar save_inbuf = inbufp;
				342	size_t save_inbytesleft = *inbytesleftp;
				343	uchar outbuf = outbufp;
				344
				345	rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
				346	if (rval)
				347	return rval;
				348
				349	if (s > 0x0010FFFF)
				350	{
				351	*inbufp = save_inbuf;
				352	*inbytesleftp = save_inbytesleft;
				353	return EILSEQ;
				354	}
				355
				356	if (s < 0xFFFF)
				357	{
				358	if (*outbytesleftp < 2)
				359	{
				360	*inbufp = save_inbuf;
				361	*inbytesleftp = save_inbytesleft;
				362	return E2BIG;
				363	}
				364	outbuf[bigend ? 1 : 0] = (s & 0x00FF);
				365	outbuf[bigend ? 0 : 1] = (s & 0xFF00) >> 8;
				366
				367	*outbufp += 2;
				368	*outbytesleftp -= 2;
				369	return 0;
				370	}
				371	else
				372	{
				373	cppchar_t hi, lo;
				374
				375	if (*outbytesleftp < 4)
				376	{
				377	*inbufp = save_inbuf;
				378	*inbytesleftp = save_inbytesleft;
				379	return E2BIG;
				380	}
				381
				382	hi = (s - 0x10000) / 0x400 + 0xD800;
				383	lo = (s - 0x10000) % 0x400 + 0xDC00;
				384
				385	/* Even if we are little-endian, put the high surrogate first.
				386	??? Matches practice? */
				387	outbuf[bigend ? 1 : 0] = (hi & 0x00FF);
				388	outbuf[bigend ? 0 : 1] = (hi & 0xFF00) >> 8;
				389	outbuf[bigend ? 3 : 2] = (lo & 0x00FF);
				390	outbuf[bigend ? 2 : 3] = (lo & 0xFF00) >> 8;
				391
				392	*outbufp += 4;
				393	*outbytesleftp -= 4;
				394	return 0;
				395	}
				396	}
				397
				398	static inline int
				399	one_utf16_to_utf8 (iconv_t bigend, const uchar *inbufp, size_t inbytesleftp,
				400	uchar *outbufp, size_t outbytesleftp)
				401	{
				402	cppchar_t s;
				403	const uchar inbuf = inbufp;
				404	int rval;
				405
				406	if (*inbytesleftp < 2)
				407	return EINVAL;
				408	s = inbuf[bigend ? 0 : 1] << 8;
				409	s += inbuf[bigend ? 1 : 0];
				410
				411	/* Low surrogate without immediately preceding high surrogate is invalid. */
				412	if (s >= 0xDC00 && s <= 0xDFFF)
				413	return EILSEQ;
				414	/* High surrogate must have a following low surrogate. */
				415	else if (s >= 0xD800 && s <= 0xDBFF)
				416	{
				417	cppchar_t hi = s, lo;
				418	if (*inbytesleftp < 4)
				419	return EINVAL;
				420
				421	lo = inbuf[bigend ? 2 : 3] << 8;
				422	lo += inbuf[bigend ? 3 : 2];
				423
				424	if (lo < 0xDC00 \|\| lo > 0xDFFF)
				425	return EILSEQ;
				426
				427	s = (hi - 0xD800) * 0x400 + (lo - 0xDC00) + 0x10000;
				428	}
				429
				430	rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
				431	if (rval)
				432	return rval;
				433
				434	/* Success - update the input pointers (one_cppchar_to_utf8 has done
				435	the output pointers for us). */
				436	if (s <= 0xFFFF)
				437	{
				438	*inbufp += 2;
				439	*inbytesleftp -= 2;
				440	}
				441	else
				442	{
				443	*inbufp += 4;
				444	*inbytesleftp -= 4;
				445	}
				446	return 0;
				447	}
				448
				449	/* Helper routine for the next few functions. The 'const' on
				450	one_conversion means that we promise not to modify what function is
Kazu Hirata	4ed4321	2003-07-12 22:49:48 +0000	[diff] [blame]	451	pointed to, which lets the inliner see through it. */
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	452
				453	static inline bool
				454	conversion_loop (int (const one_conversion)(iconv_t, const uchar , size_t ,
				455	uchar *, size_t ),
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	456	iconv_t cd, const uchar from, size_t flen, struct _cpp_strbuf to)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	457	{
				458	const uchar *inbuf;
				459	uchar *outbuf;
				460	size_t inbytesleft, outbytesleft;
				461	int rval;
				462
				463	inbuf = from;
				464	inbytesleft = flen;
				465	outbuf = to->text + to->len;
				466	outbytesleft = to->asize - to->len;
				467
				468	for (;;)
				469	{
				470	do
				471	rval = one_conversion (cd, &inbuf, &inbytesleft,
				472	&outbuf, &outbytesleft);
				473	while (inbytesleft && !rval);
				474
				475	if (__builtin_expect (inbytesleft == 0, 1))
				476	{
				477	to->len = to->asize - outbytesleft;
				478	return true;
				479	}
				480	if (rval != E2BIG)
				481	{
				482	errno = rval;
				483	return false;
				484	}
				485
				486	outbytesleft += OUTBUF_BLOCK_SIZE;
				487	to->asize += OUTBUF_BLOCK_SIZE;
				488	to->text = xrealloc (to->text, to->asize);
				489	outbuf = to->text + to->asize - outbytesleft;
				490	}
				491	}
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	492
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	493
				494	/* These functions convert entire strings between character sets.
				495	They all have the signature
				496
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	497	bool (*)(iconv_t cd, const uchar *from, size_t flen, struct _cpp_strbuf *to);
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	498
				499	The input string FROM is converted as specified by the function
				500	name plus the iconv descriptor CD (which may be fake), and the
				501	result appended to TO. On any error, false is returned, otherwise true. */
				502
				503	/* These four use the custom conversion code above. */
				504	static bool
				505	convert_utf8_utf16 (iconv_t cd, const uchar *from, size_t flen,
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	506	struct _cpp_strbuf *to)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	507	{
				508	return conversion_loop (one_utf8_to_utf16, cd, from, flen, to);
				509	}
				510
				511	static bool
				512	convert_utf8_utf32 (iconv_t cd, const uchar *from, size_t flen,
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	513	struct _cpp_strbuf *to)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	514	{
				515	return conversion_loop (one_utf8_to_utf32, cd, from, flen, to);
				516	}
				517
				518	static bool
				519	convert_utf16_utf8 (iconv_t cd, const uchar *from, size_t flen,
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	520	struct _cpp_strbuf *to)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	521	{
				522	return conversion_loop (one_utf16_to_utf8, cd, from, flen, to);
				523	}
				524
				525	static bool
				526	convert_utf32_utf8 (iconv_t cd, const uchar *from, size_t flen,
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	527	struct _cpp_strbuf *to)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	528	{
				529	return conversion_loop (one_utf32_to_utf8, cd, from, flen, to);
				530	}
				531
				532	/* Identity conversion, used when we have no alternative. */
				533	static bool
				534	convert_no_conversion (iconv_t cd ATTRIBUTE_UNUSED,
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	535	const uchar from, size_t flen, struct _cpp_strbuf to)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	536	{
				537	if (to->len + flen > to->asize)
				538	{
				539	to->asize = to->len + flen;
				540	to->text = xrealloc (to->text, to->asize);
				541	}
				542	memcpy (to->text + to->len, from, flen);
				543	to->len += flen;
				544	return true;
				545	}
				546
				547	/* And this one uses the system iconv primitive. It's a little
				548	different, since iconv's interface is a little different. */
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	549	#if HAVE_ICONV
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	550	static bool
				551	convert_using_iconv (iconv_t cd, const uchar *from, size_t flen,
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	552	struct _cpp_strbuf *to)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	553	{
				554	ICONV_CONST char *inbuf;
				555	char *outbuf;
				556	size_t inbytesleft, outbytesleft;
				557
				558	/* Reset conversion descriptor and check that it is valid. */
				559	if (iconv (cd, 0, 0, 0, 0) == (size_t)-1)
				560	return false;
				561
				562	inbuf = (ICONV_CONST char *)from;
				563	inbytesleft = flen;
				564	outbuf = (char *)to->text + to->len;
				565	outbytesleft = to->asize - to->len;
				566
				567	for (;;)
				568	{
				569	iconv (cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
				570	if (__builtin_expect (inbytesleft == 0, 1))
				571	{
				572	to->len = to->asize - outbytesleft;
				573	return true;
				574	}
				575	if (errno != E2BIG)
				576	return false;
				577
				578	outbytesleft += OUTBUF_BLOCK_SIZE;
				579	to->asize += OUTBUF_BLOCK_SIZE;
				580	to->text = xrealloc (to->text, to->asize);
				581	outbuf = (char *)to->text + to->asize - outbytesleft;
				582	}
				583	}
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	584	#else
				585	#define convert_using_iconv 0 /* prevent undefined symbol error below */
				586	#endif
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	587
				588	/* Arrange for the above custom conversion logic to be used automatically
				589	when conversion between a suitable pair of character sets is requested. */
				590
				591	#define APPLY_CONVERSION(CONVERTER, FROM, FLEN, TO) \
				592	CONVERTER.func (CONVERTER.cd, FROM, FLEN, TO)
				593
				594	struct conversion
				595	{
				596	const char *pair;
				597	convert_f func;
				598	iconv_t fake_cd;
				599	};
				600	static const struct conversion conversion_tab[] = {
				601	{ "UTF-8/UTF-32LE", convert_utf8_utf32, (iconv_t)0 },
				602	{ "UTF-8/UTF-32BE", convert_utf8_utf32, (iconv_t)1 },
				603	{ "UTF-8/UTF-16LE", convert_utf8_utf16, (iconv_t)0 },
				604	{ "UTF-8/UTF-16BE", convert_utf8_utf16, (iconv_t)1 },
				605	{ "UTF-32LE/UTF-8", convert_utf32_utf8, (iconv_t)0 },
				606	{ "UTF-32BE/UTF-8", convert_utf32_utf8, (iconv_t)1 },
				607	{ "UTF-16LE/UTF-8", convert_utf16_utf8, (iconv_t)0 },
				608	{ "UTF-16BE/UTF-8", convert_utf16_utf8, (iconv_t)1 },
				609	};
				610
				611	/* Subroutine of cpp_init_iconv: initialize and return a
				612	cset_converter structure for conversion from FROM to TO. If
				613	iconv_open() fails, issue an error and return an identity
				614	converter. Silently return an identity converter if FROM and TO
				615	are identical. */
				616	static struct cset_converter
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	617	init_iconv_desc (cpp_reader pfile, const char to, const char *from)
				618	{
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	619	struct cset_converter ret;
				620	char *pair;
				621	size_t i;
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	622
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	623	if (!strcasecmp (to, from))
				624	{
				625	ret.func = convert_no_conversion;
				626	ret.cd = (iconv_t) -1;
				627	return ret;
				628	}
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	629
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	630	pair = alloca(strlen(to) + strlen(from) + 2);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	631
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	632	strcpy(pair, from);
				633	strcat(pair, "/");
				634	strcat(pair, to);
				635	for (i = 0; i < ARRAY_SIZE (conversion_tab); i++)
				636	if (!strcasecmp (pair, conversion_tab[i].pair))
				637	{
				638	ret.func = conversion_tab[i].func;
				639	ret.cd = conversion_tab[i].fake_cd;
				640	return ret;
				641	}
				642
				643	/* No custom converter - try iconv. */
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	644	if (HAVE_ICONV)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	645	{
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	646	ret.func = convert_using_iconv;
				647	ret.cd = iconv_open (to, from);
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	648
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	649	if (ret.cd == (iconv_t) -1)
				650	{
				651	if (errno == EINVAL)
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	652	cpp_error (pfile, CPP_DL_ERROR, /* FIXME should be DL_SORRY */
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	653	"conversion from %s to %s not supported by iconv",
				654	from, to);
				655	else
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	656	cpp_errno (pfile, CPP_DL_ERROR, "iconv_open");
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	657
				658	ret.func = convert_no_conversion;
				659	}
				660	}
				661	else
				662	{
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	663	cpp_error (pfile, CPP_DL_ERROR, /* FIXME: should be DL_SORRY */
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	664	"no iconv implementation, cannot convert from %s to %s",
				665	from, to);
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	666	ret.func = convert_no_conversion;
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	667	ret.cd = (iconv_t) -1;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	668	}
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	669	return ret;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	670	}
				671
				672	/* If charset conversion is requested, initialize iconv(3) descriptors
				673	for conversion from the source character set to the execution
				674	character sets. If iconv is not present in the C library, and
				675	conversion is requested, issue an error. */
				676
				677	void
				678	cpp_init_iconv (cpp_reader *pfile)
				679	{
				680	const char *ncset = CPP_OPTION (pfile, narrow_charset);
				681	const char *wcset = CPP_OPTION (pfile, wide_charset);
				682	const char *default_wcset;
				683
				684	bool be = CPP_OPTION (pfile, bytes_big_endian);
				685
				686	if (CPP_OPTION (pfile, wchar_precision) >= 32)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	687	default_wcset = be ? "UTF-32BE" : "UTF-32LE";
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	688	else if (CPP_OPTION (pfile, wchar_precision) >= 16)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	689	default_wcset = be ? "UTF-16BE" : "UTF-16LE";
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	690	else
				691	/* This effectively means that wide strings are not supported,
				692	so don't do any conversion at all. */
				693	default_wcset = SOURCE_CHARSET;
				694
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	695	if (!ncset)
				696	ncset = SOURCE_CHARSET;
				697	if (!wcset)
				698	wcset = default_wcset;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	699
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	700	pfile->narrow_cset_desc = init_iconv_desc (pfile, ncset, SOURCE_CHARSET);
				701	pfile->wide_cset_desc = init_iconv_desc (pfile, wcset, SOURCE_CHARSET);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	702	}
				703
				704	void
				705	_cpp_destroy_iconv (cpp_reader *pfile)
				706	{
				707	if (HAVE_ICONV)
				708	{
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	709	if (pfile->narrow_cset_desc.func == convert_using_iconv)
				710	iconv_close (pfile->narrow_cset_desc.cd);
				711	if (pfile->wide_cset_desc.func == convert_using_iconv)
				712	iconv_close (pfile->wide_cset_desc.cd);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	713	}
				714	}
				715
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	716
				717	/* Utility routine that computes a mask of the form 0000...111... with
				718	WIDTH 1-bits. */
				719	static inline size_t
				720	width_to_mask (size_t width)
				721	{
				722	width = MIN (width, BITS_PER_CPPCHAR_T);
				723	if (width >= CHAR_BIT * sizeof (size_t))
				724	return ~(size_t) 0;
				725	else
				726	return ((size_t) 1 << width) - 1;
				727	}
				728
				729
				730
				731	/* Returns 1 if C is valid in an identifier, 2 if C is valid except at
				732	the start of an identifier, and 0 if C is not valid in an
				733	identifier. We assume C has already gone through the checks of
				734	_cpp_valid_ucn. The algorithm is a simple binary search on the
				735	table defined in cppucnid.h. */
				736
				737	static int
				738	ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c)
				739	{
				740	int mn, mx, md;
				741
				742	mn = -1;
				743	mx = ARRAY_SIZE (ucnranges);
				744	while (mx - mn > 1)
				745	{
				746	md = (mn + mx) / 2;
				747	if (c < ucnranges[md].lo)
				748	mx = md;
				749	else if (c > ucnranges[md].hi)
				750	mn = md;
				751	else
				752	goto found;
				753	}
				754	return 0;
				755
				756	found:
				757	/* When -pedantic, we require the character to have been listed by
				758	the standard for the current language. Otherwise, we accept the
				759	union of the acceptable sets for C++98 and C99. */
				760	if (CPP_PEDANTIC (pfile)
				761	&& ((CPP_OPTION (pfile, c99) && !(ucnranges[md].flags & C99))
				762	\|\| (CPP_OPTION (pfile, cplusplus)
				763	&& !(ucnranges[md].flags & CXX))))
				764	return 0;
				765
				766	/* In C99, UCN digits may not begin identifiers. */
				767	if (CPP_OPTION (pfile, c99) && (ucnranges[md].flags & DIG))
				768	return 2;
				769
				770	return 1;
				771	}
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	772
				773	/* [lex.charset]: The character designated by the universal character
				774	name \UNNNNNNNN is that character whose character short name in
				775	ISO/IEC 10646 is NNNNNNNN; the character designated by the
				776	universal character name \uNNNN is that character whose character
				777	short name in ISO/IEC 10646 is 0000NNNN. If the hexadecimal value
				778	for a universal character name is less than 0x20 or in the range
				779	0x7F-0x9F (inclusive), or if the universal character name
				780	designates a character in the basic source character set, then the
				781	program is ill-formed.
				782
				783	*PSTR must be preceded by "\u" or "\U"; it is assumed that the
				784	buffer end is delimited by a non-hex digit. Returns zero if UCNs
				785	are not part of the relevant standard, or if the string beginning
				786	at *PSTR doesn't syntactically match the form 'NNNN' or 'NNNNNNNN'.
				787
Kazu Hirata	6356f89	2003-06-12 19:01:08 +0000	[diff] [blame]	788	Otherwise the nonzero value of the UCN, whether valid or invalid,
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	789	is returned. Diagnostics are emitted for invalid values. PSTR
				790	is updated to point one beyond the UCN, or to the syntactically
				791	invalid character.
				792
				793	IDENTIFIER_POS is 0 when not in an identifier, 1 for the start of
				794	an identifier, or 2 otherwise.
				795	*/
				796
				797	cppchar_t
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	798	_cpp_valid_ucn (cpp_reader pfile, const uchar *pstr,
				799	const uchar *limit, int identifier_pos)
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	800	{
				801	cppchar_t result, c;
				802	unsigned int length;
				803	const uchar str = pstr;
				804	const uchar *base = str - 2;
				805
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	806	if (!CPP_OPTION (pfile, cplusplus) && !CPP_OPTION (pfile, c99))
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	807	cpp_error (pfile, CPP_DL_WARNING,
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	808	"universal character names are only valid in C++ and C99");
				809	else if (CPP_WTRADITIONAL (pfile) && identifier_pos == 0)
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	810	cpp_error (pfile, CPP_DL_WARNING,
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	811	"the meaning of '\\%c' is different in traditional C",
				812	(int) str[-1]);
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	813
				814	if (str[-1] == 'u')
				815	length = 4;
				816	else if (str[-1] == 'U')
				817	length = 8;
				818	else
				819	abort();
				820
				821	result = 0;
				822	do
				823	{
				824	c = *str;
				825	if (!ISXDIGIT (c))
				826	break;
				827	str++;
				828	result = (result << 4) + hex_value (c);
				829	}
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	830	while (--length && str < limit);
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	831
				832	*pstr = str;
				833	if (length)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	834	{
				835	/* We'll error when we try it out as the start of an identifier. */
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	836	cpp_error (pfile, CPP_DL_ERROR,
				837	"incomplete universal character name %.*s",
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	838	(int) (str - base), base);
				839	result = 1;
				840	}
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	841	/* The standard permits $, @ and ` to be specified as UCNs. We use
				842	hex escapes so that this also works with EBCDIC hosts. */
				843	else if ((result < 0xa0
				844	&& (result != 0x24 && result != 0x40 && result != 0x60))
				845	\|\| (result & 0x80000000)
				846	\|\| (result >= 0xD800 && result <= 0xDFFF))
				847	{
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	848	cpp_error (pfile, CPP_DL_ERROR,
				849	"%.*s is not a valid universal character",
Andreas Jaeger	0e7a8c4	2003-04-21 14:06:12 +0200	[diff] [blame]	850	(int) (str - base), base);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	851	result = 1;
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	852	}
				853	else if (identifier_pos)
				854	{
				855	int validity = ucn_valid_in_identifier (pfile, result);
				856
				857	if (validity == 0)
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	858	cpp_error (pfile, CPP_DL_ERROR,
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	859	"universal character %.*s is not valid in an identifier",
Andreas Jaeger	0e7a8c4	2003-04-21 14:06:12 +0200	[diff] [blame]	860	(int) (str - base), base);
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	861	else if (validity == 2 && identifier_pos == 1)
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	862	cpp_error (pfile, CPP_DL_ERROR,
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	863	"universal character %.*s is not valid at the start of an identifier",
Andreas Jaeger	0e7a8c4	2003-04-21 14:06:12 +0200	[diff] [blame]	864	(int) (str - base), base);
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	865	}
				866
				867	if (result == 0)
				868	result = 1;
				869
				870	return result;
				871	}
				872
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	873	/* Convert an UCN, pointed to by FROM, to UTF-8 encoding, then translate
				874	it to the execution character set and write the result into TBUF.
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	875	An advanced pointer is returned. Issues all relevant diagnostics. */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	876
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	877
				878	static const uchar *
				879	convert_ucn (cpp_reader pfile, const uchar from, const uchar *limit,
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	880	struct _cpp_strbuf *tbuf, bool wide)
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	881	{
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	882	cppchar_t ucn;
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	883	uchar buf[6];
				884	uchar *bufp = buf;
				885	size_t bytesleft = 6;
				886	int rval;
				887	struct cset_converter cvt
				888	= wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	889
Kazu Hirata	e0a21ab	2004-01-16 01:44:06 +0000	[diff] [blame]	890	from++; /* Skip u/U. */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	891	ucn = _cpp_valid_ucn (pfile, &from, limit, 0);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	892
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	893	rval = one_cppchar_to_utf8 (ucn, &bufp, &bytesleft);
				894	if (rval)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	895	{
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	896	errno = rval;
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	897	cpp_errno (pfile, CPP_DL_ERROR,
				898	"converting UCN to source character set");
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	899	}
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	900	else if (!APPLY_CONVERSION (cvt, buf, 6 - bytesleft, tbuf))
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	901	cpp_errno (pfile, CPP_DL_ERROR,
				902	"converting UCN to execution character set");
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	903
				904	return from;
				905	}
				906
				907	static void
				908	emit_numeric_escape (cpp_reader *pfile, cppchar_t n,
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	909	struct _cpp_strbuf *tbuf, bool wide)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	910	{
				911	if (wide)
				912	{
				913	/* We have to render this into the target byte order, which may not
				914	be our byte order. */
				915	bool bigend = CPP_OPTION (pfile, bytes_big_endian);
				916	size_t width = CPP_OPTION (pfile, wchar_precision);
				917	size_t cwidth = CPP_OPTION (pfile, char_precision);
				918	size_t cmask = width_to_mask (cwidth);
				919	size_t nbwc = width / cwidth;
				920	size_t i;
				921	size_t off = tbuf->len;
				922	cppchar_t c;
				923
				924	if (tbuf->len + nbwc > tbuf->asize)
				925	{
				926	tbuf->asize += OUTBUF_BLOCK_SIZE;
				927	tbuf->text = xrealloc (tbuf->text, tbuf->asize);
				928	}
				929
				930	for (i = 0; i < nbwc; i++)
				931	{
				932	c = n & cmask;
				933	n >>= cwidth;
				934	tbuf->text[off + (bigend ? nbwc - i - 1 : i)] = c;
				935	}
				936	tbuf->len += nbwc;
				937	}
				938	else
				939	{
				940	if (tbuf->len + 1 > tbuf->asize)
				941	{
				942	tbuf->asize += OUTBUF_BLOCK_SIZE;
				943	tbuf->text = xrealloc (tbuf->text, tbuf->asize);
				944	}
				945	tbuf->text[tbuf->len++] = n;
				946	}
				947	}
				948
				949	/* Convert a hexadecimal escape, pointed to by FROM, to the execution
				950	character set and write it into the string buffer TBUF. Returns an
				951	advanced pointer, and issues diagnostics as necessary.
				952	No character set translation occurs; this routine always produces the
				953	execution-set character with numeric value equal to the given hex
				954	number. You can, e.g. generate surrogate pairs this way. */
				955	static const uchar *
				956	convert_hex (cpp_reader pfile, const uchar from, const uchar *limit,
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	957	struct _cpp_strbuf *tbuf, bool wide)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	958	{
				959	cppchar_t c, n = 0, overflow = 0;
				960	int digits_found = 0;
				961	size_t width = (wide ? CPP_OPTION (pfile, wchar_precision)
				962	: CPP_OPTION (pfile, char_precision));
				963	size_t mask = width_to_mask (width);
				964
				965	if (CPP_WTRADITIONAL (pfile))
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	966	cpp_error (pfile, CPP_DL_WARNING,
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	967	"the meaning of '\\x' is different in traditional C");
				968
Kazu Hirata	e0a21ab	2004-01-16 01:44:06 +0000	[diff] [blame]	969	from++; /* Skip 'x'. */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	970	while (from < limit)
				971	{
				972	c = *from;
				973	if (! hex_p (c))
				974	break;
				975	from++;
				976	overflow \|= n ^ (n << 4 >> 4);
				977	n = (n << 4) + hex_value (c);
				978	digits_found = 1;
				979	}
				980
				981	if (!digits_found)
				982	{
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	983	cpp_error (pfile, CPP_DL_ERROR,
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	984	"\\x used with no following hex digits");
				985	return from;
				986	}
				987
				988	if (overflow \| (n != (n & mask)))
				989	{
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	990	cpp_error (pfile, CPP_DL_PEDWARN,
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	991	"hex escape sequence out of range");
				992	n &= mask;
				993	}
				994
				995	emit_numeric_escape (pfile, n, tbuf, wide);
				996
				997	return from;
				998	}
				999
				1000	/* Convert an octal escape, pointed to by FROM, to the execution
				1001	character set and write it into the string buffer TBUF. Returns an
				1002	advanced pointer, and issues diagnostics as necessary.
				1003	No character set translation occurs; this routine always produces the
				1004	execution-set character with numeric value equal to the given octal
				1005	number. */
				1006	static const uchar *
				1007	convert_oct (cpp_reader pfile, const uchar from, const uchar *limit,
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	1008	struct _cpp_strbuf *tbuf, bool wide)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1009	{
				1010	size_t count = 0;
				1011	cppchar_t c, n = 0;
				1012	size_t width = (wide ? CPP_OPTION (pfile, wchar_precision)
				1013	: CPP_OPTION (pfile, char_precision));
				1014	size_t mask = width_to_mask (width);
				1015	bool overflow = false;
				1016
				1017	while (from < limit && count++ < 3)
				1018	{
				1019	c = *from;
				1020	if (c < '0' \|\| c > '7')
				1021	break;
				1022	from++;
				1023	overflow \|= n ^ (n << 3 >> 3);
				1024	n = (n << 3) + c - '0';
				1025	}
				1026
				1027	if (n != (n & mask))
				1028	{
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1029	cpp_error (pfile, CPP_DL_PEDWARN,
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1030	"octal escape sequence out of range");
				1031	n &= mask;
				1032	}
				1033
				1034	emit_numeric_escape (pfile, n, tbuf, wide);
				1035
				1036	return from;
				1037	}
				1038
				1039	/* Convert an escape sequence (pointed to by FROM) to its value on
				1040	the target, and to the execution character set. Do not scan past
				1041	LIMIT. Write the converted value into TBUF. Returns an advanced
				1042	pointer. Handles all relevant diagnostics. */
				1043	static const uchar *
				1044	convert_escape (cpp_reader pfile, const uchar from, const uchar *limit,
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	1045	struct _cpp_strbuf *tbuf, bool wide)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1046	{
				1047	/* Values of \a \b \e \f \n \r \t \v respectively. */
				1048	#if HOST_CHARSET == HOST_CHARSET_ASCII
				1049	static const uchar charconsts[] = { 7, 8, 27, 12, 10, 13, 9, 11 };
				1050	#elif HOST_CHARSET == HOST_CHARSET_EBCDIC
				1051	static const uchar charconsts[] = { 47, 22, 39, 12, 21, 13, 5, 11 };
				1052	#else
				1053	#error "unknown host character set"
				1054	#endif
				1055
				1056	uchar c;
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1057	struct cset_converter cvt
				1058	= wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1059
				1060	c = *from;
				1061	switch (c)
				1062	{
				1063	/* UCNs, hex escapes, and octal escapes are processed separately. */
				1064	case 'u': case 'U':
				1065	return convert_ucn (pfile, from, limit, tbuf, wide);
				1066
				1067	case 'x':
				1068	return convert_hex (pfile, from, limit, tbuf, wide);
				1069	break;
				1070
				1071	case '0': case '1': case '2': case '3':
				1072	case '4': case '5': case '6': case '7':
				1073	return convert_oct (pfile, from, limit, tbuf, wide);
				1074
				1075	/* Various letter escapes. Get the appropriate host-charset
				1076	value into C. */
				1077	case '\\': case '\'': case '"': case '?': break;
				1078
				1079	case '(': case '{': case '[': case '%':
				1080	/* '\(', etc, can be used at the beginning of a line in a long
				1081	string split onto multiple lines with \-newline, to prevent
				1082	Emacs or other text editors from getting confused. '\%' can
				1083	be used to prevent SCCS from mangling printf format strings. */
				1084	if (CPP_PEDANTIC (pfile))
				1085	goto unknown;
				1086	break;
				1087
				1088	case 'b': c = charconsts[1]; break;
				1089	case 'f': c = charconsts[3]; break;
				1090	case 'n': c = charconsts[4]; break;
				1091	case 'r': c = charconsts[5]; break;
				1092	case 't': c = charconsts[6]; break;
				1093	case 'v': c = charconsts[7]; break;
				1094
				1095	case 'a':
				1096	if (CPP_WTRADITIONAL (pfile))
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1097	cpp_error (pfile, CPP_DL_WARNING,
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1098	"the meaning of '\\a' is different in traditional C");
				1099	c = charconsts[0];
				1100	break;
				1101
				1102	case 'e': case 'E':
				1103	if (CPP_PEDANTIC (pfile))
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1104	cpp_error (pfile, CPP_DL_PEDWARN,
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1105	"non-ISO-standard escape sequence, '\\%c'", (int) c);
				1106	c = charconsts[2];
				1107	break;
				1108
				1109	default:
				1110	unknown:
				1111	if (ISGRAPH (c))
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1112	cpp_error (pfile, CPP_DL_PEDWARN,
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1113	"unknown escape sequence '\\%c'", (int) c);
				1114	else
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1115	cpp_error (pfile, CPP_DL_PEDWARN,
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1116	"unknown escape sequence: '\\%03o'", (int) c);
				1117	}
				1118
				1119	/* Now convert what we have to the execution character set. */
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1120	if (!APPLY_CONVERSION (cvt, &c, 1, tbuf))
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1121	cpp_errno (pfile, CPP_DL_ERROR,
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1122	"converting escape sequence to execution character set");
				1123
				1124	return from + 1;
				1125	}
				1126
				1127	/* FROM is an array of cpp_string structures of length COUNT. These
				1128	are to be converted from the source to the execution character set,
				1129	escape sequences translated, and finally all are to be
				1130	concatenated. WIDE indicates whether or not to produce a wide
				1131	string. The result is written into TO. Returns true for success,
				1132	false for failure. */
				1133	bool
				1134	cpp_interpret_string (cpp_reader pfile, const cpp_string from, size_t count,
				1135	cpp_string *to, bool wide)
				1136	{
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	1137	struct _cpp_strbuf tbuf;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1138	const uchar p, base, *limit;
				1139	size_t i;
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1140	struct cset_converter cvt
				1141	= wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1142
				1143	tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, from->len);
				1144	tbuf.text = xmalloc (tbuf.asize);
				1145	tbuf.len = 0;
				1146
				1147	for (i = 0; i < count; i++)
				1148	{
				1149	p = from[i].text;
				1150	if (*p == 'L') p++;
Kazu Hirata	e0a21ab	2004-01-16 01:44:06 +0000	[diff] [blame]	1151	p++; /* Skip leading quote. */
				1152	limit = from[i].text + from[i].len - 1; /* Skip trailing quote. */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1153
				1154	for (;;)
				1155	{
				1156	base = p;
				1157	while (p < limit && *p != '\\')
				1158	p++;
				1159	if (p > base)
				1160	{
				1161	/* We have a run of normal characters; these can be fed
				1162	directly to convert_cset. */
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1163	if (!APPLY_CONVERSION (cvt, base, p - base, &tbuf))
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1164	goto fail;
				1165	}
				1166	if (p == limit)
				1167	break;
				1168
				1169	p = convert_escape (pfile, p + 1, limit, &tbuf, wide);
				1170	}
				1171	}
				1172	/* NUL-terminate the 'to' buffer and translate it to a cpp_string
				1173	structure. */
				1174	emit_numeric_escape (pfile, 0, &tbuf, wide);
				1175	tbuf.text = xrealloc (tbuf.text, tbuf.len);
				1176	to->text = tbuf.text;
				1177	to->len = tbuf.len;
				1178	return true;
				1179
				1180	fail:
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1181	cpp_errno (pfile, CPP_DL_ERROR, "converting to execution character set");
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1182	free (tbuf.text);
				1183	return false;
				1184	}
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1185
				1186	/* Subroutine of do_line and do_linemarker. Convert escape sequences
				1187	in a string, but do not perform character set conversion. */
				1188	bool
Eric Christopher	423e95e	2004-02-12 02:25:03 +0000	[diff] [blame]	1189	cpp_interpret_string_notranslate (cpp_reader pfile, const cpp_string from,
				1190	size_t count, cpp_string *to, bool wide)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1191	{
				1192	struct cset_converter save_narrow_cset_desc = pfile->narrow_cset_desc;
				1193	bool retval;
				1194
				1195	pfile->narrow_cset_desc.func = convert_no_conversion;
				1196	pfile->narrow_cset_desc.cd = (iconv_t) -1;
				1197
Eric Christopher	423e95e	2004-02-12 02:25:03 +0000	[diff] [blame]	1198	retval = cpp_interpret_string (pfile, from, count, to, wide);
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1199
				1200	pfile->narrow_cset_desc = save_narrow_cset_desc;
				1201	return retval;
				1202	}
				1203
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1204
				1205	/* Subroutine of cpp_interpret_charconst which performs the conversion
				1206	to a number, for narrow strings. STR is the string structure returned
				1207	by cpp_interpret_string. PCHARS_SEEN and UNSIGNEDP are as for
				1208	cpp_interpret_charconst. */
				1209	static cppchar_t
				1210	narrow_str_to_charconst (cpp_reader *pfile, cpp_string str,
				1211	unsigned int pchars_seen, int unsignedp)
				1212	{
				1213	size_t width = CPP_OPTION (pfile, char_precision);
				1214	size_t max_chars = CPP_OPTION (pfile, int_precision) / width;
				1215	size_t mask = width_to_mask (width);
				1216	size_t i;
				1217	cppchar_t result, c;
				1218	bool unsigned_p;
				1219
				1220	/* The value of a multi-character character constant, or a
				1221	single-character character constant whose representation in the
				1222	execution character set is more than one byte long, is
				1223	implementation defined. This implementation defines it to be the
				1224	number formed by interpreting the byte sequence in memory as a
				1225	big-endian binary number. If overflow occurs, the high bytes are
				1226	lost, and a warning is issued.
				1227
				1228	We don't want to process the NUL terminator handed back by
				1229	cpp_interpret_string. */
				1230	result = 0;
				1231	for (i = 0; i < str.len - 1; i++)
				1232	{
				1233	c = str.text[i] & mask;
				1234	if (width < BITS_PER_CPPCHAR_T)
				1235	result = (result << width) \| c;
				1236	else
				1237	result = c;
				1238	}
				1239
				1240	if (i > max_chars)
				1241	{
				1242	i = max_chars;
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1243	cpp_error (pfile, CPP_DL_WARNING,
				1244	"character constant too long for its type");
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1245	}
				1246	else if (i > 1 && CPP_OPTION (pfile, warn_multichar))
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1247	cpp_error (pfile, CPP_DL_WARNING, "multi-character character constant");
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1248
				1249	/* Multichar constants are of type int and therefore signed. */
				1250	if (i > 1)
				1251	unsigned_p = 0;
				1252	else
				1253	unsigned_p = CPP_OPTION (pfile, unsigned_char);
				1254
				1255	/* Truncate the constant to its natural width, and simultaneously
				1256	sign- or zero-extend to the full width of cppchar_t.
				1257	For single-character constants, the value is WIDTH bits wide.
				1258	For multi-character constants, the value is INT_PRECISION bits wide. */
				1259	if (i > 1)
				1260	width = CPP_OPTION (pfile, int_precision);
				1261	if (width < BITS_PER_CPPCHAR_T)
				1262	{
				1263	mask = ((cppchar_t) 1 << width) - 1;
				1264	if (unsigned_p \|\| !(result & (1 << (width - 1))))
				1265	result &= mask;
				1266	else
				1267	result \|= ~mask;
				1268	}
				1269	*pchars_seen = i;
				1270	*unsignedp = unsigned_p;
				1271	return result;
				1272	}
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	1273
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1274	/* Subroutine of cpp_interpret_charconst which performs the conversion
				1275	to a number, for wide strings. STR is the string structure returned
				1276	by cpp_interpret_string. PCHARS_SEEN and UNSIGNEDP are as for
				1277	cpp_interpret_charconst. */
				1278	static cppchar_t
				1279	wide_str_to_charconst (cpp_reader *pfile, cpp_string str,
				1280	unsigned int pchars_seen, int unsignedp)
				1281	{
				1282	bool bigend = CPP_OPTION (pfile, bytes_big_endian);
				1283	size_t width = CPP_OPTION (pfile, wchar_precision);
				1284	size_t cwidth = CPP_OPTION (pfile, char_precision);
				1285	size_t mask = width_to_mask (width);
				1286	size_t cmask = width_to_mask (cwidth);
				1287	size_t nbwc = width / cwidth;
				1288	size_t off, i;
				1289	cppchar_t result = 0, c;
				1290
				1291	/* This is finicky because the string is in the target's byte order,
				1292	which may not be our byte order. Only the last character, ignoring
				1293	the NUL terminator, is relevant. */
				1294	off = str.len - (nbwc * 2);
				1295	result = 0;
				1296	for (i = 0; i < nbwc; i++)
				1297	{
				1298	c = bigend ? str.text[off + i] : str.text[off + nbwc - i - 1];
				1299	result = (result << cwidth) \| (c & cmask);
				1300	}
				1301
				1302	/* Wide character constants have type wchar_t, and a single
				1303	character exactly fills a wchar_t, so a multi-character wide
				1304	character constant is guaranteed to overflow. */
				1305	if (off > 0)
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1306	cpp_error (pfile, CPP_DL_WARNING,
				1307	"character constant too long for its type");
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1308
				1309	/* Truncate the constant to its natural width, and simultaneously
				1310	sign- or zero-extend to the full width of cppchar_t. */
				1311	if (width < BITS_PER_CPPCHAR_T)
				1312	{
				1313	if (CPP_OPTION (pfile, unsigned_wchar) \|\| !(result & (1 << (width - 1))))
				1314	result &= mask;
				1315	else
				1316	result \|= ~mask;
				1317	}
				1318
				1319	*unsignedp = CPP_OPTION (pfile, unsigned_wchar);
				1320	*pchars_seen = 1;
				1321	return result;
				1322	}
				1323
				1324	/* Interpret a (possibly wide) character constant in TOKEN.
				1325	PCHARS_SEEN points to a variable that is filled in with the number
				1326	of characters seen, and UNSIGNEDP to a variable that indicates
				1327	whether the result has signed type. */
				1328	cppchar_t
				1329	cpp_interpret_charconst (cpp_reader pfile, const cpp_token token,
				1330	unsigned int pchars_seen, int unsignedp)
				1331	{
				1332	cpp_string str = { 0, 0 };
				1333	bool wide = (token->type == CPP_WCHAR);
				1334	cppchar_t result;
				1335
				1336	/* an empty constant will appear as L'' or '' */
				1337	if (token->val.str.len == (size_t) (2 + wide))
				1338	{
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1339	cpp_error (pfile, CPP_DL_ERROR, "empty character constant");
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1340	return 0;
				1341	}
				1342	else if (!cpp_interpret_string (pfile, &token->val.str, 1, &str, wide))
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1343	return 0;
				1344
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1345	if (wide)
				1346	result = wide_str_to_charconst (pfile, str, pchars_seen, unsignedp);
				1347	else
				1348	result = narrow_str_to_charconst (pfile, str, pchars_seen, unsignedp);
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1349
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1350	if (str.text != token->val.str.text)
				1351	free ((void *)str.text);
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1352
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1353	return result;
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1354	}
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	1355
				1356	uchar *
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	1357	_cpp_convert_input (cpp_reader pfile, const char input_charset,
				1358	uchar input, size_t size, size_t len, off_t st_size)
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	1359	{
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	1360	struct cset_converter input_cset;
				1361	struct _cpp_strbuf to;
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	1362
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	1363	input_cset = init_iconv_desc (pfile, SOURCE_CHARSET, input_charset);
				1364	if (input_cset.func == convert_no_conversion)
				1365	{
				1366	to.text = input;
				1367	to.asize = size;
				1368	to.len = len;
				1369	}
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	1370	else
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	1371	{
				1372	to.asize = MAX (65536, len);
				1373	to.text = xmalloc (to.asize);
				1374	to.len = 0;
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	1375
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	1376	if (!APPLY_CONVERSION (input_cset, input, len, &to))
				1377	cpp_error (pfile, CPP_DL_ERROR,
				1378	"failure to convert %s to %s",
				1379	CPP_OPTION (pfile, input_charset), SOURCE_CHARSET);
				1380
				1381	free (input);
				1382	}
				1383
				1384	/* Clean up the mess. */
				1385	if (input_cset.func == convert_using_iconv)
				1386	iconv_close (input_cset.cd);
				1387
				1388	/* Resize buffer if we allocated substantially too much, or if we
				1389	haven't enough space for the \n-terminator. */
				1390	if (to.len + 4096 < to.asize \|\| to.len >= to.asize)
				1391	to.text = xrealloc (to.text, to.len + 1);
				1392
				1393	to.text[to.len] = '\n';
				1394	*st_size = to.len;
				1395	return to.text;
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	1396	}
				1397
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	1398	const char *
				1399	_cpp_default_encoding (void)
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	1400	{
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	1401	const char *current_encoding = NULL;
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	1402
Paolo Bonzini	4f4e53dd	2004-05-24 10:50:45 +0000	[diff] [blame]	1403	/* We disable this because the default codeset is 7-bit ASCII on
				1404	most platforms, and this causes conversion failures on every
				1405	file in GCC that happens to have one of the upper 128 characters
				1406	in it -- most likely, as part of the name of a contributor.
				1407	We should definitely recognize in-band markers of file encoding,
				1408	like:
				1409	- the appropriate Unicode byte-order mark (FE FF) to recognize
				1410	UTF16 and UCS4 (in both big-endian and little-endian flavors)
				1411	and UTF8
Zack Weinberg	c6e8380	2004-06-05 20:58:06 +0000	[diff] [blame^]	1412	- a "#i", "#d", "/ *", "//", " #p" or "#p" (for #pragma) to
Paolo Bonzini	4f4e53dd	2004-05-24 10:50:45 +0000	[diff] [blame]	1413	distinguish ASCII and EBCDIC.
				1414	- now we can parse something like "#pragma GCC encoding <xyz>
				1415	on the first line, or even Emacs/VIM's mode line tags (there's
				1416	a problem here in that VIM uses the last line, and Emacs has
				1417	its more elaborate "Local variables:" convention).
				1418	- investigate whether Java has another common convention, which
				1419	would be friendly to support.
				1420	(Zack Weinberg and Paolo Bonzini, May 20th 2004) */
				1421	#if defined (HAVE_LOCALE_H) && defined (HAVE_LANGINFO_CODESET) && 0
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	1422	setlocale (LC_CTYPE, "");
				1423	current_encoding = nl_langinfo (CODESET);
				1424	#endif
				1425	if (current_encoding == NULL \|\| *current_encoding == '\0')
				1426	current_encoding = SOURCE_CHARSET;
				1427
				1428	return current_encoding;
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	1429	}