Blame - libcpp/charset.c - toolchain/gcc

blob: cd25f10a2e69fae2f969a6e2fc0a54d37547a2c4 [file] [log] [blame]

Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1	/* CPP Library - charsets
Kazu Hirata	d9221e01	2004-01-21 20:40:04 +0000	[diff] [blame]	2	Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	3	Free Software Foundation, Inc.
				4
				5	Broken out of c-lex.c Apr 2003, adding valid C99 UCN ranges.
				6
				7	This program is free software; you can redistribute it and/or modify it
				8	under the terms of the GNU General Public License as published by the
				9	Free Software Foundation; either version 2, or (at your option) any
				10	later version.
				11
				12	This program is distributed in the hope that it will be useful,
				13	but WITHOUT ANY WARRANTY; without even the implied warranty of
				14	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
				15	GNU General Public License for more details.
				16
				17	You should have received a copy of the GNU General Public License
				18	along with this program; if not, write to the Free Software
				19	Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
				20
				21	#include "config.h"
				22	#include "system.h"
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	23	#include "cpplib.h"
Paolo Bonzini	4f4e53dd	2004-05-24 10:50:45 +0000	[diff] [blame]	24	#include "internal.h"
				25	#include "ucnid.h"
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	26
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	27	/* Character set handling for C-family languages.
				28
				29	Terminological note: In what follows, "charset" or "character set"
				30	will be taken to mean both an abstract set of characters and an
				31	encoding for that set.
				32
				33	The C99 standard discusses two character sets: source and execution.
				34	The source character set is used for internal processing in translation
				35	phases 1 through 4; the execution character set is used thereafter.
				36	Both are required by 5.2.1.2p1 to be multibyte encodings, not wide
				37	character encodings (see 3.7.2, 3.7.3 for the standardese meanings
				38	of these terms). Furthermore, the "basic character set" (listed in
				39	5.2.1p3) is to be encoded in each with values one byte wide, and is
				40	to appear in the initial shift state.
				41
				42	It is not explicitly mentioned, but there is also a "wide execution
				43	character set" used to encode wide character constants and wide
				44	string literals; this is supposed to be the result of applying the
				45	standard library function mbstowcs() to an equivalent narrow string
				46	(6.4.5p5). However, the behavior of hexadecimal and octal
				47	\-escapes is at odds with this; they are supposed to be translated
				48	directly to wchar_t values (6.4.4.4p5,6).
				49
				50	The source character set is not necessarily the character set used
				51	to encode physical source files on disk; translation phase 1 converts
				52	from whatever that encoding is to the source character set.
				53
				54	The presence of universal character names in C99 (6.4.3 et seq.)
				55	forces the source character set to be isomorphic to ISO 10646,
				56	that is, Unicode. There is no such constraint on the execution
				57	character set; note also that the conversion from source to
				58	execution character set does not occur for identifiers (5.1.1.2p1#5).
				59
				60	For convenience of implementation, the source character set's
				61	encoding of the basic character set should be identical to the
				62	execution character set OF THE HOST SYSTEM's encoding of the basic
				63	character set, and it should not be a state-dependent encoding.
				64
				65	cpplib uses UTF-8 or UTF-EBCDIC for the source character set,
				66	depending on whether the host is based on ASCII or EBCDIC (see
				67	respectively Unicode section 2.3/ISO10646 Amendment 2, and Unicode
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	68	Technical Report #16). With limited exceptions, it relies on the
				69	system library's iconv() primitive to do charset conversion
				70	(specified in SUSv2). */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	71
				72	#if !HAVE_ICONV
				73	/* Make certain that the uses of iconv(), iconv_open(), iconv_close()
				74	below, which are guarded only by if statements with compile-time
				75	constant conditions, do not cause link errors. */
				76	#define iconv_open(x, y) (errno = EINVAL, (iconv_t)-1)
Zack Weinberg	f1c4bc4	2003-07-05 16:44:29 +0200	[diff] [blame]	77	#define iconv(a,b,c,d,e) (errno = EINVAL, (size_t)-1)
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	78	#define iconv_close(x) (void)0
Andrew Pinski	5beadb3	2003-07-07 04:46:29 +0000	[diff] [blame]	79	#define ICONV_CONST
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	80	#endif
				81
				82	#if HOST_CHARSET == HOST_CHARSET_ASCII
				83	#define SOURCE_CHARSET "UTF-8"
Zack Weinberg	c5ff069	2005-02-20 17:01:32 +0000	[diff] [blame]	84	#define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0x7e
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	85	#elif HOST_CHARSET == HOST_CHARSET_EBCDIC
				86	#define SOURCE_CHARSET "UTF-EBCDIC"
Zack Weinberg	c5ff069	2005-02-20 17:01:32 +0000	[diff] [blame]	87	#define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0xFF
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	88	#else
				89	#error "Unrecognized basic host character set"
				90	#endif
				91
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	92	#ifndef EILSEQ
				93	#define EILSEQ EINVAL
				94	#endif
				95
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	96	/* This structure is used for a resizable string buffer throughout. */
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	97	/* Don't call it strbuf, as that conflicts with unistd.h on systems
Kazu Hirata	0ee55ad	2003-10-05 13:09:48 +0000	[diff] [blame]	98	such as DYNIX/ptx where unistd.h includes stropts.h. */
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	99	struct _cpp_strbuf
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	100	{
				101	uchar *text;
				102	size_t asize;
				103	size_t len;
				104	};
				105
				106	/* This is enough to hold any string that fits on a single 80-column
				107	line, even if iconv quadruples its size (e.g. conversion from
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	108	ASCII to UTF-32) rounded up to a power of two. */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	109	#define OUTBUF_BLOCK_SIZE 256
				110
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	111	/* Conversions between UTF-8 and UTF-16/32 are implemented by custom
				112	logic. This is because a depressing number of systems lack iconv,
				113	or have iconv libraries that do not do these conversions, so
				114	we need a fallback implementation for them. To ensure the fallback
				115	doesn't break due to neglect, it is used on all systems.
				116
				117	UTF-32 encoding is nice and simple: a four-byte binary number,
				118	constrained to the range 00000000-7FFFFFFF to avoid questions of
				119	signedness. We do have to cope with big- and little-endian
				120	variants.
				121
				122	UTF-16 encoding uses two-byte binary numbers, again in big- and
				123	little-endian variants, for all values in the 00000000-0000FFFF
				124	range. Values in the 00010000-0010FFFF range are encoded as pairs
				125	of two-byte numbers, called "surrogate pairs": given a number S in
				126	this range, it is mapped to a pair (H, L) as follows:
				127
				128	H = (S - 0x10000) / 0x400 + 0xD800
				129	L = (S - 0x10000) % 0x400 + 0xDC00
				130
				131	Two-byte values in the D800...DFFF range are ill-formed except as a
				132	component of a surrogate pair. Even if the encoding within a
				133	two-byte value is little-endian, the H member of the surrogate pair
				134	comes first.
				135
				136	There is no way to encode values in the 00110000-7FFFFFFF range,
				137	which is not currently a problem as there are no assigned code
				138	points in that range; however, the author expects that it will
				139	eventually become necessary to abandon UTF-16 due to this
				140	limitation. Note also that, because of these pairs, UTF-16 does
				141	not meet the requirements of the C standard for a wide character
				142	encoding (see 3.7.3 and 6.4.4.4p11).
				143
				144	UTF-8 encoding looks like this:
				145
				146	value range encoded as
				147	00000000-0000007F 0xxxxxxx
				148	00000080-000007FF 110xxxxx 10xxxxxx
				149	00000800-0000FFFF 1110xxxx 10xxxxxx 10xxxxxx
				150	00010000-001FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
				151	00200000-03FFFFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
				152	04000000-7FFFFFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
				153
				154	Values in the 0000D800 ... 0000DFFF range (surrogates) are invalid,
				155	which means that three-byte sequences ED xx yy, with A0 <= xx <= BF,
				156	never occur. Note also that any value that can be encoded by a
				157	given row of the table can also be encoded by all successive rows,
				158	but this is not done; only the shortest possible encoding for any
				159	given value is valid. For instance, the character 07C0 could be
				160	encoded as any of DF 80, E0 9F 80, F0 80 9F 80, F8 80 80 9F 80, or
				161	FC 80 80 80 9F 80. Only the first is valid.
				162
				163	An implementation note: the transformation from UTF-16 to UTF-8, or
				164	vice versa, is easiest done by using UTF-32 as an intermediary. */
				165
				166	/* Internal primitives which go from an UTF-8 byte stream to native-endian
				167	UTF-32 in a cppchar_t, or vice versa; this avoids an extra marshal/unmarshal
				168	operation in several places below. */
				169	static inline int
				170	one_utf8_to_cppchar (const uchar *inbufp, size_t inbytesleftp,
				171	cppchar_t *cp)
				172	{
				173	static const uchar masks[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x02, 0x01 };
				174	static const uchar patns[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	175
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	176	cppchar_t c;
				177	const uchar inbuf = inbufp;
				178	size_t nbytes, i;
				179
				180	if (*inbytesleftp < 1)
				181	return EINVAL;
				182
				183	c = *inbuf;
				184	if (c < 0x80)
				185	{
				186	*cp = c;
				187	*inbytesleftp -= 1;
				188	*inbufp += 1;
				189	return 0;
				190	}
				191
				192	/* The number of leading 1-bits in the first byte indicates how many
				193	bytes follow. */
				194	for (nbytes = 2; nbytes < 7; nbytes++)
				195	if ((c & ~masks[nbytes-1]) == patns[nbytes-1])
				196	goto found;
				197	return EILSEQ;
				198	found:
				199
				200	if (*inbytesleftp < nbytes)
				201	return EINVAL;
				202
				203	c = (c & masks[nbytes-1]);
				204	inbuf++;
				205	for (i = 1; i < nbytes; i++)
				206	{
				207	cppchar_t n = *inbuf++;
				208	if ((n & 0xC0) != 0x80)
				209	return EILSEQ;
				210	c = ((c << 6) + (n & 0x3F));
				211	}
				212
				213	/* Make sure the shortest possible encoding was used. */
				214	if (c <= 0x7F && nbytes > 1) return EILSEQ;
				215	if (c <= 0x7FF && nbytes > 2) return EILSEQ;
				216	if (c <= 0xFFFF && nbytes > 3) return EILSEQ;
				217	if (c <= 0x1FFFFF && nbytes > 4) return EILSEQ;
				218	if (c <= 0x3FFFFFF && nbytes > 5) return EILSEQ;
				219
				220	/* Make sure the character is valid. */
				221	if (c > 0x7FFFFFFF \|\| (c >= 0xD800 && c <= 0xDFFF)) return EILSEQ;
				222
				223	*cp = c;
				224	*inbufp = inbuf;
				225	*inbytesleftp -= nbytes;
				226	return 0;
				227	}
				228
				229	static inline int
				230	one_cppchar_to_utf8 (cppchar_t c, uchar *outbufp, size_t outbytesleftp)
				231	{
				232	static const uchar masks[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
				233	static const uchar limits[6] = { 0x80, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE };
				234	size_t nbytes;
				235	uchar buf[6], *p = &buf[6];
				236	uchar outbuf = outbufp;
				237
				238	nbytes = 1;
				239	if (c < 0x80)
				240	*--p = c;
				241	else
				242	{
				243	do
				244	{
				245	*--p = ((c & 0x3F) \| 0x80);
				246	c >>= 6;
				247	nbytes++;
				248	}
				249	while (c >= 0x3F \|\| (c & limits[nbytes-1]));
				250	*--p = (c \| masks[nbytes-1]);
				251	}
				252
				253	if (*outbytesleftp < nbytes)
				254	return E2BIG;
				255
				256	while (p < &buf[6])
				257	outbuf++ = p++;
				258	*outbytesleftp -= nbytes;
				259	*outbufp = outbuf;
				260	return 0;
				261	}
				262
				263	/* The following four functions transform one character between the two
				264	encodings named in the function name. All have the signature
				265	int (*)(iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
				266	uchar **outbufp, size_t *outbytesleftp)
				267
				268	BIGEND must have the value 0 or 1, coerced to (iconv_t); it is
				269	interpreted as a boolean indicating whether big-endian or
				270	little-endian encoding is to be used for the member of the pair
				271	that is not UTF-8.
				272
				273	INBUFP, INBYTESLEFTP, OUTBUFP, OUTBYTESLEFTP work exactly as they
				274	do for iconv.
				275
				276	The return value is either 0 for success, or an errno value for
				277	failure, which may be E2BIG (need more space), EILSEQ (ill-formed
				278	input sequence), or EINVAL (incomplete input sequence). */
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	279
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	280	static inline int
				281	one_utf8_to_utf32 (iconv_t bigend, const uchar *inbufp, size_t inbytesleftp,
				282	uchar *outbufp, size_t outbytesleftp)
				283	{
				284	uchar *outbuf;
Jan Hubicka	bd18496	2003-10-25 17:12:01 +0200	[diff] [blame]	285	cppchar_t s = 0;
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	286	int rval;
				287
				288	/* Check for space first, since we know exactly how much we need. */
				289	if (*outbytesleftp < 4)
				290	return E2BIG;
				291
				292	rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
				293	if (rval)
				294	return rval;
				295
				296	outbuf = *outbufp;
				297	outbuf[bigend ? 3 : 0] = (s & 0x000000FF);
				298	outbuf[bigend ? 2 : 1] = (s & 0x0000FF00) >> 8;
				299	outbuf[bigend ? 1 : 2] = (s & 0x00FF0000) >> 16;
				300	outbuf[bigend ? 0 : 3] = (s & 0xFF000000) >> 24;
				301
				302	*outbufp += 4;
				303	*outbytesleftp -= 4;
				304	return 0;
				305	}
				306
				307	static inline int
				308	one_utf32_to_utf8 (iconv_t bigend, const uchar *inbufp, size_t inbytesleftp,
				309	uchar *outbufp, size_t outbytesleftp)
				310	{
				311	cppchar_t s;
				312	int rval;
				313	const uchar *inbuf;
				314
				315	if (*inbytesleftp < 4)
				316	return EINVAL;
				317
				318	inbuf = *inbufp;
				319
				320	s = inbuf[bigend ? 0 : 3] << 24;
				321	s += inbuf[bigend ? 1 : 2] << 16;
				322	s += inbuf[bigend ? 2 : 1] << 8;
				323	s += inbuf[bigend ? 3 : 0];
				324
				325	if (s >= 0x7FFFFFFF \|\| (s >= 0xD800 && s <= 0xDFFF))
				326	return EILSEQ;
				327
				328	rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
				329	if (rval)
				330	return rval;
				331
				332	*inbufp += 4;
				333	*inbytesleftp -= 4;
				334	return 0;
				335	}
				336
				337	static inline int
				338	one_utf8_to_utf16 (iconv_t bigend, const uchar *inbufp, size_t inbytesleftp,
				339	uchar *outbufp, size_t outbytesleftp)
				340	{
				341	int rval;
Richard Henderson	671ca9e	2003-10-30 08:36:27 -0800	[diff] [blame]	342	cppchar_t s = 0;
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	343	const uchar save_inbuf = inbufp;
				344	size_t save_inbytesleft = *inbytesleftp;
				345	uchar outbuf = outbufp;
				346
				347	rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
				348	if (rval)
				349	return rval;
				350
				351	if (s > 0x0010FFFF)
				352	{
				353	*inbufp = save_inbuf;
				354	*inbytesleftp = save_inbytesleft;
				355	return EILSEQ;
				356	}
				357
				358	if (s < 0xFFFF)
				359	{
				360	if (*outbytesleftp < 2)
				361	{
				362	*inbufp = save_inbuf;
				363	*inbytesleftp = save_inbytesleft;
				364	return E2BIG;
				365	}
				366	outbuf[bigend ? 1 : 0] = (s & 0x00FF);
				367	outbuf[bigend ? 0 : 1] = (s & 0xFF00) >> 8;
				368
				369	*outbufp += 2;
				370	*outbytesleftp -= 2;
				371	return 0;
				372	}
				373	else
				374	{
				375	cppchar_t hi, lo;
				376
				377	if (*outbytesleftp < 4)
				378	{
				379	*inbufp = save_inbuf;
				380	*inbytesleftp = save_inbytesleft;
				381	return E2BIG;
				382	}
				383
				384	hi = (s - 0x10000) / 0x400 + 0xD800;
				385	lo = (s - 0x10000) % 0x400 + 0xDC00;
				386
				387	/* Even if we are little-endian, put the high surrogate first.
				388	??? Matches practice? */
				389	outbuf[bigend ? 1 : 0] = (hi & 0x00FF);
				390	outbuf[bigend ? 0 : 1] = (hi & 0xFF00) >> 8;
				391	outbuf[bigend ? 3 : 2] = (lo & 0x00FF);
				392	outbuf[bigend ? 2 : 3] = (lo & 0xFF00) >> 8;
				393
				394	*outbufp += 4;
				395	*outbytesleftp -= 4;
				396	return 0;
				397	}
				398	}
				399
				400	static inline int
				401	one_utf16_to_utf8 (iconv_t bigend, const uchar *inbufp, size_t inbytesleftp,
				402	uchar *outbufp, size_t outbytesleftp)
				403	{
				404	cppchar_t s;
				405	const uchar inbuf = inbufp;
				406	int rval;
				407
				408	if (*inbytesleftp < 2)
				409	return EINVAL;
				410	s = inbuf[bigend ? 0 : 1] << 8;
				411	s += inbuf[bigend ? 1 : 0];
				412
				413	/* Low surrogate without immediately preceding high surrogate is invalid. */
				414	if (s >= 0xDC00 && s <= 0xDFFF)
				415	return EILSEQ;
				416	/* High surrogate must have a following low surrogate. */
				417	else if (s >= 0xD800 && s <= 0xDBFF)
				418	{
				419	cppchar_t hi = s, lo;
				420	if (*inbytesleftp < 4)
				421	return EINVAL;
				422
				423	lo = inbuf[bigend ? 2 : 3] << 8;
				424	lo += inbuf[bigend ? 3 : 2];
				425
				426	if (lo < 0xDC00 \|\| lo > 0xDFFF)
				427	return EILSEQ;
				428
				429	s = (hi - 0xD800) * 0x400 + (lo - 0xDC00) + 0x10000;
				430	}
				431
				432	rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
				433	if (rval)
				434	return rval;
				435
				436	/* Success - update the input pointers (one_cppchar_to_utf8 has done
				437	the output pointers for us). */
				438	if (s <= 0xFFFF)
				439	{
				440	*inbufp += 2;
				441	*inbytesleftp -= 2;
				442	}
				443	else
				444	{
				445	*inbufp += 4;
				446	*inbytesleftp -= 4;
				447	}
				448	return 0;
				449	}
				450
				451	/* Helper routine for the next few functions. The 'const' on
				452	one_conversion means that we promise not to modify what function is
Kazu Hirata	4ed4321	2003-07-12 22:49:48 +0000	[diff] [blame]	453	pointed to, which lets the inliner see through it. */
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	454
				455	static inline bool
				456	conversion_loop (int (const one_conversion)(iconv_t, const uchar , size_t ,
				457	uchar *, size_t ),
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	458	iconv_t cd, const uchar from, size_t flen, struct _cpp_strbuf to)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	459	{
				460	const uchar *inbuf;
				461	uchar *outbuf;
				462	size_t inbytesleft, outbytesleft;
				463	int rval;
				464
				465	inbuf = from;
				466	inbytesleft = flen;
				467	outbuf = to->text + to->len;
				468	outbytesleft = to->asize - to->len;
				469
				470	for (;;)
				471	{
				472	do
				473	rval = one_conversion (cd, &inbuf, &inbytesleft,
				474	&outbuf, &outbytesleft);
				475	while (inbytesleft && !rval);
				476
				477	if (__builtin_expect (inbytesleft == 0, 1))
				478	{
				479	to->len = to->asize - outbytesleft;
				480	return true;
				481	}
				482	if (rval != E2BIG)
				483	{
				484	errno = rval;
				485	return false;
				486	}
				487
				488	outbytesleft += OUTBUF_BLOCK_SIZE;
				489	to->asize += OUTBUF_BLOCK_SIZE;
				490	to->text = xrealloc (to->text, to->asize);
				491	outbuf = to->text + to->asize - outbytesleft;
				492	}
				493	}
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	494
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	495
				496	/* These functions convert entire strings between character sets.
				497	They all have the signature
				498
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	499	bool (*)(iconv_t cd, const uchar *from, size_t flen, struct _cpp_strbuf *to);
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	500
				501	The input string FROM is converted as specified by the function
				502	name plus the iconv descriptor CD (which may be fake), and the
				503	result appended to TO. On any error, false is returned, otherwise true. */
				504
				505	/* These four use the custom conversion code above. */
				506	static bool
				507	convert_utf8_utf16 (iconv_t cd, const uchar *from, size_t flen,
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	508	struct _cpp_strbuf *to)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	509	{
				510	return conversion_loop (one_utf8_to_utf16, cd, from, flen, to);
				511	}
				512
				513	static bool
				514	convert_utf8_utf32 (iconv_t cd, const uchar *from, size_t flen,
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	515	struct _cpp_strbuf *to)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	516	{
				517	return conversion_loop (one_utf8_to_utf32, cd, from, flen, to);
				518	}
				519
				520	static bool
				521	convert_utf16_utf8 (iconv_t cd, const uchar *from, size_t flen,
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	522	struct _cpp_strbuf *to)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	523	{
				524	return conversion_loop (one_utf16_to_utf8, cd, from, flen, to);
				525	}
				526
				527	static bool
				528	convert_utf32_utf8 (iconv_t cd, const uchar *from, size_t flen,
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	529	struct _cpp_strbuf *to)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	530	{
				531	return conversion_loop (one_utf32_to_utf8, cd, from, flen, to);
				532	}
				533
				534	/* Identity conversion, used when we have no alternative. */
				535	static bool
				536	convert_no_conversion (iconv_t cd ATTRIBUTE_UNUSED,
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	537	const uchar from, size_t flen, struct _cpp_strbuf to)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	538	{
				539	if (to->len + flen > to->asize)
				540	{
				541	to->asize = to->len + flen;
				542	to->text = xrealloc (to->text, to->asize);
				543	}
				544	memcpy (to->text + to->len, from, flen);
				545	to->len += flen;
				546	return true;
				547	}
				548
				549	/* And this one uses the system iconv primitive. It's a little
				550	different, since iconv's interface is a little different. */
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	551	#if HAVE_ICONV
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	552	static bool
				553	convert_using_iconv (iconv_t cd, const uchar *from, size_t flen,
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	554	struct _cpp_strbuf *to)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	555	{
				556	ICONV_CONST char *inbuf;
				557	char *outbuf;
				558	size_t inbytesleft, outbytesleft;
				559
				560	/* Reset conversion descriptor and check that it is valid. */
				561	if (iconv (cd, 0, 0, 0, 0) == (size_t)-1)
				562	return false;
				563
				564	inbuf = (ICONV_CONST char *)from;
				565	inbytesleft = flen;
				566	outbuf = (char *)to->text + to->len;
				567	outbytesleft = to->asize - to->len;
				568
				569	for (;;)
				570	{
				571	iconv (cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
				572	if (__builtin_expect (inbytesleft == 0, 1))
				573	{
				574	to->len = to->asize - outbytesleft;
				575	return true;
				576	}
				577	if (errno != E2BIG)
				578	return false;
				579
				580	outbytesleft += OUTBUF_BLOCK_SIZE;
				581	to->asize += OUTBUF_BLOCK_SIZE;
				582	to->text = xrealloc (to->text, to->asize);
				583	outbuf = (char *)to->text + to->asize - outbytesleft;
				584	}
				585	}
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	586	#else
				587	#define convert_using_iconv 0 /* prevent undefined symbol error below */
				588	#endif
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	589
				590	/* Arrange for the above custom conversion logic to be used automatically
				591	when conversion between a suitable pair of character sets is requested. */
				592
				593	#define APPLY_CONVERSION(CONVERTER, FROM, FLEN, TO) \
				594	CONVERTER.func (CONVERTER.cd, FROM, FLEN, TO)
				595
				596	struct conversion
				597	{
				598	const char *pair;
				599	convert_f func;
				600	iconv_t fake_cd;
				601	};
				602	static const struct conversion conversion_tab[] = {
				603	{ "UTF-8/UTF-32LE", convert_utf8_utf32, (iconv_t)0 },
				604	{ "UTF-8/UTF-32BE", convert_utf8_utf32, (iconv_t)1 },
				605	{ "UTF-8/UTF-16LE", convert_utf8_utf16, (iconv_t)0 },
				606	{ "UTF-8/UTF-16BE", convert_utf8_utf16, (iconv_t)1 },
				607	{ "UTF-32LE/UTF-8", convert_utf32_utf8, (iconv_t)0 },
				608	{ "UTF-32BE/UTF-8", convert_utf32_utf8, (iconv_t)1 },
				609	{ "UTF-16LE/UTF-8", convert_utf16_utf8, (iconv_t)0 },
				610	{ "UTF-16BE/UTF-8", convert_utf16_utf8, (iconv_t)1 },
				611	};
				612
				613	/* Subroutine of cpp_init_iconv: initialize and return a
				614	cset_converter structure for conversion from FROM to TO. If
				615	iconv_open() fails, issue an error and return an identity
				616	converter. Silently return an identity converter if FROM and TO
				617	are identical. */
				618	static struct cset_converter
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	619	init_iconv_desc (cpp_reader pfile, const char to, const char *from)
				620	{
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	621	struct cset_converter ret;
				622	char *pair;
				623	size_t i;
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	624
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	625	if (!strcasecmp (to, from))
				626	{
				627	ret.func = convert_no_conversion;
				628	ret.cd = (iconv_t) -1;
				629	return ret;
				630	}
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	631
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	632	pair = alloca(strlen(to) + strlen(from) + 2);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	633
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	634	strcpy(pair, from);
				635	strcat(pair, "/");
				636	strcat(pair, to);
				637	for (i = 0; i < ARRAY_SIZE (conversion_tab); i++)
				638	if (!strcasecmp (pair, conversion_tab[i].pair))
				639	{
				640	ret.func = conversion_tab[i].func;
				641	ret.cd = conversion_tab[i].fake_cd;
				642	return ret;
				643	}
				644
				645	/* No custom converter - try iconv. */
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	646	if (HAVE_ICONV)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	647	{
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	648	ret.func = convert_using_iconv;
				649	ret.cd = iconv_open (to, from);
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	650
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	651	if (ret.cd == (iconv_t) -1)
				652	{
				653	if (errno == EINVAL)
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	654	cpp_error (pfile, CPP_DL_ERROR, /* FIXME should be DL_SORRY */
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	655	"conversion from %s to %s not supported by iconv",
				656	from, to);
				657	else
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	658	cpp_errno (pfile, CPP_DL_ERROR, "iconv_open");
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	659
				660	ret.func = convert_no_conversion;
				661	}
				662	}
				663	else
				664	{
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	665	cpp_error (pfile, CPP_DL_ERROR, /* FIXME: should be DL_SORRY */
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	666	"no iconv implementation, cannot convert from %s to %s",
				667	from, to);
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	668	ret.func = convert_no_conversion;
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	669	ret.cd = (iconv_t) -1;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	670	}
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	671	return ret;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	672	}
				673
				674	/* If charset conversion is requested, initialize iconv(3) descriptors
				675	for conversion from the source character set to the execution
				676	character sets. If iconv is not present in the C library, and
				677	conversion is requested, issue an error. */
				678
				679	void
				680	cpp_init_iconv (cpp_reader *pfile)
				681	{
				682	const char *ncset = CPP_OPTION (pfile, narrow_charset);
				683	const char *wcset = CPP_OPTION (pfile, wide_charset);
				684	const char *default_wcset;
				685
				686	bool be = CPP_OPTION (pfile, bytes_big_endian);
				687
				688	if (CPP_OPTION (pfile, wchar_precision) >= 32)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	689	default_wcset = be ? "UTF-32BE" : "UTF-32LE";
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	690	else if (CPP_OPTION (pfile, wchar_precision) >= 16)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	691	default_wcset = be ? "UTF-16BE" : "UTF-16LE";
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	692	else
				693	/* This effectively means that wide strings are not supported,
				694	so don't do any conversion at all. */
				695	default_wcset = SOURCE_CHARSET;
				696
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	697	if (!ncset)
				698	ncset = SOURCE_CHARSET;
				699	if (!wcset)
				700	wcset = default_wcset;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	701
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	702	pfile->narrow_cset_desc = init_iconv_desc (pfile, ncset, SOURCE_CHARSET);
				703	pfile->wide_cset_desc = init_iconv_desc (pfile, wcset, SOURCE_CHARSET);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	704	}
				705
Zack Weinberg	a29f62d	2004-09-18 00:56:19 +0000	[diff] [blame]	706	/* Destroy iconv(3) descriptors set up by cpp_init_iconv, if necessary. */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	707	void
				708	_cpp_destroy_iconv (cpp_reader *pfile)
				709	{
				710	if (HAVE_ICONV)
				711	{
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	712	if (pfile->narrow_cset_desc.func == convert_using_iconv)
				713	iconv_close (pfile->narrow_cset_desc.cd);
				714	if (pfile->wide_cset_desc.func == convert_using_iconv)
				715	iconv_close (pfile->wide_cset_desc.cd);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	716	}
				717	}
				718
Zack Weinberg	c5ff069	2005-02-20 17:01:32 +0000	[diff] [blame]	719	/* Utility routine for use by a full compiler. C is a character taken
				720	from the basic source character set, encoded in the host's
				721	execution encoding. Convert it to (the target's) execution
				722	encoding, and return that value.
				723
				724	Issues an internal error if C's representation in the narrow
				725	execution character set fails to be a single-byte value (C99
				726	5.2.1p3: "The representation of each member of the source and
				727	execution character sets shall fit in a byte.") May also issue an
				728	internal error if C fails to be a member of the basic source
				729	character set (testing this exactly is too hard, especially when
				730	the host character set is EBCDIC). */
				731	cppchar_t
				732	cpp_host_to_exec_charset (cpp_reader *pfile, cppchar_t c)
				733	{
				734	uchar sbuf[1];
				735	struct _cpp_strbuf tbuf;
				736
				737	/* This test is merely an approximation, but it suffices to catch
				738	the most important thing, which is that we don't get handed a
				739	character outside the unibyte range of the host character set. */
				740	if (c > LAST_POSSIBLY_BASIC_SOURCE_CHAR)
				741	{
				742	cpp_error (pfile, CPP_DL_ICE,
				743	"character 0x%lx is not in the basic source character set\n",
				744	(unsigned long)c);
				745	return 0;
				746	}
				747
				748	/* Being a character in the unibyte range of the host character set,
				749	we can safely splat it into a one-byte buffer and trust that that
				750	is a well-formed string. */
				751	sbuf[0] = c;
				752
				753	/* This should never need to reallocate, but just in case... */
				754	tbuf.asize = 1;
				755	tbuf.text = xmalloc (tbuf.asize);
				756	tbuf.len = 0;
				757
				758	if (!APPLY_CONVERSION (pfile->narrow_cset_desc, sbuf, 1, &tbuf))
				759	{
				760	cpp_errno (pfile, CPP_DL_ICE, "converting to execution character set");
				761	return 0;
				762	}
				763	if (tbuf.len != 1)
				764	{
				765	cpp_error (pfile, CPP_DL_ICE,
				766	"character 0x%lx is not unibyte in execution character set",
				767	(unsigned long)c);
				768	return 0;
				769	}
				770	c = tbuf.text[0];
				771	free(tbuf.text);
				772	return c;
				773	}
				774
				775
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	776
				777	/* Utility routine that computes a mask of the form 0000...111... with
				778	WIDTH 1-bits. */
				779	static inline size_t
				780	width_to_mask (size_t width)
				781	{
				782	width = MIN (width, BITS_PER_CPPCHAR_T);
				783	if (width >= CHAR_BIT * sizeof (size_t))
				784	return ~(size_t) 0;
				785	else
				786	return ((size_t) 1 << width) - 1;
				787	}
				788
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	789	/* Returns 1 if C is valid in an identifier, 2 if C is valid except at
				790	the start of an identifier, and 0 if C is not valid in an
				791	identifier. We assume C has already gone through the checks of
				792	_cpp_valid_ucn. The algorithm is a simple binary search on the
				793	table defined in cppucnid.h. */
				794
				795	static int
				796	ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c)
				797	{
				798	int mn, mx, md;
				799
				800	mn = -1;
				801	mx = ARRAY_SIZE (ucnranges);
				802	while (mx - mn > 1)
				803	{
				804	md = (mn + mx) / 2;
				805	if (c < ucnranges[md].lo)
				806	mx = md;
				807	else if (c > ucnranges[md].hi)
				808	mn = md;
				809	else
				810	goto found;
				811	}
				812	return 0;
				813
				814	found:
				815	/* When -pedantic, we require the character to have been listed by
				816	the standard for the current language. Otherwise, we accept the
				817	union of the acceptable sets for C++98 and C99. */
				818	if (CPP_PEDANTIC (pfile)
				819	&& ((CPP_OPTION (pfile, c99) && !(ucnranges[md].flags & C99))
				820	\|\| (CPP_OPTION (pfile, cplusplus)
				821	&& !(ucnranges[md].flags & CXX))))
				822	return 0;
				823
				824	/* In C99, UCN digits may not begin identifiers. */
				825	if (CPP_OPTION (pfile, c99) && (ucnranges[md].flags & DIG))
				826	return 2;
				827
				828	return 1;
				829	}
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	830
				831	/* [lex.charset]: The character designated by the universal character
				832	name \UNNNNNNNN is that character whose character short name in
				833	ISO/IEC 10646 is NNNNNNNN; the character designated by the
				834	universal character name \uNNNN is that character whose character
				835	short name in ISO/IEC 10646 is 0000NNNN. If the hexadecimal value
				836	for a universal character name is less than 0x20 or in the range
				837	0x7F-0x9F (inclusive), or if the universal character name
				838	designates a character in the basic source character set, then the
				839	program is ill-formed.
				840
				841	*PSTR must be preceded by "\u" or "\U"; it is assumed that the
				842	buffer end is delimited by a non-hex digit. Returns zero if UCNs
				843	are not part of the relevant standard, or if the string beginning
				844	at *PSTR doesn't syntactically match the form 'NNNN' or 'NNNNNNNN'.
				845
Kazu Hirata	6356f89	2003-06-12 19:01:08 +0000	[diff] [blame]	846	Otherwise the nonzero value of the UCN, whether valid or invalid,
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	847	is returned. Diagnostics are emitted for invalid values. PSTR
				848	is updated to point one beyond the UCN, or to the syntactically
				849	invalid character.
				850
				851	IDENTIFIER_POS is 0 when not in an identifier, 1 for the start of
Zack Weinberg	a29f62d	2004-09-18 00:56:19 +0000	[diff] [blame]	852	an identifier, or 2 otherwise. */
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	853
				854	cppchar_t
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	855	_cpp_valid_ucn (cpp_reader pfile, const uchar *pstr,
				856	const uchar *limit, int identifier_pos)
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	857	{
				858	cppchar_t result, c;
				859	unsigned int length;
				860	const uchar str = pstr;
				861	const uchar *base = str - 2;
				862
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	863	if (!CPP_OPTION (pfile, cplusplus) && !CPP_OPTION (pfile, c99))
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	864	cpp_error (pfile, CPP_DL_WARNING,
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	865	"universal character names are only valid in C++ and C99");
				866	else if (CPP_WTRADITIONAL (pfile) && identifier_pos == 0)
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	867	cpp_error (pfile, CPP_DL_WARNING,
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	868	"the meaning of '\\%c' is different in traditional C",
				869	(int) str[-1]);
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	870
				871	if (str[-1] == 'u')
				872	length = 4;
				873	else if (str[-1] == 'U')
				874	length = 8;
				875	else
				876	abort();
				877
				878	result = 0;
				879	do
				880	{
				881	c = *str;
				882	if (!ISXDIGIT (c))
				883	break;
				884	str++;
				885	result = (result << 4) + hex_value (c);
				886	}
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	887	while (--length && str < limit);
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	888
				889	*pstr = str;
				890	if (length)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	891	{
				892	/* We'll error when we try it out as the start of an identifier. */
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	893	cpp_error (pfile, CPP_DL_ERROR,
				894	"incomplete universal character name %.*s",
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	895	(int) (str - base), base);
				896	result = 1;
				897	}
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	898	/* The standard permits $, @ and ` to be specified as UCNs. We use
				899	hex escapes so that this also works with EBCDIC hosts. */
				900	else if ((result < 0xa0
				901	&& (result != 0x24 && result != 0x40 && result != 0x60))
				902	\|\| (result & 0x80000000)
				903	\|\| (result >= 0xD800 && result <= 0xDFFF))
				904	{
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	905	cpp_error (pfile, CPP_DL_ERROR,
				906	"%.*s is not a valid universal character",
Andreas Jaeger	0e7a8c4	2003-04-21 14:06:12 +0200	[diff] [blame]	907	(int) (str - base), base);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	908	result = 1;
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	909	}
Geoffrey Keating	47e2049	2005-03-12 10:44:06 +0000	[diff] [blame]	910	else if (identifier_pos && result == 0x24
				911	&& CPP_OPTION (pfile, dollars_in_ident))
				912	{
				913	if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
				914	{
				915	CPP_OPTION (pfile, warn_dollars) = 0;
				916	cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
				917	}
				918	}
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	919	else if (identifier_pos)
				920	{
				921	int validity = ucn_valid_in_identifier (pfile, result);
				922
				923	if (validity == 0)
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	924	cpp_error (pfile, CPP_DL_ERROR,
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	925	"universal character %.*s is not valid in an identifier",
Andreas Jaeger	0e7a8c4	2003-04-21 14:06:12 +0200	[diff] [blame]	926	(int) (str - base), base);
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	927	else if (validity == 2 && identifier_pos == 1)
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	928	cpp_error (pfile, CPP_DL_ERROR,
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	929	"universal character %.*s is not valid at the start of an identifier",
Andreas Jaeger	0e7a8c4	2003-04-21 14:06:12 +0200	[diff] [blame]	930	(int) (str - base), base);
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	931	}
				932
				933	if (result == 0)
				934	result = 1;
				935
				936	return result;
				937	}
				938
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	939	/* Convert an UCN, pointed to by FROM, to UTF-8 encoding, then translate
				940	it to the execution character set and write the result into TBUF.
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	941	An advanced pointer is returned. Issues all relevant diagnostics. */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	942	static const uchar *
				943	convert_ucn (cpp_reader pfile, const uchar from, const uchar *limit,
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	944	struct _cpp_strbuf *tbuf, bool wide)
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	945	{
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	946	cppchar_t ucn;
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	947	uchar buf[6];
				948	uchar *bufp = buf;
				949	size_t bytesleft = 6;
				950	int rval;
				951	struct cset_converter cvt
				952	= wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	953
Kazu Hirata	e0a21ab	2004-01-16 01:44:06 +0000	[diff] [blame]	954	from++; /* Skip u/U. */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	955	ucn = _cpp_valid_ucn (pfile, &from, limit, 0);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	956
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	957	rval = one_cppchar_to_utf8 (ucn, &bufp, &bytesleft);
				958	if (rval)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	959	{
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	960	errno = rval;
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	961	cpp_errno (pfile, CPP_DL_ERROR,
				962	"converting UCN to source character set");
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	963	}
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	964	else if (!APPLY_CONVERSION (cvt, buf, 6 - bytesleft, tbuf))
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	965	cpp_errno (pfile, CPP_DL_ERROR,
				966	"converting UCN to execution character set");
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	967
				968	return from;
				969	}
				970
Zack Weinberg	a29f62d	2004-09-18 00:56:19 +0000	[diff] [blame]	971	/* Subroutine of convert_hex and convert_oct. N is the representation
				972	in the execution character set of a numeric escape; write it into the
				973	string buffer TBUF and update the end-of-string pointer therein. WIDE
				974	is true if it's a wide string that's being assembled in TBUF. This
				975	function issues no diagnostics and never fails. */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	976	static void
				977	emit_numeric_escape (cpp_reader *pfile, cppchar_t n,
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	978	struct _cpp_strbuf *tbuf, bool wide)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	979	{
				980	if (wide)
				981	{
				982	/* We have to render this into the target byte order, which may not
				983	be our byte order. */
				984	bool bigend = CPP_OPTION (pfile, bytes_big_endian);
				985	size_t width = CPP_OPTION (pfile, wchar_precision);
				986	size_t cwidth = CPP_OPTION (pfile, char_precision);
				987	size_t cmask = width_to_mask (cwidth);
				988	size_t nbwc = width / cwidth;
				989	size_t i;
				990	size_t off = tbuf->len;
				991	cppchar_t c;
				992
				993	if (tbuf->len + nbwc > tbuf->asize)
				994	{
				995	tbuf->asize += OUTBUF_BLOCK_SIZE;
				996	tbuf->text = xrealloc (tbuf->text, tbuf->asize);
				997	}
				998
				999	for (i = 0; i < nbwc; i++)
				1000	{
				1001	c = n & cmask;
				1002	n >>= cwidth;
				1003	tbuf->text[off + (bigend ? nbwc - i - 1 : i)] = c;
				1004	}
				1005	tbuf->len += nbwc;
				1006	}
				1007	else
				1008	{
Zack Weinberg	a29f62d	2004-09-18 00:56:19 +0000	[diff] [blame]	1009	/* Note: this code does not handle the case where the target
				1010	and host have a different number of bits in a byte. */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1011	if (tbuf->len + 1 > tbuf->asize)
				1012	{
				1013	tbuf->asize += OUTBUF_BLOCK_SIZE;
				1014	tbuf->text = xrealloc (tbuf->text, tbuf->asize);
				1015	}
				1016	tbuf->text[tbuf->len++] = n;
				1017	}
				1018	}
				1019
				1020	/* Convert a hexadecimal escape, pointed to by FROM, to the execution
				1021	character set and write it into the string buffer TBUF. Returns an
				1022	advanced pointer, and issues diagnostics as necessary.
				1023	No character set translation occurs; this routine always produces the
				1024	execution-set character with numeric value equal to the given hex
				1025	number. You can, e.g. generate surrogate pairs this way. */
				1026	static const uchar *
				1027	convert_hex (cpp_reader pfile, const uchar from, const uchar *limit,
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	1028	struct _cpp_strbuf *tbuf, bool wide)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1029	{
				1030	cppchar_t c, n = 0, overflow = 0;
				1031	int digits_found = 0;
				1032	size_t width = (wide ? CPP_OPTION (pfile, wchar_precision)
				1033	: CPP_OPTION (pfile, char_precision));
				1034	size_t mask = width_to_mask (width);
				1035
				1036	if (CPP_WTRADITIONAL (pfile))
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1037	cpp_error (pfile, CPP_DL_WARNING,
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1038	"the meaning of '\\x' is different in traditional C");
				1039
Kazu Hirata	e0a21ab	2004-01-16 01:44:06 +0000	[diff] [blame]	1040	from++; /* Skip 'x'. */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1041	while (from < limit)
				1042	{
				1043	c = *from;
				1044	if (! hex_p (c))
				1045	break;
				1046	from++;
				1047	overflow \|= n ^ (n << 4 >> 4);
				1048	n = (n << 4) + hex_value (c);
				1049	digits_found = 1;
				1050	}
				1051
				1052	if (!digits_found)
				1053	{
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1054	cpp_error (pfile, CPP_DL_ERROR,
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1055	"\\x used with no following hex digits");
				1056	return from;
				1057	}
				1058
				1059	if (overflow \| (n != (n & mask)))
				1060	{
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1061	cpp_error (pfile, CPP_DL_PEDWARN,
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1062	"hex escape sequence out of range");
				1063	n &= mask;
				1064	}
				1065
				1066	emit_numeric_escape (pfile, n, tbuf, wide);
				1067
				1068	return from;
				1069	}
				1070
				1071	/* Convert an octal escape, pointed to by FROM, to the execution
				1072	character set and write it into the string buffer TBUF. Returns an
				1073	advanced pointer, and issues diagnostics as necessary.
				1074	No character set translation occurs; this routine always produces the
				1075	execution-set character with numeric value equal to the given octal
				1076	number. */
				1077	static const uchar *
				1078	convert_oct (cpp_reader pfile, const uchar from, const uchar *limit,
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	1079	struct _cpp_strbuf *tbuf, bool wide)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1080	{
				1081	size_t count = 0;
				1082	cppchar_t c, n = 0;
				1083	size_t width = (wide ? CPP_OPTION (pfile, wchar_precision)
				1084	: CPP_OPTION (pfile, char_precision));
				1085	size_t mask = width_to_mask (width);
				1086	bool overflow = false;
				1087
				1088	while (from < limit && count++ < 3)
				1089	{
				1090	c = *from;
				1091	if (c < '0' \|\| c > '7')
				1092	break;
				1093	from++;
				1094	overflow \|= n ^ (n << 3 >> 3);
				1095	n = (n << 3) + c - '0';
				1096	}
				1097
				1098	if (n != (n & mask))
				1099	{
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1100	cpp_error (pfile, CPP_DL_PEDWARN,
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1101	"octal escape sequence out of range");
				1102	n &= mask;
				1103	}
				1104
				1105	emit_numeric_escape (pfile, n, tbuf, wide);
				1106
				1107	return from;
				1108	}
				1109
				1110	/* Convert an escape sequence (pointed to by FROM) to its value on
				1111	the target, and to the execution character set. Do not scan past
				1112	LIMIT. Write the converted value into TBUF. Returns an advanced
				1113	pointer. Handles all relevant diagnostics. */
				1114	static const uchar *
				1115	convert_escape (cpp_reader pfile, const uchar from, const uchar *limit,
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	1116	struct _cpp_strbuf *tbuf, bool wide)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1117	{
				1118	/* Values of \a \b \e \f \n \r \t \v respectively. */
				1119	#if HOST_CHARSET == HOST_CHARSET_ASCII
				1120	static const uchar charconsts[] = { 7, 8, 27, 12, 10, 13, 9, 11 };
				1121	#elif HOST_CHARSET == HOST_CHARSET_EBCDIC
				1122	static const uchar charconsts[] = { 47, 22, 39, 12, 21, 13, 5, 11 };
				1123	#else
				1124	#error "unknown host character set"
				1125	#endif
				1126
				1127	uchar c;
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1128	struct cset_converter cvt
				1129	= wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1130
				1131	c = *from;
				1132	switch (c)
				1133	{
				1134	/* UCNs, hex escapes, and octal escapes are processed separately. */
				1135	case 'u': case 'U':
				1136	return convert_ucn (pfile, from, limit, tbuf, wide);
				1137
				1138	case 'x':
				1139	return convert_hex (pfile, from, limit, tbuf, wide);
				1140	break;
				1141
				1142	case '0': case '1': case '2': case '3':
				1143	case '4': case '5': case '6': case '7':
				1144	return convert_oct (pfile, from, limit, tbuf, wide);
				1145
				1146	/* Various letter escapes. Get the appropriate host-charset
				1147	value into C. */
				1148	case '\\': case '\'': case '"': case '?': break;
				1149
				1150	case '(': case '{': case '[': case '%':
				1151	/* '\(', etc, can be used at the beginning of a line in a long
				1152	string split onto multiple lines with \-newline, to prevent
				1153	Emacs or other text editors from getting confused. '\%' can
				1154	be used to prevent SCCS from mangling printf format strings. */
				1155	if (CPP_PEDANTIC (pfile))
				1156	goto unknown;
				1157	break;
				1158
				1159	case 'b': c = charconsts[1]; break;
				1160	case 'f': c = charconsts[3]; break;
				1161	case 'n': c = charconsts[4]; break;
				1162	case 'r': c = charconsts[5]; break;
				1163	case 't': c = charconsts[6]; break;
				1164	case 'v': c = charconsts[7]; break;
				1165
				1166	case 'a':
				1167	if (CPP_WTRADITIONAL (pfile))
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1168	cpp_error (pfile, CPP_DL_WARNING,
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1169	"the meaning of '\\a' is different in traditional C");
				1170	c = charconsts[0];
				1171	break;
				1172
				1173	case 'e': case 'E':
				1174	if (CPP_PEDANTIC (pfile))
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1175	cpp_error (pfile, CPP_DL_PEDWARN,
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1176	"non-ISO-standard escape sequence, '\\%c'", (int) c);
				1177	c = charconsts[2];
				1178	break;
				1179
				1180	default:
				1181	unknown:
				1182	if (ISGRAPH (c))
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1183	cpp_error (pfile, CPP_DL_PEDWARN,
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1184	"unknown escape sequence '\\%c'", (int) c);
				1185	else
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1186	cpp_error (pfile, CPP_DL_PEDWARN,
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1187	"unknown escape sequence: '\\%03o'", (int) c);
				1188	}
				1189
				1190	/* Now convert what we have to the execution character set. */
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1191	if (!APPLY_CONVERSION (cvt, &c, 1, tbuf))
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1192	cpp_errno (pfile, CPP_DL_ERROR,
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1193	"converting escape sequence to execution character set");
				1194
				1195	return from + 1;
				1196	}
				1197
				1198	/* FROM is an array of cpp_string structures of length COUNT. These
				1199	are to be converted from the source to the execution character set,
				1200	escape sequences translated, and finally all are to be
				1201	concatenated. WIDE indicates whether or not to produce a wide
				1202	string. The result is written into TO. Returns true for success,
				1203	false for failure. */
				1204	bool
				1205	cpp_interpret_string (cpp_reader pfile, const cpp_string from, size_t count,
				1206	cpp_string *to, bool wide)
				1207	{
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	1208	struct _cpp_strbuf tbuf;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1209	const uchar p, base, *limit;
				1210	size_t i;
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1211	struct cset_converter cvt
				1212	= wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1213
				1214	tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, from->len);
				1215	tbuf.text = xmalloc (tbuf.asize);
				1216	tbuf.len = 0;
				1217
				1218	for (i = 0; i < count; i++)
				1219	{
				1220	p = from[i].text;
				1221	if (*p == 'L') p++;
Kazu Hirata	e0a21ab	2004-01-16 01:44:06 +0000	[diff] [blame]	1222	p++; /* Skip leading quote. */
				1223	limit = from[i].text + from[i].len - 1; /* Skip trailing quote. */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1224
				1225	for (;;)
				1226	{
				1227	base = p;
				1228	while (p < limit && *p != '\\')
				1229	p++;
				1230	if (p > base)
				1231	{
				1232	/* We have a run of normal characters; these can be fed
				1233	directly to convert_cset. */
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1234	if (!APPLY_CONVERSION (cvt, base, p - base, &tbuf))
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1235	goto fail;
				1236	}
				1237	if (p == limit)
				1238	break;
				1239
				1240	p = convert_escape (pfile, p + 1, limit, &tbuf, wide);
				1241	}
				1242	}
				1243	/* NUL-terminate the 'to' buffer and translate it to a cpp_string
				1244	structure. */
				1245	emit_numeric_escape (pfile, 0, &tbuf, wide);
				1246	tbuf.text = xrealloc (tbuf.text, tbuf.len);
				1247	to->text = tbuf.text;
				1248	to->len = tbuf.len;
				1249	return true;
				1250
				1251	fail:
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1252	cpp_errno (pfile, CPP_DL_ERROR, "converting to execution character set");
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1253	free (tbuf.text);
				1254	return false;
				1255	}
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1256
				1257	/* Subroutine of do_line and do_linemarker. Convert escape sequences
				1258	in a string, but do not perform character set conversion. */
				1259	bool
Eric Christopher	423e95e	2004-02-12 02:25:03 +0000	[diff] [blame]	1260	cpp_interpret_string_notranslate (cpp_reader pfile, const cpp_string from,
				1261	size_t count, cpp_string *to, bool wide)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1262	{
				1263	struct cset_converter save_narrow_cset_desc = pfile->narrow_cset_desc;
				1264	bool retval;
				1265
				1266	pfile->narrow_cset_desc.func = convert_no_conversion;
				1267	pfile->narrow_cset_desc.cd = (iconv_t) -1;
				1268
Eric Christopher	423e95e	2004-02-12 02:25:03 +0000	[diff] [blame]	1269	retval = cpp_interpret_string (pfile, from, count, to, wide);
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1270
				1271	pfile->narrow_cset_desc = save_narrow_cset_desc;
				1272	return retval;
				1273	}
				1274
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1275
				1276	/* Subroutine of cpp_interpret_charconst which performs the conversion
				1277	to a number, for narrow strings. STR is the string structure returned
				1278	by cpp_interpret_string. PCHARS_SEEN and UNSIGNEDP are as for
				1279	cpp_interpret_charconst. */
				1280	static cppchar_t
				1281	narrow_str_to_charconst (cpp_reader *pfile, cpp_string str,
				1282	unsigned int pchars_seen, int unsignedp)
				1283	{
				1284	size_t width = CPP_OPTION (pfile, char_precision);
				1285	size_t max_chars = CPP_OPTION (pfile, int_precision) / width;
				1286	size_t mask = width_to_mask (width);
				1287	size_t i;
				1288	cppchar_t result, c;
				1289	bool unsigned_p;
				1290
				1291	/* The value of a multi-character character constant, or a
				1292	single-character character constant whose representation in the
				1293	execution character set is more than one byte long, is
				1294	implementation defined. This implementation defines it to be the
				1295	number formed by interpreting the byte sequence in memory as a
				1296	big-endian binary number. If overflow occurs, the high bytes are
				1297	lost, and a warning is issued.
				1298
				1299	We don't want to process the NUL terminator handed back by
				1300	cpp_interpret_string. */
				1301	result = 0;
				1302	for (i = 0; i < str.len - 1; i++)
				1303	{
				1304	c = str.text[i] & mask;
				1305	if (width < BITS_PER_CPPCHAR_T)
				1306	result = (result << width) \| c;
				1307	else
				1308	result = c;
				1309	}
				1310
				1311	if (i > max_chars)
				1312	{
				1313	i = max_chars;
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1314	cpp_error (pfile, CPP_DL_WARNING,
				1315	"character constant too long for its type");
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1316	}
				1317	else if (i > 1 && CPP_OPTION (pfile, warn_multichar))
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1318	cpp_error (pfile, CPP_DL_WARNING, "multi-character character constant");
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1319
				1320	/* Multichar constants are of type int and therefore signed. */
				1321	if (i > 1)
				1322	unsigned_p = 0;
				1323	else
				1324	unsigned_p = CPP_OPTION (pfile, unsigned_char);
				1325
				1326	/* Truncate the constant to its natural width, and simultaneously
				1327	sign- or zero-extend to the full width of cppchar_t.
				1328	For single-character constants, the value is WIDTH bits wide.
				1329	For multi-character constants, the value is INT_PRECISION bits wide. */
				1330	if (i > 1)
				1331	width = CPP_OPTION (pfile, int_precision);
				1332	if (width < BITS_PER_CPPCHAR_T)
				1333	{
				1334	mask = ((cppchar_t) 1 << width) - 1;
				1335	if (unsigned_p \|\| !(result & (1 << (width - 1))))
				1336	result &= mask;
				1337	else
				1338	result \|= ~mask;
				1339	}
				1340	*pchars_seen = i;
				1341	*unsignedp = unsigned_p;
				1342	return result;
				1343	}
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	1344
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1345	/* Subroutine of cpp_interpret_charconst which performs the conversion
				1346	to a number, for wide strings. STR is the string structure returned
				1347	by cpp_interpret_string. PCHARS_SEEN and UNSIGNEDP are as for
				1348	cpp_interpret_charconst. */
				1349	static cppchar_t
				1350	wide_str_to_charconst (cpp_reader *pfile, cpp_string str,
				1351	unsigned int pchars_seen, int unsignedp)
				1352	{
				1353	bool bigend = CPP_OPTION (pfile, bytes_big_endian);
				1354	size_t width = CPP_OPTION (pfile, wchar_precision);
				1355	size_t cwidth = CPP_OPTION (pfile, char_precision);
				1356	size_t mask = width_to_mask (width);
				1357	size_t cmask = width_to_mask (cwidth);
				1358	size_t nbwc = width / cwidth;
				1359	size_t off, i;
				1360	cppchar_t result = 0, c;
				1361
				1362	/* This is finicky because the string is in the target's byte order,
				1363	which may not be our byte order. Only the last character, ignoring
				1364	the NUL terminator, is relevant. */
				1365	off = str.len - (nbwc * 2);
				1366	result = 0;
				1367	for (i = 0; i < nbwc; i++)
				1368	{
				1369	c = bigend ? str.text[off + i] : str.text[off + nbwc - i - 1];
				1370	result = (result << cwidth) \| (c & cmask);
				1371	}
				1372
				1373	/* Wide character constants have type wchar_t, and a single
				1374	character exactly fills a wchar_t, so a multi-character wide
				1375	character constant is guaranteed to overflow. */
				1376	if (off > 0)
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1377	cpp_error (pfile, CPP_DL_WARNING,
				1378	"character constant too long for its type");
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1379
				1380	/* Truncate the constant to its natural width, and simultaneously
				1381	sign- or zero-extend to the full width of cppchar_t. */
				1382	if (width < BITS_PER_CPPCHAR_T)
				1383	{
				1384	if (CPP_OPTION (pfile, unsigned_wchar) \|\| !(result & (1 << (width - 1))))
				1385	result &= mask;
				1386	else
				1387	result \|= ~mask;
				1388	}
				1389
				1390	*unsignedp = CPP_OPTION (pfile, unsigned_wchar);
				1391	*pchars_seen = 1;
				1392	return result;
				1393	}
				1394
				1395	/* Interpret a (possibly wide) character constant in TOKEN.
				1396	PCHARS_SEEN points to a variable that is filled in with the number
				1397	of characters seen, and UNSIGNEDP to a variable that indicates
				1398	whether the result has signed type. */
				1399	cppchar_t
				1400	cpp_interpret_charconst (cpp_reader pfile, const cpp_token token,
				1401	unsigned int pchars_seen, int unsignedp)
				1402	{
				1403	cpp_string str = { 0, 0 };
				1404	bool wide = (token->type == CPP_WCHAR);
				1405	cppchar_t result;
				1406
				1407	/* an empty constant will appear as L'' or '' */
				1408	if (token->val.str.len == (size_t) (2 + wide))
				1409	{
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1410	cpp_error (pfile, CPP_DL_ERROR, "empty character constant");
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1411	return 0;
				1412	}
				1413	else if (!cpp_interpret_string (pfile, &token->val.str, 1, &str, wide))
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1414	return 0;
				1415
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1416	if (wide)
				1417	result = wide_str_to_charconst (pfile, str, pchars_seen, unsignedp);
				1418	else
				1419	result = narrow_str_to_charconst (pfile, str, pchars_seen, unsignedp);
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1420
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1421	if (str.text != token->val.str.text)
				1422	free ((void *)str.text);
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1423
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1424	return result;
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1425	}
Geoffrey Keating	47e2049	2005-03-12 10:44:06 +0000	[diff] [blame]	1426
				1427	/* Convert an identifier denoted by ID and LEN, which might contain
				1428	UCN escapes, to the source character set, either UTF-8 or
				1429	UTF-EBCDIC. Assumes that the identifier is actually a valid identifier. */
				1430	cpp_hashnode *
				1431	_cpp_interpret_identifier (cpp_reader pfile, const uchar id, size_t len)
				1432	{
				1433	/* It turns out that a UCN escape always turns into fewer characters
				1434	than the escape itself, so we can allocate a temporary in advance. */
				1435	uchar * buf = alloca (len + 1);
				1436	uchar * bufp = buf;
				1437	size_t idp;
				1438
				1439	for (idp = 0; idp < len; idp++)
				1440	if (id[idp] != '\\')
				1441	*bufp++ = id[idp];
				1442	else
				1443	{
				1444	unsigned length = id[idp+1] == 'u' ? 4 : 8;
				1445	cppchar_t value = 0;
				1446	size_t bufleft = len - (bufp - buf);
				1447	int rval;
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	1448
Geoffrey Keating	47e2049	2005-03-12 10:44:06 +0000	[diff] [blame]	1449	idp += 2;
				1450	while (length && idp < len && ISXDIGIT (id[idp]))
				1451	{
				1452	value = (value << 4) + hex_value (id[idp]);
				1453	idp++;
				1454	length--;
				1455	}
				1456	idp--;
				1457
				1458	/* Special case for EBCDIC: if the identifier contains
				1459	a '$' specified using a UCN, translate it to EBCDIC. */
				1460	if (value == 0x24)
				1461	{
				1462	*bufp++ = '$';
				1463	continue;
				1464	}
				1465
				1466	rval = one_cppchar_to_utf8 (value, &bufp, &bufleft);
				1467	if (rval)
				1468	{
				1469	errno = rval;
				1470	cpp_errno (pfile, CPP_DL_ERROR,
				1471	"converting UCN to source character set");
				1472	break;
				1473	}
				1474	}
				1475
				1476	return CPP_HASHNODE (ht_lookup (pfile->hash_table,
				1477	buf, bufp - buf, HT_ALLOC));
				1478	}
				1479
Zack Weinberg	a29f62d	2004-09-18 00:56:19 +0000	[diff] [blame]	1480	/* Convert an input buffer (containing the complete contents of one
				1481	source file) from INPUT_CHARSET to the source character set. INPUT
				1482	points to the input buffer, SIZE is its allocated size, and LEN is
				1483	the length of the meaningful data within the buffer. The
				1484	translated buffer is returned, and *ST_SIZE is set to the length of
				1485	the meaningful data within the translated buffer.
				1486
				1487	INPUT is expected to have been allocated with xmalloc. This function
				1488	will either return INPUT, or free it and return a pointer to another
				1489	xmalloc-allocated block of memory. */
				1490	uchar *
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	1491	_cpp_convert_input (cpp_reader pfile, const char input_charset,
				1492	uchar input, size_t size, size_t len, off_t st_size)
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	1493	{
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	1494	struct cset_converter input_cset;
				1495	struct _cpp_strbuf to;
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	1496
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	1497	input_cset = init_iconv_desc (pfile, SOURCE_CHARSET, input_charset);
				1498	if (input_cset.func == convert_no_conversion)
				1499	{
				1500	to.text = input;
				1501	to.asize = size;
				1502	to.len = len;
				1503	}
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	1504	else
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	1505	{
				1506	to.asize = MAX (65536, len);
				1507	to.text = xmalloc (to.asize);
				1508	to.len = 0;
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	1509
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	1510	if (!APPLY_CONVERSION (input_cset, input, len, &to))
				1511	cpp_error (pfile, CPP_DL_ERROR,
				1512	"failure to convert %s to %s",
				1513	CPP_OPTION (pfile, input_charset), SOURCE_CHARSET);
				1514
				1515	free (input);
				1516	}
				1517
				1518	/* Clean up the mess. */
				1519	if (input_cset.func == convert_using_iconv)
				1520	iconv_close (input_cset.cd);
				1521
				1522	/* Resize buffer if we allocated substantially too much, or if we
				1523	haven't enough space for the \n-terminator. */
				1524	if (to.len + 4096 < to.asize \|\| to.len >= to.asize)
				1525	to.text = xrealloc (to.text, to.len + 1);
				1526
Devang Patel	04c90ee	2005-02-19 11:48:02 -0800	[diff] [blame]	1527	/* If the file is using old-school Mac line endings (\r only),
				1528	terminate with another \r, not an \n, so that we do not mistake
				1529	the \r\n sequence for a single DOS line ending and erroneously
				1530	issue the "No newline at end of file" diagnostic. */
				1531	if (to.text[to.len - 1] == '\r')
				1532	to.text[to.len] = '\r';
				1533	else
				1534	to.text[to.len] = '\n';
				1535
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	1536	*st_size = to.len;
				1537	return to.text;
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	1538	}
				1539
Zack Weinberg	a29f62d	2004-09-18 00:56:19 +0000	[diff] [blame]	1540	/* Decide on the default encoding to assume for input files. */
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	1541	const char *
				1542	_cpp_default_encoding (void)
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	1543	{
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	1544	const char *current_encoding = NULL;
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	1545
Paolo Bonzini	4f4e53dd	2004-05-24 10:50:45 +0000	[diff] [blame]	1546	/* We disable this because the default codeset is 7-bit ASCII on
				1547	most platforms, and this causes conversion failures on every
				1548	file in GCC that happens to have one of the upper 128 characters
				1549	in it -- most likely, as part of the name of a contributor.
				1550	We should definitely recognize in-band markers of file encoding,
				1551	like:
				1552	- the appropriate Unicode byte-order mark (FE FF) to recognize
				1553	UTF16 and UCS4 (in both big-endian and little-endian flavors)
				1554	and UTF8
Zack Weinberg	c6e8380	2004-06-05 20:58:06 +0000	[diff] [blame]	1555	- a "#i", "#d", "/ *", "//", " #p" or "#p" (for #pragma) to
Paolo Bonzini	4f4e53dd	2004-05-24 10:50:45 +0000	[diff] [blame]	1556	distinguish ASCII and EBCDIC.
				1557	- now we can parse something like "#pragma GCC encoding <xyz>
				1558	on the first line, or even Emacs/VIM's mode line tags (there's
				1559	a problem here in that VIM uses the last line, and Emacs has
Zack Weinberg	a29f62d	2004-09-18 00:56:19 +0000	[diff] [blame]	1560	its more elaborate "local variables" convention).
Paolo Bonzini	4f4e53dd	2004-05-24 10:50:45 +0000	[diff] [blame]	1561	- investigate whether Java has another common convention, which
				1562	would be friendly to support.
				1563	(Zack Weinberg and Paolo Bonzini, May 20th 2004) */
				1564	#if defined (HAVE_LOCALE_H) && defined (HAVE_LANGINFO_CODESET) && 0
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	1565	setlocale (LC_CTYPE, "");
				1566	current_encoding = nl_langinfo (CODESET);
				1567	#endif
				1568	if (current_encoding == NULL \|\| *current_encoding == '\0')
				1569	current_encoding = SOURCE_CHARSET;
				1570
				1571	return current_encoding;
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	1572	}