Blame - libcpp/charset.c - toolchain/gcc

blob: f028b371440bdbdb58296e31d3833b0926a0ecc0 [file] [log] [blame]

Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1	/* CPP Library - charsets
Kazu Hirata	d9221e01	2004-01-21 20:40:04 +0000	[diff] [blame]	2	Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	3	Free Software Foundation, Inc.
				4
				5	Broken out of c-lex.c Apr 2003, adding valid C99 UCN ranges.
				6
				7	This program is free software; you can redistribute it and/or modify it
				8	under the terms of the GNU General Public License as published by the
				9	Free Software Foundation; either version 2, or (at your option) any
				10	later version.
				11
				12	This program is distributed in the hope that it will be useful,
				13	but WITHOUT ANY WARRANTY; without even the implied warranty of
				14	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
				15	GNU General Public License for more details.
				16
				17	You should have received a copy of the GNU General Public License
				18	along with this program; if not, write to the Free Software
				19	Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
				20
				21	#include "config.h"
				22	#include "system.h"
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	23	#include "cpplib.h"
Paolo Bonzini	4f4e53dd	2004-05-24 10:50:45 +0000	[diff] [blame]	24	#include "internal.h"
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	25
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	26	/* Character set handling for C-family languages.
				27
				28	Terminological note: In what follows, "charset" or "character set"
				29	will be taken to mean both an abstract set of characters and an
				30	encoding for that set.
				31
				32	The C99 standard discusses two character sets: source and execution.
				33	The source character set is used for internal processing in translation
				34	phases 1 through 4; the execution character set is used thereafter.
				35	Both are required by 5.2.1.2p1 to be multibyte encodings, not wide
				36	character encodings (see 3.7.2, 3.7.3 for the standardese meanings
				37	of these terms). Furthermore, the "basic character set" (listed in
				38	5.2.1p3) is to be encoded in each with values one byte wide, and is
				39	to appear in the initial shift state.
				40
				41	It is not explicitly mentioned, but there is also a "wide execution
				42	character set" used to encode wide character constants and wide
				43	string literals; this is supposed to be the result of applying the
				44	standard library function mbstowcs() to an equivalent narrow string
				45	(6.4.5p5). However, the behavior of hexadecimal and octal
				46	\-escapes is at odds with this; they are supposed to be translated
				47	directly to wchar_t values (6.4.4.4p5,6).
				48
				49	The source character set is not necessarily the character set used
				50	to encode physical source files on disk; translation phase 1 converts
				51	from whatever that encoding is to the source character set.
				52
				53	The presence of universal character names in C99 (6.4.3 et seq.)
				54	forces the source character set to be isomorphic to ISO 10646,
				55	that is, Unicode. There is no such constraint on the execution
				56	character set; note also that the conversion from source to
				57	execution character set does not occur for identifiers (5.1.1.2p1#5).
				58
				59	For convenience of implementation, the source character set's
				60	encoding of the basic character set should be identical to the
				61	execution character set OF THE HOST SYSTEM's encoding of the basic
				62	character set, and it should not be a state-dependent encoding.
				63
				64	cpplib uses UTF-8 or UTF-EBCDIC for the source character set,
				65	depending on whether the host is based on ASCII or EBCDIC (see
				66	respectively Unicode section 2.3/ISO10646 Amendment 2, and Unicode
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	67	Technical Report #16). With limited exceptions, it relies on the
				68	system library's iconv() primitive to do charset conversion
				69	(specified in SUSv2). */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	70
				71	#if !HAVE_ICONV
				72	/* Make certain that the uses of iconv(), iconv_open(), iconv_close()
				73	below, which are guarded only by if statements with compile-time
				74	constant conditions, do not cause link errors. */
				75	#define iconv_open(x, y) (errno = EINVAL, (iconv_t)-1)
Zack Weinberg	f1c4bc4	2003-07-05 16:44:29 +0200	[diff] [blame]	76	#define iconv(a,b,c,d,e) (errno = EINVAL, (size_t)-1)
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	77	#define iconv_close(x) (void)0
Andrew Pinski	5beadb3	2003-07-07 04:46:29 +0000	[diff] [blame]	78	#define ICONV_CONST
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	79	#endif
				80
				81	#if HOST_CHARSET == HOST_CHARSET_ASCII
				82	#define SOURCE_CHARSET "UTF-8"
Zack Weinberg	c5ff069	2005-02-20 17:01:32 +0000	[diff] [blame]	83	#define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0x7e
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	84	#elif HOST_CHARSET == HOST_CHARSET_EBCDIC
				85	#define SOURCE_CHARSET "UTF-EBCDIC"
Zack Weinberg	c5ff069	2005-02-20 17:01:32 +0000	[diff] [blame]	86	#define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0xFF
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	87	#else
				88	#error "Unrecognized basic host character set"
				89	#endif
				90
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	91	#ifndef EILSEQ
				92	#define EILSEQ EINVAL
				93	#endif
				94
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	95	/* This structure is used for a resizable string buffer throughout. */
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	96	/* Don't call it strbuf, as that conflicts with unistd.h on systems
Kazu Hirata	0ee55ad	2003-10-05 13:09:48 +0000	[diff] [blame]	97	such as DYNIX/ptx where unistd.h includes stropts.h. */
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	98	struct _cpp_strbuf
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	99	{
				100	uchar *text;
				101	size_t asize;
				102	size_t len;
				103	};
				104
				105	/* This is enough to hold any string that fits on a single 80-column
				106	line, even if iconv quadruples its size (e.g. conversion from
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	107	ASCII to UTF-32) rounded up to a power of two. */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	108	#define OUTBUF_BLOCK_SIZE 256
				109
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	110	/* Conversions between UTF-8 and UTF-16/32 are implemented by custom
				111	logic. This is because a depressing number of systems lack iconv,
				112	or have iconv libraries that do not do these conversions, so
				113	we need a fallback implementation for them. To ensure the fallback
				114	doesn't break due to neglect, it is used on all systems.
				115
				116	UTF-32 encoding is nice and simple: a four-byte binary number,
				117	constrained to the range 00000000-7FFFFFFF to avoid questions of
				118	signedness. We do have to cope with big- and little-endian
				119	variants.
				120
				121	UTF-16 encoding uses two-byte binary numbers, again in big- and
				122	little-endian variants, for all values in the 00000000-0000FFFF
				123	range. Values in the 00010000-0010FFFF range are encoded as pairs
				124	of two-byte numbers, called "surrogate pairs": given a number S in
				125	this range, it is mapped to a pair (H, L) as follows:
				126
				127	H = (S - 0x10000) / 0x400 + 0xD800
				128	L = (S - 0x10000) % 0x400 + 0xDC00
				129
				130	Two-byte values in the D800...DFFF range are ill-formed except as a
				131	component of a surrogate pair. Even if the encoding within a
				132	two-byte value is little-endian, the H member of the surrogate pair
				133	comes first.
				134
				135	There is no way to encode values in the 00110000-7FFFFFFF range,
				136	which is not currently a problem as there are no assigned code
				137	points in that range; however, the author expects that it will
				138	eventually become necessary to abandon UTF-16 due to this
				139	limitation. Note also that, because of these pairs, UTF-16 does
				140	not meet the requirements of the C standard for a wide character
				141	encoding (see 3.7.3 and 6.4.4.4p11).
				142
				143	UTF-8 encoding looks like this:
				144
				145	value range encoded as
				146	00000000-0000007F 0xxxxxxx
				147	00000080-000007FF 110xxxxx 10xxxxxx
				148	00000800-0000FFFF 1110xxxx 10xxxxxx 10xxxxxx
				149	00010000-001FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
				150	00200000-03FFFFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
				151	04000000-7FFFFFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
				152
				153	Values in the 0000D800 ... 0000DFFF range (surrogates) are invalid,
				154	which means that three-byte sequences ED xx yy, with A0 <= xx <= BF,
				155	never occur. Note also that any value that can be encoded by a
				156	given row of the table can also be encoded by all successive rows,
				157	but this is not done; only the shortest possible encoding for any
				158	given value is valid. For instance, the character 07C0 could be
				159	encoded as any of DF 80, E0 9F 80, F0 80 9F 80, F8 80 80 9F 80, or
				160	FC 80 80 80 9F 80. Only the first is valid.
				161
				162	An implementation note: the transformation from UTF-16 to UTF-8, or
				163	vice versa, is easiest done by using UTF-32 as an intermediary. */
				164
				165	/* Internal primitives which go from an UTF-8 byte stream to native-endian
				166	UTF-32 in a cppchar_t, or vice versa; this avoids an extra marshal/unmarshal
				167	operation in several places below. */
				168	static inline int
				169	one_utf8_to_cppchar (const uchar *inbufp, size_t inbytesleftp,
				170	cppchar_t *cp)
				171	{
				172	static const uchar masks[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x02, 0x01 };
				173	static const uchar patns[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	174
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	175	cppchar_t c;
				176	const uchar inbuf = inbufp;
				177	size_t nbytes, i;
				178
				179	if (*inbytesleftp < 1)
				180	return EINVAL;
				181
				182	c = *inbuf;
				183	if (c < 0x80)
				184	{
				185	*cp = c;
				186	*inbytesleftp -= 1;
				187	*inbufp += 1;
				188	return 0;
				189	}
				190
				191	/* The number of leading 1-bits in the first byte indicates how many
				192	bytes follow. */
				193	for (nbytes = 2; nbytes < 7; nbytes++)
				194	if ((c & ~masks[nbytes-1]) == patns[nbytes-1])
				195	goto found;
				196	return EILSEQ;
				197	found:
				198
				199	if (*inbytesleftp < nbytes)
				200	return EINVAL;
				201
				202	c = (c & masks[nbytes-1]);
				203	inbuf++;
				204	for (i = 1; i < nbytes; i++)
				205	{
				206	cppchar_t n = *inbuf++;
				207	if ((n & 0xC0) != 0x80)
				208	return EILSEQ;
				209	c = ((c << 6) + (n & 0x3F));
				210	}
				211
				212	/* Make sure the shortest possible encoding was used. */
				213	if (c <= 0x7F && nbytes > 1) return EILSEQ;
				214	if (c <= 0x7FF && nbytes > 2) return EILSEQ;
				215	if (c <= 0xFFFF && nbytes > 3) return EILSEQ;
				216	if (c <= 0x1FFFFF && nbytes > 4) return EILSEQ;
				217	if (c <= 0x3FFFFFF && nbytes > 5) return EILSEQ;
				218
				219	/* Make sure the character is valid. */
				220	if (c > 0x7FFFFFFF \|\| (c >= 0xD800 && c <= 0xDFFF)) return EILSEQ;
				221
				222	*cp = c;
				223	*inbufp = inbuf;
				224	*inbytesleftp -= nbytes;
				225	return 0;
				226	}
				227
				228	static inline int
				229	one_cppchar_to_utf8 (cppchar_t c, uchar *outbufp, size_t outbytesleftp)
				230	{
				231	static const uchar masks[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
				232	static const uchar limits[6] = { 0x80, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE };
				233	size_t nbytes;
				234	uchar buf[6], *p = &buf[6];
				235	uchar outbuf = outbufp;
				236
				237	nbytes = 1;
				238	if (c < 0x80)
				239	*--p = c;
				240	else
				241	{
				242	do
				243	{
				244	*--p = ((c & 0x3F) \| 0x80);
				245	c >>= 6;
				246	nbytes++;
				247	}
				248	while (c >= 0x3F \|\| (c & limits[nbytes-1]));
				249	*--p = (c \| masks[nbytes-1]);
				250	}
				251
				252	if (*outbytesleftp < nbytes)
				253	return E2BIG;
				254
				255	while (p < &buf[6])
				256	outbuf++ = p++;
				257	*outbytesleftp -= nbytes;
				258	*outbufp = outbuf;
				259	return 0;
				260	}
				261
				262	/* The following four functions transform one character between the two
				263	encodings named in the function name. All have the signature
				264	int (*)(iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
				265	uchar **outbufp, size_t *outbytesleftp)
				266
				267	BIGEND must have the value 0 or 1, coerced to (iconv_t); it is
				268	interpreted as a boolean indicating whether big-endian or
				269	little-endian encoding is to be used for the member of the pair
				270	that is not UTF-8.
				271
				272	INBUFP, INBYTESLEFTP, OUTBUFP, OUTBYTESLEFTP work exactly as they
				273	do for iconv.
				274
				275	The return value is either 0 for success, or an errno value for
				276	failure, which may be E2BIG (need more space), EILSEQ (ill-formed
				277	input sequence), or EINVAL (incomplete input sequence). */
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	278
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	279	static inline int
				280	one_utf8_to_utf32 (iconv_t bigend, const uchar *inbufp, size_t inbytesleftp,
				281	uchar *outbufp, size_t outbytesleftp)
				282	{
				283	uchar *outbuf;
Jan Hubicka	bd18496	2003-10-25 17:12:01 +0200	[diff] [blame]	284	cppchar_t s = 0;
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	285	int rval;
				286
				287	/* Check for space first, since we know exactly how much we need. */
				288	if (*outbytesleftp < 4)
				289	return E2BIG;
				290
				291	rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
				292	if (rval)
				293	return rval;
				294
				295	outbuf = *outbufp;
				296	outbuf[bigend ? 3 : 0] = (s & 0x000000FF);
				297	outbuf[bigend ? 2 : 1] = (s & 0x0000FF00) >> 8;
				298	outbuf[bigend ? 1 : 2] = (s & 0x00FF0000) >> 16;
				299	outbuf[bigend ? 0 : 3] = (s & 0xFF000000) >> 24;
				300
				301	*outbufp += 4;
				302	*outbytesleftp -= 4;
				303	return 0;
				304	}
				305
				306	static inline int
				307	one_utf32_to_utf8 (iconv_t bigend, const uchar *inbufp, size_t inbytesleftp,
				308	uchar *outbufp, size_t outbytesleftp)
				309	{
				310	cppchar_t s;
				311	int rval;
				312	const uchar *inbuf;
				313
				314	if (*inbytesleftp < 4)
				315	return EINVAL;
				316
				317	inbuf = *inbufp;
				318
				319	s = inbuf[bigend ? 0 : 3] << 24;
				320	s += inbuf[bigend ? 1 : 2] << 16;
				321	s += inbuf[bigend ? 2 : 1] << 8;
				322	s += inbuf[bigend ? 3 : 0];
				323
				324	if (s >= 0x7FFFFFFF \|\| (s >= 0xD800 && s <= 0xDFFF))
				325	return EILSEQ;
				326
				327	rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
				328	if (rval)
				329	return rval;
				330
				331	*inbufp += 4;
				332	*inbytesleftp -= 4;
				333	return 0;
				334	}
				335
				336	static inline int
				337	one_utf8_to_utf16 (iconv_t bigend, const uchar *inbufp, size_t inbytesleftp,
				338	uchar *outbufp, size_t outbytesleftp)
				339	{
				340	int rval;
Richard Henderson	671ca9e	2003-10-30 08:36:27 -0800	[diff] [blame]	341	cppchar_t s = 0;
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	342	const uchar save_inbuf = inbufp;
				343	size_t save_inbytesleft = *inbytesleftp;
				344	uchar outbuf = outbufp;
				345
				346	rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
				347	if (rval)
				348	return rval;
				349
				350	if (s > 0x0010FFFF)
				351	{
				352	*inbufp = save_inbuf;
				353	*inbytesleftp = save_inbytesleft;
				354	return EILSEQ;
				355	}
				356
				357	if (s < 0xFFFF)
				358	{
				359	if (*outbytesleftp < 2)
				360	{
				361	*inbufp = save_inbuf;
				362	*inbytesleftp = save_inbytesleft;
				363	return E2BIG;
				364	}
				365	outbuf[bigend ? 1 : 0] = (s & 0x00FF);
				366	outbuf[bigend ? 0 : 1] = (s & 0xFF00) >> 8;
				367
				368	*outbufp += 2;
				369	*outbytesleftp -= 2;
				370	return 0;
				371	}
				372	else
				373	{
				374	cppchar_t hi, lo;
				375
				376	if (*outbytesleftp < 4)
				377	{
				378	*inbufp = save_inbuf;
				379	*inbytesleftp = save_inbytesleft;
				380	return E2BIG;
				381	}
				382
				383	hi = (s - 0x10000) / 0x400 + 0xD800;
				384	lo = (s - 0x10000) % 0x400 + 0xDC00;
				385
				386	/* Even if we are little-endian, put the high surrogate first.
				387	??? Matches practice? */
				388	outbuf[bigend ? 1 : 0] = (hi & 0x00FF);
				389	outbuf[bigend ? 0 : 1] = (hi & 0xFF00) >> 8;
				390	outbuf[bigend ? 3 : 2] = (lo & 0x00FF);
				391	outbuf[bigend ? 2 : 3] = (lo & 0xFF00) >> 8;
				392
				393	*outbufp += 4;
				394	*outbytesleftp -= 4;
				395	return 0;
				396	}
				397	}
				398
				399	static inline int
				400	one_utf16_to_utf8 (iconv_t bigend, const uchar *inbufp, size_t inbytesleftp,
				401	uchar *outbufp, size_t outbytesleftp)
				402	{
				403	cppchar_t s;
				404	const uchar inbuf = inbufp;
				405	int rval;
				406
				407	if (*inbytesleftp < 2)
				408	return EINVAL;
				409	s = inbuf[bigend ? 0 : 1] << 8;
				410	s += inbuf[bigend ? 1 : 0];
				411
				412	/* Low surrogate without immediately preceding high surrogate is invalid. */
				413	if (s >= 0xDC00 && s <= 0xDFFF)
				414	return EILSEQ;
				415	/* High surrogate must have a following low surrogate. */
				416	else if (s >= 0xD800 && s <= 0xDBFF)
				417	{
				418	cppchar_t hi = s, lo;
				419	if (*inbytesleftp < 4)
				420	return EINVAL;
				421
				422	lo = inbuf[bigend ? 2 : 3] << 8;
				423	lo += inbuf[bigend ? 3 : 2];
				424
				425	if (lo < 0xDC00 \|\| lo > 0xDFFF)
				426	return EILSEQ;
				427
				428	s = (hi - 0xD800) * 0x400 + (lo - 0xDC00) + 0x10000;
				429	}
				430
				431	rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
				432	if (rval)
				433	return rval;
				434
				435	/* Success - update the input pointers (one_cppchar_to_utf8 has done
				436	the output pointers for us). */
				437	if (s <= 0xFFFF)
				438	{
				439	*inbufp += 2;
				440	*inbytesleftp -= 2;
				441	}
				442	else
				443	{
				444	*inbufp += 4;
				445	*inbytesleftp -= 4;
				446	}
				447	return 0;
				448	}
				449
				450	/* Helper routine for the next few functions. The 'const' on
				451	one_conversion means that we promise not to modify what function is
Kazu Hirata	4ed4321	2003-07-12 22:49:48 +0000	[diff] [blame]	452	pointed to, which lets the inliner see through it. */
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	453
				454	static inline bool
				455	conversion_loop (int (const one_conversion)(iconv_t, const uchar , size_t ,
				456	uchar *, size_t ),
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	457	iconv_t cd, const uchar from, size_t flen, struct _cpp_strbuf to)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	458	{
				459	const uchar *inbuf;
				460	uchar *outbuf;
				461	size_t inbytesleft, outbytesleft;
				462	int rval;
				463
				464	inbuf = from;
				465	inbytesleft = flen;
				466	outbuf = to->text + to->len;
				467	outbytesleft = to->asize - to->len;
				468
				469	for (;;)
				470	{
				471	do
				472	rval = one_conversion (cd, &inbuf, &inbytesleft,
				473	&outbuf, &outbytesleft);
				474	while (inbytesleft && !rval);
				475
				476	if (__builtin_expect (inbytesleft == 0, 1))
				477	{
				478	to->len = to->asize - outbytesleft;
				479	return true;
				480	}
				481	if (rval != E2BIG)
				482	{
				483	errno = rval;
				484	return false;
				485	}
				486
				487	outbytesleft += OUTBUF_BLOCK_SIZE;
				488	to->asize += OUTBUF_BLOCK_SIZE;
				489	to->text = xrealloc (to->text, to->asize);
				490	outbuf = to->text + to->asize - outbytesleft;
				491	}
				492	}
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	493
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	494
				495	/* These functions convert entire strings between character sets.
				496	They all have the signature
				497
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	498	bool (*)(iconv_t cd, const uchar *from, size_t flen, struct _cpp_strbuf *to);
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	499
				500	The input string FROM is converted as specified by the function
				501	name plus the iconv descriptor CD (which may be fake), and the
				502	result appended to TO. On any error, false is returned, otherwise true. */
				503
				504	/* These four use the custom conversion code above. */
				505	static bool
				506	convert_utf8_utf16 (iconv_t cd, const uchar *from, size_t flen,
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	507	struct _cpp_strbuf *to)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	508	{
				509	return conversion_loop (one_utf8_to_utf16, cd, from, flen, to);
				510	}
				511
				512	static bool
				513	convert_utf8_utf32 (iconv_t cd, const uchar *from, size_t flen,
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	514	struct _cpp_strbuf *to)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	515	{
				516	return conversion_loop (one_utf8_to_utf32, cd, from, flen, to);
				517	}
				518
				519	static bool
				520	convert_utf16_utf8 (iconv_t cd, const uchar *from, size_t flen,
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	521	struct _cpp_strbuf *to)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	522	{
				523	return conversion_loop (one_utf16_to_utf8, cd, from, flen, to);
				524	}
				525
				526	static bool
				527	convert_utf32_utf8 (iconv_t cd, const uchar *from, size_t flen,
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	528	struct _cpp_strbuf *to)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	529	{
				530	return conversion_loop (one_utf32_to_utf8, cd, from, flen, to);
				531	}
				532
				533	/* Identity conversion, used when we have no alternative. */
				534	static bool
				535	convert_no_conversion (iconv_t cd ATTRIBUTE_UNUSED,
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	536	const uchar from, size_t flen, struct _cpp_strbuf to)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	537	{
				538	if (to->len + flen > to->asize)
				539	{
				540	to->asize = to->len + flen;
				541	to->text = xrealloc (to->text, to->asize);
				542	}
				543	memcpy (to->text + to->len, from, flen);
				544	to->len += flen;
				545	return true;
				546	}
				547
				548	/* And this one uses the system iconv primitive. It's a little
				549	different, since iconv's interface is a little different. */
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	550	#if HAVE_ICONV
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	551	static bool
				552	convert_using_iconv (iconv_t cd, const uchar *from, size_t flen,
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	553	struct _cpp_strbuf *to)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	554	{
				555	ICONV_CONST char *inbuf;
				556	char *outbuf;
				557	size_t inbytesleft, outbytesleft;
				558
				559	/* Reset conversion descriptor and check that it is valid. */
				560	if (iconv (cd, 0, 0, 0, 0) == (size_t)-1)
				561	return false;
				562
				563	inbuf = (ICONV_CONST char *)from;
				564	inbytesleft = flen;
				565	outbuf = (char *)to->text + to->len;
				566	outbytesleft = to->asize - to->len;
				567
				568	for (;;)
				569	{
				570	iconv (cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
				571	if (__builtin_expect (inbytesleft == 0, 1))
				572	{
				573	to->len = to->asize - outbytesleft;
				574	return true;
				575	}
				576	if (errno != E2BIG)
				577	return false;
				578
				579	outbytesleft += OUTBUF_BLOCK_SIZE;
				580	to->asize += OUTBUF_BLOCK_SIZE;
				581	to->text = xrealloc (to->text, to->asize);
				582	outbuf = (char *)to->text + to->asize - outbytesleft;
				583	}
				584	}
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	585	#else
				586	#define convert_using_iconv 0 /* prevent undefined symbol error below */
				587	#endif
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	588
				589	/* Arrange for the above custom conversion logic to be used automatically
				590	when conversion between a suitable pair of character sets is requested. */
				591
				592	#define APPLY_CONVERSION(CONVERTER, FROM, FLEN, TO) \
				593	CONVERTER.func (CONVERTER.cd, FROM, FLEN, TO)
				594
				595	struct conversion
				596	{
				597	const char *pair;
				598	convert_f func;
				599	iconv_t fake_cd;
				600	};
				601	static const struct conversion conversion_tab[] = {
				602	{ "UTF-8/UTF-32LE", convert_utf8_utf32, (iconv_t)0 },
				603	{ "UTF-8/UTF-32BE", convert_utf8_utf32, (iconv_t)1 },
				604	{ "UTF-8/UTF-16LE", convert_utf8_utf16, (iconv_t)0 },
				605	{ "UTF-8/UTF-16BE", convert_utf8_utf16, (iconv_t)1 },
				606	{ "UTF-32LE/UTF-8", convert_utf32_utf8, (iconv_t)0 },
				607	{ "UTF-32BE/UTF-8", convert_utf32_utf8, (iconv_t)1 },
				608	{ "UTF-16LE/UTF-8", convert_utf16_utf8, (iconv_t)0 },
				609	{ "UTF-16BE/UTF-8", convert_utf16_utf8, (iconv_t)1 },
				610	};
				611
				612	/* Subroutine of cpp_init_iconv: initialize and return a
				613	cset_converter structure for conversion from FROM to TO. If
				614	iconv_open() fails, issue an error and return an identity
				615	converter. Silently return an identity converter if FROM and TO
				616	are identical. */
				617	static struct cset_converter
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	618	init_iconv_desc (cpp_reader pfile, const char to, const char *from)
				619	{
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	620	struct cset_converter ret;
				621	char *pair;
				622	size_t i;
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	623
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	624	if (!strcasecmp (to, from))
				625	{
				626	ret.func = convert_no_conversion;
				627	ret.cd = (iconv_t) -1;
				628	return ret;
				629	}
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	630
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	631	pair = alloca(strlen(to) + strlen(from) + 2);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	632
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	633	strcpy(pair, from);
				634	strcat(pair, "/");
				635	strcat(pair, to);
				636	for (i = 0; i < ARRAY_SIZE (conversion_tab); i++)
				637	if (!strcasecmp (pair, conversion_tab[i].pair))
				638	{
				639	ret.func = conversion_tab[i].func;
				640	ret.cd = conversion_tab[i].fake_cd;
				641	return ret;
				642	}
				643
				644	/* No custom converter - try iconv. */
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	645	if (HAVE_ICONV)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	646	{
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	647	ret.func = convert_using_iconv;
				648	ret.cd = iconv_open (to, from);
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	649
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	650	if (ret.cd == (iconv_t) -1)
				651	{
				652	if (errno == EINVAL)
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	653	cpp_error (pfile, CPP_DL_ERROR, /* FIXME should be DL_SORRY */
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	654	"conversion from %s to %s not supported by iconv",
				655	from, to);
				656	else
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	657	cpp_errno (pfile, CPP_DL_ERROR, "iconv_open");
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	658
				659	ret.func = convert_no_conversion;
				660	}
				661	}
				662	else
				663	{
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	664	cpp_error (pfile, CPP_DL_ERROR, /* FIXME: should be DL_SORRY */
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	665	"no iconv implementation, cannot convert from %s to %s",
				666	from, to);
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	667	ret.func = convert_no_conversion;
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	668	ret.cd = (iconv_t) -1;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	669	}
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	670	return ret;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	671	}
				672
				673	/* If charset conversion is requested, initialize iconv(3) descriptors
				674	for conversion from the source character set to the execution
				675	character sets. If iconv is not present in the C library, and
				676	conversion is requested, issue an error. */
				677
				678	void
				679	cpp_init_iconv (cpp_reader *pfile)
				680	{
				681	const char *ncset = CPP_OPTION (pfile, narrow_charset);
				682	const char *wcset = CPP_OPTION (pfile, wide_charset);
				683	const char *default_wcset;
				684
				685	bool be = CPP_OPTION (pfile, bytes_big_endian);
				686
				687	if (CPP_OPTION (pfile, wchar_precision) >= 32)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	688	default_wcset = be ? "UTF-32BE" : "UTF-32LE";
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	689	else if (CPP_OPTION (pfile, wchar_precision) >= 16)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	690	default_wcset = be ? "UTF-16BE" : "UTF-16LE";
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	691	else
				692	/* This effectively means that wide strings are not supported,
				693	so don't do any conversion at all. */
				694	default_wcset = SOURCE_CHARSET;
				695
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	696	if (!ncset)
				697	ncset = SOURCE_CHARSET;
				698	if (!wcset)
				699	wcset = default_wcset;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	700
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	701	pfile->narrow_cset_desc = init_iconv_desc (pfile, ncset, SOURCE_CHARSET);
				702	pfile->wide_cset_desc = init_iconv_desc (pfile, wcset, SOURCE_CHARSET);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	703	}
				704
Zack Weinberg	a29f62d	2004-09-18 00:56:19 +0000	[diff] [blame]	705	/* Destroy iconv(3) descriptors set up by cpp_init_iconv, if necessary. */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	706	void
				707	_cpp_destroy_iconv (cpp_reader *pfile)
				708	{
				709	if (HAVE_ICONV)
				710	{
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	711	if (pfile->narrow_cset_desc.func == convert_using_iconv)
				712	iconv_close (pfile->narrow_cset_desc.cd);
				713	if (pfile->wide_cset_desc.func == convert_using_iconv)
				714	iconv_close (pfile->wide_cset_desc.cd);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	715	}
				716	}
				717
Zack Weinberg	c5ff069	2005-02-20 17:01:32 +0000	[diff] [blame]	718	/* Utility routine for use by a full compiler. C is a character taken
				719	from the basic source character set, encoded in the host's
				720	execution encoding. Convert it to (the target's) execution
				721	encoding, and return that value.
				722
				723	Issues an internal error if C's representation in the narrow
				724	execution character set fails to be a single-byte value (C99
				725	5.2.1p3: "The representation of each member of the source and
				726	execution character sets shall fit in a byte.") May also issue an
				727	internal error if C fails to be a member of the basic source
				728	character set (testing this exactly is too hard, especially when
				729	the host character set is EBCDIC). */
				730	cppchar_t
				731	cpp_host_to_exec_charset (cpp_reader *pfile, cppchar_t c)
				732	{
				733	uchar sbuf[1];
				734	struct _cpp_strbuf tbuf;
				735
				736	/* This test is merely an approximation, but it suffices to catch
				737	the most important thing, which is that we don't get handed a
				738	character outside the unibyte range of the host character set. */
				739	if (c > LAST_POSSIBLY_BASIC_SOURCE_CHAR)
				740	{
				741	cpp_error (pfile, CPP_DL_ICE,
				742	"character 0x%lx is not in the basic source character set\n",
				743	(unsigned long)c);
				744	return 0;
				745	}
				746
				747	/* Being a character in the unibyte range of the host character set,
				748	we can safely splat it into a one-byte buffer and trust that that
				749	is a well-formed string. */
				750	sbuf[0] = c;
				751
				752	/* This should never need to reallocate, but just in case... */
				753	tbuf.asize = 1;
				754	tbuf.text = xmalloc (tbuf.asize);
				755	tbuf.len = 0;
				756
				757	if (!APPLY_CONVERSION (pfile->narrow_cset_desc, sbuf, 1, &tbuf))
				758	{
				759	cpp_errno (pfile, CPP_DL_ICE, "converting to execution character set");
				760	return 0;
				761	}
				762	if (tbuf.len != 1)
				763	{
				764	cpp_error (pfile, CPP_DL_ICE,
				765	"character 0x%lx is not unibyte in execution character set",
				766	(unsigned long)c);
				767	return 0;
				768	}
				769	c = tbuf.text[0];
				770	free(tbuf.text);
				771	return c;
				772	}
				773
				774
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	775
				776	/* Utility routine that computes a mask of the form 0000...111... with
				777	WIDTH 1-bits. */
				778	static inline size_t
				779	width_to_mask (size_t width)
				780	{
				781	width = MIN (width, BITS_PER_CPPCHAR_T);
				782	if (width >= CHAR_BIT * sizeof (size_t))
				783	return ~(size_t) 0;
				784	else
				785	return ((size_t) 1 << width) - 1;
				786	}
				787
/* Flag bits stored in the FLAGS field of the unicode character
   information table.  Each value is a distinct single bit so that
   they can be combined and tested with bitwise operators.  */
enum {
  /* Valid in a C99 identifier?  */
  C99 = 1,
  /* Valid in a C99 identifier, but not as the first character?  */
  DIG = 2,
  /* Valid in a C++ identifier?  */
  CXX = 4,
  /* NFC representation is not valid in an identifier?  */
  CID = 8,
  /* Might be valid NFC form?  */
  NFC = 16,
  /* Might be valid NFKC form?  */
  NKC = 32,
  /* Certain preceding characters might make it not valid NFC/NFKC form?  */
  CTX = 64
};
				805
				806	static const struct {
				807	/* Bitmap of flags above. */
				808	unsigned char flags;
				809	/* Combining class of the character. */
				810	unsigned char combine;
				811	/* Last character in the range described by this entry. */
				812	unsigned short end;
				813	} ucnranges[] = {
				814	#include "ucnid.h"
				815	};
				816
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	817	/* Returns 1 if C is valid in an identifier, 2 if C is valid except at
				818	the start of an identifier, and 0 if C is not valid in an
				819	identifier. We assume C has already gone through the checks of
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame^]	820	_cpp_valid_ucn. Also update NST for C if returning nonzero. The
				821	algorithm is a simple binary search on the table defined in
				822	ucnid.h. */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	823
				824	static int
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame^]	825	ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,
				826	struct normalize_state *nst)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	827	{
				828	int mn, mx, md;
				829
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame^]	830	if (c > 0xFFFF)
				831	return 0;
				832
				833	mn = 0;
				834	mx = ARRAY_SIZE (ucnranges) - 1;
				835	while (mx != mn)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	836	{
				837	md = (mn + mx) / 2;
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame^]	838	if (c <= ucnranges[md].end)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	839	mx = md;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	840	else
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame^]	841	mn = md + 1;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	842	}
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	843
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	844	/* When -pedantic, we require the character to have been listed by
				845	the standard for the current language. Otherwise, we accept the
				846	union of the acceptable sets for C++98 and C99. */
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame^]	847	if (! (ucnranges[mn].flags & (C99 \| CXX)))
				848	return 0;
				849
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	850	if (CPP_PEDANTIC (pfile)
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame^]	851	&& ((CPP_OPTION (pfile, c99) && !(ucnranges[mn].flags & C99))
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	852	\|\| (CPP_OPTION (pfile, cplusplus)
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame^]	853	&& !(ucnranges[mn].flags & CXX))))
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	854	return 0;
				855
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame^]	856	/* Update NST. */
				857	if (ucnranges[mn].combine != 0 && ucnranges[mn].combine < nst->prev_class)
				858	nst->level = normalized_none;
				859	else if (ucnranges[mn].flags & CTX)
				860	{
				861	bool safe;
				862	cppchar_t p = nst->previous;
				863
				864	/* Easy cases from Bengali, Oriya, Tamil, Jannada, and Malayalam. */
				865	if (c == 0x09BE)
				866	safe = p != 0x09C7; /* Use 09CB instead of 09C7 09BE. */
				867	else if (c == 0x0B3E)
				868	safe = p != 0x0B47; /* Use 0B4B instead of 0B47 0B3E. */
				869	else if (c == 0x0BBE)
				870	safe = p != 0x0BC6 && p != 0x0BC7; /* Use 0BCA/0BCB instead. */
				871	else if (c == 0x0CC2)
				872	safe = p != 0x0CC6; /* Use 0CCA instead of 0CC6 0CC2. */
				873	else if (c == 0x0D3E)
				874	safe = p != 0x0D46 && p != 0x0D47; /* Use 0D4A/0D4B instead. */
				875	/* For Hangul, characters in the range AC00-D7A3 are NFC/NFKC,
				876	and are combined algorithmically from a sequence of the form
				877	1100-1112 1161-1175 11A8-11C2
				878	(if the third is not present, it is treated as 11A7, which is not
				879	really a valid character).
				880	Unfortunately, C99 allows (only) the NFC form, but C++ allows
				881	only the combining characters. */
				882	else if (c >= 0x1161 && c <= 0x1175)
				883	safe = p < 0x1100 \|\| p > 0x1112;
				884	else if (c >= 0x11A8 && c <= 0x11C2)
				885	safe = (p < 0xAC00 \|\| p > 0xD7A3 \|\| (p - 0xAC00) % 28 != 0);
				886	else
				887	{
				888	/* Uh-oh, someone updated ucnid.h without updating this code. */
				889	cpp_error (pfile, CPP_DL_ICE, "Character %x might not be NFKC", c);
				890	safe = true;
				891	}
				892	if (!safe && c < 0x1161)
				893	nst->level = normalized_none;
				894	else if (!safe)
				895	nst->level = MAX (nst->level, normalized_identifier_C);
				896	}
				897	else if (ucnranges[mn].flags & NKC)
				898	;
				899	else if (ucnranges[mn].flags & NFC)
				900	nst->level = MAX (nst->level, normalized_C);
				901	else if (ucnranges[mn].flags & CID)
				902	nst->level = MAX (nst->level, normalized_identifier_C);
				903	else
				904	nst->level = normalized_none;
				905	nst->previous = c;
				906	nst->prev_class = ucnranges[mn].combine;
				907
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	908	/* In C99, UCN digits may not begin identifiers. */
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame^]	909	if (CPP_OPTION (pfile, c99) && (ucnranges[mn].flags & DIG))
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	910	return 2;
				911
				912	return 1;
				913	}
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	914
				915	/* [lex.charset]: The character designated by the universal character
				916	name \UNNNNNNNN is that character whose character short name in
				917	ISO/IEC 10646 is NNNNNNNN; the character designated by the
				918	universal character name \uNNNN is that character whose character
				919	short name in ISO/IEC 10646 is 0000NNNN. If the hexadecimal value
				920	for a universal character name is less than 0x20 or in the range
				921	0x7F-0x9F (inclusive), or if the universal character name
				922	designates a character in the basic source character set, then the
				923	program is ill-formed.
				924
				925	*PSTR must be preceded by "\u" or "\U"; it is assumed that the
				926	buffer end is delimited by a non-hex digit. Returns zero if UCNs
				927	are not part of the relevant standard, or if the string beginning
				928	at *PSTR doesn't syntactically match the form 'NNNN' or 'NNNNNNNN'.
				929
Kazu Hirata	6356f89	2003-06-12 19:01:08 +0000	[diff] [blame]	930	Otherwise the nonzero value of the UCN, whether valid or invalid,
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	931	is returned. Diagnostics are emitted for invalid values. PSTR
				932	is updated to point one beyond the UCN, or to the syntactically
				933	invalid character.
				934
				935	IDENTIFIER_POS is 0 when not in an identifier, 1 for the start of
Zack Weinberg	a29f62d	2004-09-18 00:56:19 +0000	[diff] [blame]	936	an identifier, or 2 otherwise. */
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	937
				938	cppchar_t
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	939	_cpp_valid_ucn (cpp_reader pfile, const uchar *pstr,
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame^]	940	const uchar *limit, int identifier_pos,
				941	struct normalize_state *nst)
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	942	{
				943	cppchar_t result, c;
				944	unsigned int length;
				945	const uchar str = pstr;
				946	const uchar *base = str - 2;
				947
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	948	if (!CPP_OPTION (pfile, cplusplus) && !CPP_OPTION (pfile, c99))
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	949	cpp_error (pfile, CPP_DL_WARNING,
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	950	"universal character names are only valid in C++ and C99");
				951	else if (CPP_WTRADITIONAL (pfile) && identifier_pos == 0)
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	952	cpp_error (pfile, CPP_DL_WARNING,
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	953	"the meaning of '\\%c' is different in traditional C",
				954	(int) str[-1]);
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	955
				956	if (str[-1] == 'u')
				957	length = 4;
				958	else if (str[-1] == 'U')
				959	length = 8;
				960	else
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame^]	961	{
				962	cpp_error (pfile, CPP_DL_ICE, "In _cpp_valid_ucn but not a UCN");
				963	length = 4;
				964	}
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	965
				966	result = 0;
				967	do
				968	{
				969	c = *str;
				970	if (!ISXDIGIT (c))
				971	break;
				972	str++;
				973	result = (result << 4) + hex_value (c);
				974	}
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	975	while (--length && str < limit);
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	976
				977	*pstr = str;
				978	if (length)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	979	{
				980	/* We'll error when we try it out as the start of an identifier. */
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	981	cpp_error (pfile, CPP_DL_ERROR,
				982	"incomplete universal character name %.*s",
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	983	(int) (str - base), base);
				984	result = 1;
				985	}
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	986	/* The standard permits $, @ and ` to be specified as UCNs. We use
				987	hex escapes so that this also works with EBCDIC hosts. */
				988	else if ((result < 0xa0
				989	&& (result != 0x24 && result != 0x40 && result != 0x60))
				990	\|\| (result & 0x80000000)
				991	\|\| (result >= 0xD800 && result <= 0xDFFF))
				992	{
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	993	cpp_error (pfile, CPP_DL_ERROR,
				994	"%.*s is not a valid universal character",
Andreas Jaeger	0e7a8c4	2003-04-21 14:06:12 +0200	[diff] [blame]	995	(int) (str - base), base);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	996	result = 1;
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	997	}
Geoffrey Keating	47e2049	2005-03-12 10:44:06 +0000	[diff] [blame]	998	else if (identifier_pos && result == 0x24
				999	&& CPP_OPTION (pfile, dollars_in_ident))
				1000	{
				1001	if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
				1002	{
				1003	CPP_OPTION (pfile, warn_dollars) = 0;
				1004	cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
				1005	}
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame^]	1006	NORMALIZE_STATE_UPDATE_IDNUM (nst);
Geoffrey Keating	47e2049	2005-03-12 10:44:06 +0000	[diff] [blame]	1007	}
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1008	else if (identifier_pos)
				1009	{
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame^]	1010	int validity = ucn_valid_in_identifier (pfile, result, nst);
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1011
				1012	if (validity == 0)
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1013	cpp_error (pfile, CPP_DL_ERROR,
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1014	"universal character %.*s is not valid in an identifier",
Andreas Jaeger	0e7a8c4	2003-04-21 14:06:12 +0200	[diff] [blame]	1015	(int) (str - base), base);
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1016	else if (validity == 2 && identifier_pos == 1)
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1017	cpp_error (pfile, CPP_DL_ERROR,
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1018	"universal character %.*s is not valid at the start of an identifier",
Andreas Jaeger	0e7a8c4	2003-04-21 14:06:12 +0200	[diff] [blame]	1019	(int) (str - base), base);
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1020	}
				1021
				1022	if (result == 0)
				1023	result = 1;
				1024
				1025	return result;
				1026	}
				1027
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1028	/* Convert an UCN, pointed to by FROM, to UTF-8 encoding, then translate
				1029	it to the execution character set and write the result into TBUF.
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1030	An advanced pointer is returned. Issues all relevant diagnostics. */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1031	static const uchar *
				1032	convert_ucn (cpp_reader pfile, const uchar from, const uchar *limit,
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	1033	struct _cpp_strbuf *tbuf, bool wide)
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1034	{
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1035	cppchar_t ucn;
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1036	uchar buf[6];
				1037	uchar *bufp = buf;
				1038	size_t bytesleft = 6;
				1039	int rval;
				1040	struct cset_converter cvt
				1041	= wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame^]	1042	struct normalize_state nst = INITIAL_NORMALIZE_STATE;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1043
Kazu Hirata	e0a21ab	2004-01-16 01:44:06 +0000	[diff] [blame]	1044	from++; /* Skip u/U. */
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame^]	1045	ucn = _cpp_valid_ucn (pfile, &from, limit, 0, &nst);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1046
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1047	rval = one_cppchar_to_utf8 (ucn, &bufp, &bytesleft);
				1048	if (rval)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1049	{
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1050	errno = rval;
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1051	cpp_errno (pfile, CPP_DL_ERROR,
				1052	"converting UCN to source character set");
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1053	}
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1054	else if (!APPLY_CONVERSION (cvt, buf, 6 - bytesleft, tbuf))
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1055	cpp_errno (pfile, CPP_DL_ERROR,
				1056	"converting UCN to execution character set");
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1057
				1058	return from;
				1059	}
				1060
Zack Weinberg	a29f62d	2004-09-18 00:56:19 +0000	[diff] [blame]	1061	/* Subroutine of convert_hex and convert_oct. N is the representation
				1062	in the execution character set of a numeric escape; write it into the
				1063	string buffer TBUF and update the end-of-string pointer therein. WIDE
				1064	is true if it's a wide string that's being assembled in TBUF. This
				1065	function issues no diagnostics and never fails. */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1066	static void
				1067	emit_numeric_escape (cpp_reader *pfile, cppchar_t n,
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	1068	struct _cpp_strbuf *tbuf, bool wide)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1069	{
				1070	if (wide)
				1071	{
				1072	/* We have to render this into the target byte order, which may not
				1073	be our byte order. */
				1074	bool bigend = CPP_OPTION (pfile, bytes_big_endian);
				1075	size_t width = CPP_OPTION (pfile, wchar_precision);
				1076	size_t cwidth = CPP_OPTION (pfile, char_precision);
				1077	size_t cmask = width_to_mask (cwidth);
				1078	size_t nbwc = width / cwidth;
				1079	size_t i;
				1080	size_t off = tbuf->len;
				1081	cppchar_t c;
				1082
				1083	if (tbuf->len + nbwc > tbuf->asize)
				1084	{
				1085	tbuf->asize += OUTBUF_BLOCK_SIZE;
				1086	tbuf->text = xrealloc (tbuf->text, tbuf->asize);
				1087	}
				1088
				1089	for (i = 0; i < nbwc; i++)
				1090	{
				1091	c = n & cmask;
				1092	n >>= cwidth;
				1093	tbuf->text[off + (bigend ? nbwc - i - 1 : i)] = c;
				1094	}
				1095	tbuf->len += nbwc;
				1096	}
				1097	else
				1098	{
Zack Weinberg	a29f62d	2004-09-18 00:56:19 +0000	[diff] [blame]	1099	/* Note: this code does not handle the case where the target
				1100	and host have a different number of bits in a byte. */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1101	if (tbuf->len + 1 > tbuf->asize)
				1102	{
				1103	tbuf->asize += OUTBUF_BLOCK_SIZE;
				1104	tbuf->text = xrealloc (tbuf->text, tbuf->asize);
				1105	}
				1106	tbuf->text[tbuf->len++] = n;
				1107	}
				1108	}
				1109
				1110	/* Convert a hexadecimal escape, pointed to by FROM, to the execution
				1111	character set and write it into the string buffer TBUF. Returns an
				1112	advanced pointer, and issues diagnostics as necessary.
				1113	No character set translation occurs; this routine always produces the
				1114	execution-set character with numeric value equal to the given hex
				1115	number. You can, e.g. generate surrogate pairs this way. */
				1116	static const uchar *
				1117	convert_hex (cpp_reader pfile, const uchar from, const uchar *limit,
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	1118	struct _cpp_strbuf *tbuf, bool wide)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1119	{
				1120	cppchar_t c, n = 0, overflow = 0;
				1121	int digits_found = 0;
				1122	size_t width = (wide ? CPP_OPTION (pfile, wchar_precision)
				1123	: CPP_OPTION (pfile, char_precision));
				1124	size_t mask = width_to_mask (width);
				1125
				1126	if (CPP_WTRADITIONAL (pfile))
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1127	cpp_error (pfile, CPP_DL_WARNING,
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1128	"the meaning of '\\x' is different in traditional C");
				1129
Kazu Hirata	e0a21ab	2004-01-16 01:44:06 +0000	[diff] [blame]	1130	from++; /* Skip 'x'. */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1131	while (from < limit)
				1132	{
				1133	c = *from;
				1134	if (! hex_p (c))
				1135	break;
				1136	from++;
				1137	overflow \|= n ^ (n << 4 >> 4);
				1138	n = (n << 4) + hex_value (c);
				1139	digits_found = 1;
				1140	}
				1141
				1142	if (!digits_found)
				1143	{
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1144	cpp_error (pfile, CPP_DL_ERROR,
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1145	"\\x used with no following hex digits");
				1146	return from;
				1147	}
				1148
				1149	if (overflow \| (n != (n & mask)))
				1150	{
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1151	cpp_error (pfile, CPP_DL_PEDWARN,
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1152	"hex escape sequence out of range");
				1153	n &= mask;
				1154	}
				1155
				1156	emit_numeric_escape (pfile, n, tbuf, wide);
				1157
				1158	return from;
				1159	}
				1160
				1161	/* Convert an octal escape, pointed to by FROM, to the execution
				1162	character set and write it into the string buffer TBUF. Returns an
				1163	advanced pointer, and issues diagnostics as necessary.
				1164	No character set translation occurs; this routine always produces the
				1165	execution-set character with numeric value equal to the given octal
				1166	number. */
				1167	static const uchar *
				1168	convert_oct (cpp_reader pfile, const uchar from, const uchar *limit,
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	1169	struct _cpp_strbuf *tbuf, bool wide)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1170	{
				1171	size_t count = 0;
				1172	cppchar_t c, n = 0;
				1173	size_t width = (wide ? CPP_OPTION (pfile, wchar_precision)
				1174	: CPP_OPTION (pfile, char_precision));
				1175	size_t mask = width_to_mask (width);
				1176	bool overflow = false;
				1177
				1178	while (from < limit && count++ < 3)
				1179	{
				1180	c = *from;
				1181	if (c < '0' \|\| c > '7')
				1182	break;
				1183	from++;
				1184	overflow \|= n ^ (n << 3 >> 3);
				1185	n = (n << 3) + c - '0';
				1186	}
				1187
				1188	if (n != (n & mask))
				1189	{
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1190	cpp_error (pfile, CPP_DL_PEDWARN,
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1191	"octal escape sequence out of range");
				1192	n &= mask;
				1193	}
				1194
				1195	emit_numeric_escape (pfile, n, tbuf, wide);
				1196
				1197	return from;
				1198	}
				1199
				1200	/* Convert an escape sequence (pointed to by FROM) to its value on
				1201	the target, and to the execution character set. Do not scan past
				1202	LIMIT. Write the converted value into TBUF. Returns an advanced
				1203	pointer. Handles all relevant diagnostics. */
				1204	static const uchar *
				1205	convert_escape (cpp_reader pfile, const uchar from, const uchar *limit,
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	1206	struct _cpp_strbuf *tbuf, bool wide)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1207	{
				1208	/* Values of \a \b \e \f \n \r \t \v respectively. */
				1209	#if HOST_CHARSET == HOST_CHARSET_ASCII
				1210	static const uchar charconsts[] = { 7, 8, 27, 12, 10, 13, 9, 11 };
				1211	#elif HOST_CHARSET == HOST_CHARSET_EBCDIC
				1212	static const uchar charconsts[] = { 47, 22, 39, 12, 21, 13, 5, 11 };
				1213	#else
				1214	#error "unknown host character set"
				1215	#endif
				1216
				1217	uchar c;
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1218	struct cset_converter cvt
				1219	= wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1220
				1221	c = *from;
				1222	switch (c)
				1223	{
				1224	/* UCNs, hex escapes, and octal escapes are processed separately. */
				1225	case 'u': case 'U':
				1226	return convert_ucn (pfile, from, limit, tbuf, wide);
				1227
				1228	case 'x':
				1229	return convert_hex (pfile, from, limit, tbuf, wide);
				1230	break;
				1231
				1232	case '0': case '1': case '2': case '3':
				1233	case '4': case '5': case '6': case '7':
				1234	return convert_oct (pfile, from, limit, tbuf, wide);
				1235
				1236	/* Various letter escapes. Get the appropriate host-charset
				1237	value into C. */
				1238	case '\\': case '\'': case '"': case '?': break;
				1239
				1240	case '(': case '{': case '[': case '%':
				1241	/* '\(', etc, can be used at the beginning of a line in a long
				1242	string split onto multiple lines with \-newline, to prevent
				1243	Emacs or other text editors from getting confused. '\%' can
				1244	be used to prevent SCCS from mangling printf format strings. */
				1245	if (CPP_PEDANTIC (pfile))
				1246	goto unknown;
				1247	break;
				1248
				1249	case 'b': c = charconsts[1]; break;
				1250	case 'f': c = charconsts[3]; break;
				1251	case 'n': c = charconsts[4]; break;
				1252	case 'r': c = charconsts[5]; break;
				1253	case 't': c = charconsts[6]; break;
				1254	case 'v': c = charconsts[7]; break;
				1255
				1256	case 'a':
				1257	if (CPP_WTRADITIONAL (pfile))
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1258	cpp_error (pfile, CPP_DL_WARNING,
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1259	"the meaning of '\\a' is different in traditional C");
				1260	c = charconsts[0];
				1261	break;
				1262
				1263	case 'e': case 'E':
				1264	if (CPP_PEDANTIC (pfile))
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1265	cpp_error (pfile, CPP_DL_PEDWARN,
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1266	"non-ISO-standard escape sequence, '\\%c'", (int) c);
				1267	c = charconsts[2];
				1268	break;
				1269
				1270	default:
				1271	unknown:
				1272	if (ISGRAPH (c))
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1273	cpp_error (pfile, CPP_DL_PEDWARN,
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1274	"unknown escape sequence '\\%c'", (int) c);
				1275	else
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1276	cpp_error (pfile, CPP_DL_PEDWARN,
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1277	"unknown escape sequence: '\\%03o'", (int) c);
				1278	}
				1279
				1280	/* Now convert what we have to the execution character set. */
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1281	if (!APPLY_CONVERSION (cvt, &c, 1, tbuf))
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1282	cpp_errno (pfile, CPP_DL_ERROR,
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1283	"converting escape sequence to execution character set");
				1284
				1285	return from + 1;
				1286	}
				1287
				1288	/* FROM is an array of cpp_string structures of length COUNT. These
				1289	are to be converted from the source to the execution character set,
				1290	escape sequences translated, and finally all are to be
				1291	concatenated. WIDE indicates whether or not to produce a wide
				1292	string. The result is written into TO. Returns true for success,
				1293	false for failure. */
				1294	bool
				1295	cpp_interpret_string (cpp_reader pfile, const cpp_string from, size_t count,
				1296	cpp_string *to, bool wide)
				1297	{
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	1298	struct _cpp_strbuf tbuf;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1299	const uchar p, base, *limit;
				1300	size_t i;
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1301	struct cset_converter cvt
				1302	= wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1303
				1304	tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, from->len);
				1305	tbuf.text = xmalloc (tbuf.asize);
				1306	tbuf.len = 0;
				1307
				1308	for (i = 0; i < count; i++)
				1309	{
				1310	p = from[i].text;
				1311	if (*p == 'L') p++;
Kazu Hirata	e0a21ab	2004-01-16 01:44:06 +0000	[diff] [blame]	1312	p++; /* Skip leading quote. */
				1313	limit = from[i].text + from[i].len - 1; /* Skip trailing quote. */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1314
				1315	for (;;)
				1316	{
				1317	base = p;
				1318	while (p < limit && *p != '\\')
				1319	p++;
				1320	if (p > base)
				1321	{
				1322	/* We have a run of normal characters; these can be fed
				1323	directly to convert_cset. */
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1324	if (!APPLY_CONVERSION (cvt, base, p - base, &tbuf))
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1325	goto fail;
				1326	}
				1327	if (p == limit)
				1328	break;
				1329
				1330	p = convert_escape (pfile, p + 1, limit, &tbuf, wide);
				1331	}
				1332	}
				1333	/* NUL-terminate the 'to' buffer and translate it to a cpp_string
				1334	structure. */
				1335	emit_numeric_escape (pfile, 0, &tbuf, wide);
				1336	tbuf.text = xrealloc (tbuf.text, tbuf.len);
				1337	to->text = tbuf.text;
				1338	to->len = tbuf.len;
				1339	return true;
				1340
				1341	fail:
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1342	cpp_errno (pfile, CPP_DL_ERROR, "converting to execution character set");
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1343	free (tbuf.text);
				1344	return false;
				1345	}
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1346
				1347	/* Subroutine of do_line and do_linemarker. Convert escape sequences
				1348	in a string, but do not perform character set conversion. */
				1349	bool
Eric Christopher	423e95e	2004-02-12 02:25:03 +0000	[diff] [blame]	1350	cpp_interpret_string_notranslate (cpp_reader pfile, const cpp_string from,
				1351	size_t count, cpp_string *to, bool wide)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1352	{
				1353	struct cset_converter save_narrow_cset_desc = pfile->narrow_cset_desc;
				1354	bool retval;
				1355
				1356	pfile->narrow_cset_desc.func = convert_no_conversion;
				1357	pfile->narrow_cset_desc.cd = (iconv_t) -1;
				1358
Eric Christopher	423e95e	2004-02-12 02:25:03 +0000	[diff] [blame]	1359	retval = cpp_interpret_string (pfile, from, count, to, wide);
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1360
				1361	pfile->narrow_cset_desc = save_narrow_cset_desc;
				1362	return retval;
				1363	}
				1364
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1365
				1366	/* Subroutine of cpp_interpret_charconst which performs the conversion
				1367	to a number, for narrow strings. STR is the string structure returned
				1368	by cpp_interpret_string. PCHARS_SEEN and UNSIGNEDP are as for
				1369	cpp_interpret_charconst. */
				1370	static cppchar_t
				1371	narrow_str_to_charconst (cpp_reader *pfile, cpp_string str,
				1372	unsigned int pchars_seen, int unsignedp)
				1373	{
				1374	size_t width = CPP_OPTION (pfile, char_precision);
				1375	size_t max_chars = CPP_OPTION (pfile, int_precision) / width;
				1376	size_t mask = width_to_mask (width);
				1377	size_t i;
				1378	cppchar_t result, c;
				1379	bool unsigned_p;
				1380
				1381	/* The value of a multi-character character constant, or a
				1382	single-character character constant whose representation in the
				1383	execution character set is more than one byte long, is
				1384	implementation defined. This implementation defines it to be the
				1385	number formed by interpreting the byte sequence in memory as a
				1386	big-endian binary number. If overflow occurs, the high bytes are
				1387	lost, and a warning is issued.
				1388
				1389	We don't want to process the NUL terminator handed back by
				1390	cpp_interpret_string. */
				1391	result = 0;
				1392	for (i = 0; i < str.len - 1; i++)
				1393	{
				1394	c = str.text[i] & mask;
				1395	if (width < BITS_PER_CPPCHAR_T)
				1396	result = (result << width) \| c;
				1397	else
				1398	result = c;
				1399	}
				1400
				1401	if (i > max_chars)
				1402	{
				1403	i = max_chars;
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1404	cpp_error (pfile, CPP_DL_WARNING,
				1405	"character constant too long for its type");
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1406	}
				1407	else if (i > 1 && CPP_OPTION (pfile, warn_multichar))
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1408	cpp_error (pfile, CPP_DL_WARNING, "multi-character character constant");
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1409
				1410	/* Multichar constants are of type int and therefore signed. */
				1411	if (i > 1)
				1412	unsigned_p = 0;
				1413	else
				1414	unsigned_p = CPP_OPTION (pfile, unsigned_char);
				1415
				1416	/* Truncate the constant to its natural width, and simultaneously
				1417	sign- or zero-extend to the full width of cppchar_t.
				1418	For single-character constants, the value is WIDTH bits wide.
				1419	For multi-character constants, the value is INT_PRECISION bits wide. */
				1420	if (i > 1)
				1421	width = CPP_OPTION (pfile, int_precision);
				1422	if (width < BITS_PER_CPPCHAR_T)
				1423	{
				1424	mask = ((cppchar_t) 1 << width) - 1;
				1425	if (unsigned_p \|\| !(result & (1 << (width - 1))))
				1426	result &= mask;
				1427	else
				1428	result \|= ~mask;
				1429	}
				1430	*pchars_seen = i;
				1431	*unsignedp = unsigned_p;
				1432	return result;
				1433	}
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	1434
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1435	/* Subroutine of cpp_interpret_charconst which performs the conversion
				1436	to a number, for wide strings. STR is the string structure returned
				1437	by cpp_interpret_string. PCHARS_SEEN and UNSIGNEDP are as for
				1438	cpp_interpret_charconst. */
				1439	static cppchar_t
				1440	wide_str_to_charconst (cpp_reader *pfile, cpp_string str,
				1441	unsigned int pchars_seen, int unsignedp)
				1442	{
				1443	bool bigend = CPP_OPTION (pfile, bytes_big_endian);
				1444	size_t width = CPP_OPTION (pfile, wchar_precision);
				1445	size_t cwidth = CPP_OPTION (pfile, char_precision);
				1446	size_t mask = width_to_mask (width);
				1447	size_t cmask = width_to_mask (cwidth);
				1448	size_t nbwc = width / cwidth;
				1449	size_t off, i;
				1450	cppchar_t result = 0, c;
				1451
				1452	/* This is finicky because the string is in the target's byte order,
				1453	which may not be our byte order. Only the last character, ignoring
				1454	the NUL terminator, is relevant. */
				1455	off = str.len - (nbwc * 2);
				1456	result = 0;
				1457	for (i = 0; i < nbwc; i++)
				1458	{
				1459	c = bigend ? str.text[off + i] : str.text[off + nbwc - i - 1];
				1460	result = (result << cwidth) \| (c & cmask);
				1461	}
				1462
				1463	/* Wide character constants have type wchar_t, and a single
				1464	character exactly fills a wchar_t, so a multi-character wide
				1465	character constant is guaranteed to overflow. */
				1466	if (off > 0)
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1467	cpp_error (pfile, CPP_DL_WARNING,
				1468	"character constant too long for its type");
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1469
				1470	/* Truncate the constant to its natural width, and simultaneously
				1471	sign- or zero-extend to the full width of cppchar_t. */
				1472	if (width < BITS_PER_CPPCHAR_T)
				1473	{
				1474	if (CPP_OPTION (pfile, unsigned_wchar) \|\| !(result & (1 << (width - 1))))
				1475	result &= mask;
				1476	else
				1477	result \|= ~mask;
				1478	}
				1479
				1480	*unsignedp = CPP_OPTION (pfile, unsigned_wchar);
				1481	*pchars_seen = 1;
				1482	return result;
				1483	}
				1484
				1485	/* Interpret a (possibly wide) character constant in TOKEN.
				1486	PCHARS_SEEN points to a variable that is filled in with the number
				1487	of characters seen, and UNSIGNEDP to a variable that indicates
				1488	whether the result has signed type. */
				1489	cppchar_t
				1490	cpp_interpret_charconst (cpp_reader pfile, const cpp_token token,
				1491	unsigned int pchars_seen, int unsignedp)
				1492	{
				1493	cpp_string str = { 0, 0 };
				1494	bool wide = (token->type == CPP_WCHAR);
				1495	cppchar_t result;
				1496
				1497	/* an empty constant will appear as L'' or '' */
				1498	if (token->val.str.len == (size_t) (2 + wide))
				1499	{
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1500	cpp_error (pfile, CPP_DL_ERROR, "empty character constant");
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1501	return 0;
				1502	}
				1503	else if (!cpp_interpret_string (pfile, &token->val.str, 1, &str, wide))
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1504	return 0;
				1505
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1506	if (wide)
				1507	result = wide_str_to_charconst (pfile, str, pchars_seen, unsignedp);
				1508	else
				1509	result = narrow_str_to_charconst (pfile, str, pchars_seen, unsignedp);
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1510
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1511	if (str.text != token->val.str.text)
				1512	free ((void *)str.text);
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1513
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1514	return result;
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1515	}
Geoffrey Keating	47e2049	2005-03-12 10:44:06 +0000	[diff] [blame]	1516
				1517	/* Convert an identifier denoted by ID and LEN, which might contain
				1518	UCN escapes, to the source character set, either UTF-8 or
				1519	UTF-EBCDIC. Assumes that the identifier is actually a valid identifier. */
				1520	cpp_hashnode *
				1521	_cpp_interpret_identifier (cpp_reader pfile, const uchar id, size_t len)
				1522	{
				1523	/* It turns out that a UCN escape always turns into fewer characters
				1524	than the escape itself, so we can allocate a temporary in advance. */
				1525	uchar * buf = alloca (len + 1);
				1526	uchar * bufp = buf;
				1527	size_t idp;
				1528
				1529	for (idp = 0; idp < len; idp++)
				1530	if (id[idp] != '\\')
				1531	*bufp++ = id[idp];
				1532	else
				1533	{
				1534	unsigned length = id[idp+1] == 'u' ? 4 : 8;
				1535	cppchar_t value = 0;
				1536	size_t bufleft = len - (bufp - buf);
				1537	int rval;
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	1538
Geoffrey Keating	47e2049	2005-03-12 10:44:06 +0000	[diff] [blame]	1539	idp += 2;
				1540	while (length && idp < len && ISXDIGIT (id[idp]))
				1541	{
				1542	value = (value << 4) + hex_value (id[idp]);
				1543	idp++;
				1544	length--;
				1545	}
				1546	idp--;
				1547
				1548	/* Special case for EBCDIC: if the identifier contains
				1549	a '$' specified using a UCN, translate it to EBCDIC. */
				1550	if (value == 0x24)
				1551	{
				1552	*bufp++ = '$';
				1553	continue;
				1554	}
				1555
				1556	rval = one_cppchar_to_utf8 (value, &bufp, &bufleft);
				1557	if (rval)
				1558	{
				1559	errno = rval;
				1560	cpp_errno (pfile, CPP_DL_ERROR,
				1561	"converting UCN to source character set");
				1562	break;
				1563	}
				1564	}
				1565
				1566	return CPP_HASHNODE (ht_lookup (pfile->hash_table,
				1567	buf, bufp - buf, HT_ALLOC));
				1568	}
				1569
Zack Weinberg	a29f62d	2004-09-18 00:56:19 +0000	[diff] [blame]	1570	/* Convert an input buffer (containing the complete contents of one
				1571	source file) from INPUT_CHARSET to the source character set. INPUT
				1572	points to the input buffer, SIZE is its allocated size, and LEN is
				1573	the length of the meaningful data within the buffer. The
				1574	translated buffer is returned, and *ST_SIZE is set to the length of
				1575	the meaningful data within the translated buffer.
				1576
				1577	INPUT is expected to have been allocated with xmalloc. This function
				1578	will either return INPUT, or free it and return a pointer to another
				1579	xmalloc-allocated block of memory. */
				1580	uchar *
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	1581	_cpp_convert_input (cpp_reader pfile, const char input_charset,
				1582	uchar input, size_t size, size_t len, off_t st_size)
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	1583	{
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	1584	struct cset_converter input_cset;
				1585	struct _cpp_strbuf to;
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	1586
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	1587	input_cset = init_iconv_desc (pfile, SOURCE_CHARSET, input_charset);
				1588	if (input_cset.func == convert_no_conversion)
				1589	{
				1590	to.text = input;
				1591	to.asize = size;
				1592	to.len = len;
				1593	}
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	1594	else
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	1595	{
				1596	to.asize = MAX (65536, len);
				1597	to.text = xmalloc (to.asize);
				1598	to.len = 0;
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	1599
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	1600	if (!APPLY_CONVERSION (input_cset, input, len, &to))
				1601	cpp_error (pfile, CPP_DL_ERROR,
				1602	"failure to convert %s to %s",
				1603	CPP_OPTION (pfile, input_charset), SOURCE_CHARSET);
				1604
				1605	free (input);
				1606	}
				1607
				1608	/* Clean up the mess. */
				1609	if (input_cset.func == convert_using_iconv)
				1610	iconv_close (input_cset.cd);
				1611
				1612	/* Resize buffer if we allocated substantially too much, or if we
				1613	haven't enough space for the \n-terminator. */
				1614	if (to.len + 4096 < to.asize \|\| to.len >= to.asize)
				1615	to.text = xrealloc (to.text, to.len + 1);
				1616
Devang Patel	04c90ee	2005-02-19 11:48:02 -0800	[diff] [blame]	1617	/* If the file is using old-school Mac line endings (\r only),
				1618	terminate with another \r, not an \n, so that we do not mistake
				1619	the \r\n sequence for a single DOS line ending and erroneously
				1620	issue the "No newline at end of file" diagnostic. */
				1621	if (to.text[to.len - 1] == '\r')
				1622	to.text[to.len] = '\r';
				1623	else
				1624	to.text[to.len] = '\n';
				1625
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	1626	*st_size = to.len;
				1627	return to.text;
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	1628	}
				1629
Zack Weinberg	a29f62d	2004-09-18 00:56:19 +0000	[diff] [blame]	1630	/* Decide on the default encoding to assume for input files. */
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	1631	const char *
				1632	_cpp_default_encoding (void)
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	1633	{
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	1634	const char *current_encoding = NULL;
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	1635
Paolo Bonzini	4f4e53dd	2004-05-24 10:50:45 +0000	[diff] [blame]	1636	/* We disable this because the default codeset is 7-bit ASCII on
				1637	most platforms, and this causes conversion failures on every
				1638	file in GCC that happens to have one of the upper 128 characters
				1639	in it -- most likely, as part of the name of a contributor.
				1640	We should definitely recognize in-band markers of file encoding,
				1641	like:
				1642	- the appropriate Unicode byte-order mark (FE FF) to recognize
				1643	UTF16 and UCS4 (in both big-endian and little-endian flavors)
				1644	and UTF8
Zack Weinberg	c6e8380	2004-06-05 20:58:06 +0000	[diff] [blame]	1645	- a "#i", "#d", "/ *", "//", " #p" or "#p" (for #pragma) to
Paolo Bonzini	4f4e53dd	2004-05-24 10:50:45 +0000	[diff] [blame]	1646	distinguish ASCII and EBCDIC.
				1647	- now we can parse something like "#pragma GCC encoding <xyz>
				1648	on the first line, or even Emacs/VIM's mode line tags (there's
				1649	a problem here in that VIM uses the last line, and Emacs has
Zack Weinberg	a29f62d	2004-09-18 00:56:19 +0000	[diff] [blame]	1650	its more elaborate "local variables" convention).
Paolo Bonzini	4f4e53dd	2004-05-24 10:50:45 +0000	[diff] [blame]	1651	- investigate whether Java has another common convention, which
				1652	would be friendly to support.
				1653	(Zack Weinberg and Paolo Bonzini, May 20th 2004) */
				1654	#if defined (HAVE_LOCALE_H) && defined (HAVE_LANGINFO_CODESET) && 0
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	1655	setlocale (LC_CTYPE, "");
				1656	current_encoding = nl_langinfo (CODESET);
				1657	#endif
				1658	if (current_encoding == NULL \|\| *current_encoding == '\0')
				1659	current_encoding = SOURCE_CHARSET;
				1660
				1661	return current_encoding;
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	1662	}