Blame - libcpp/charset.c - toolchain/gcc

blob: bd24ec2490d1a130e8c36479b485161dd030adf8 [file] [log] [blame]

Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1	/* CPP Library - charsets
Jakub Jelinek	748086b	2009-04-09 17:00:19 +0200	[diff] [blame]	2	Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2006, 2008, 2009
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	3	Free Software Foundation, Inc.
				4
				5	Broken out of c-lex.c Apr 2003, adding valid C99 UCN ranges.
				6
				7	This program is free software; you can redistribute it and/or modify it
				8	under the terms of the GNU General Public License as published by the
Jakub Jelinek	748086b	2009-04-09 17:00:19 +0200	[diff] [blame]	9	Free Software Foundation; either version 3, or (at your option) any
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	10	later version.
				11
				12	This program is distributed in the hope that it will be useful,
				13	but WITHOUT ANY WARRANTY; without even the implied warranty of
				14	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
				15	GNU General Public License for more details.
				16
				17	You should have received a copy of the GNU General Public License
Jakub Jelinek	748086b	2009-04-09 17:00:19 +0200	[diff] [blame]	18	along with this program; see the file COPYING3. If not see
				19	<http://www.gnu.org/licenses/>. */
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	20
				21	#include "config.h"
				22	#include "system.h"
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	23	#include "cpplib.h"
Paolo Bonzini	4f4e53dd	2004-05-24 10:50:45 +0000	[diff] [blame]	24	#include "internal.h"
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	25
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	26	/* Character set handling for C-family languages.
				27
				28	Terminological note: In what follows, "charset" or "character set"
				29	will be taken to mean both an abstract set of characters and an
				30	encoding for that set.
				31
				32	The C99 standard discusses two character sets: source and execution.
				33	The source character set is used for internal processing in translation
				34	phases 1 through 4; the execution character set is used thereafter.
				35	Both are required by 5.2.1.2p1 to be multibyte encodings, not wide
				36	character encodings (see 3.7.2, 3.7.3 for the standardese meanings
				37	of these terms). Furthermore, the "basic character set" (listed in
				38	5.2.1p3) is to be encoded in each with values one byte wide, and is
				39	to appear in the initial shift state.
				40
				41	It is not explicitly mentioned, but there is also a "wide execution
				42	character set" used to encode wide character constants and wide
				43	string literals; this is supposed to be the result of applying the
				44	standard library function mbstowcs() to an equivalent narrow string
				45	(6.4.5p5). However, the behavior of hexadecimal and octal
				46	\-escapes is at odds with this; they are supposed to be translated
				47	directly to wchar_t values (6.4.4.4p5,6).
				48
				49	The source character set is not necessarily the character set used
				50	to encode physical source files on disk; translation phase 1 converts
				51	from whatever that encoding is to the source character set.
				52
				53	The presence of universal character names in C99 (6.4.3 et seq.)
				54	forces the source character set to be isomorphic to ISO 10646,
				55	that is, Unicode. There is no such constraint on the execution
				56	character set; note also that the conversion from source to
				57	execution character set does not occur for identifiers (5.1.1.2p1#5).
				58
				59	For convenience of implementation, the source character set's
				60	encoding of the basic character set should be identical to the
				61	execution character set OF THE HOST SYSTEM's encoding of the basic
				62	character set, and it should not be a state-dependent encoding.
				63
				64	cpplib uses UTF-8 or UTF-EBCDIC for the source character set,
				65	depending on whether the host is based on ASCII or EBCDIC (see
				66	respectively Unicode section 2.3/ISO10646 Amendment 2, and Unicode
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	67	Technical Report #16). With limited exceptions, it relies on the
				68	system library's iconv() primitive to do charset conversion
				69	(specified in SUSv2). */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	70
				71	#if !HAVE_ICONV
				72	/* Make certain that the uses of iconv(), iconv_open(), iconv_close()
				73	below, which are guarded only by if statements with compile-time
				74	constant conditions, do not cause link errors. */
				75	#define iconv_open(x, y) (errno = EINVAL, (iconv_t)-1)
Zack Weinberg	f1c4bc4	2003-07-05 16:44:29 +0200	[diff] [blame]	76	#define iconv(a,b,c,d,e) (errno = EINVAL, (size_t)-1)
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	77	#define iconv_close(x) (void)0
Andrew Pinski	5beadb3	2003-07-07 04:46:29 +0000	[diff] [blame]	78	#define ICONV_CONST
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	79	#endif
				80
				81	#if HOST_CHARSET == HOST_CHARSET_ASCII
				82	#define SOURCE_CHARSET "UTF-8"
Zack Weinberg	c5ff069	2005-02-20 17:01:32 +0000	[diff] [blame]	83	#define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0x7e
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	84	#elif HOST_CHARSET == HOST_CHARSET_EBCDIC
				85	#define SOURCE_CHARSET "UTF-EBCDIC"
Zack Weinberg	c5ff069	2005-02-20 17:01:32 +0000	[diff] [blame]	86	#define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0xFF
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	87	#else
				88	#error "Unrecognized basic host character set"
				89	#endif
				90
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	91	#ifndef EILSEQ
				92	#define EILSEQ EINVAL
				93	#endif
				94
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	95	/* This structure is used for a resizable string buffer throughout. */
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	96	/* Don't call it strbuf, as that conflicts with unistd.h on systems
Kazu Hirata	0ee55ad	2003-10-05 13:09:48 +0000	[diff] [blame]	97	such as DYNIX/ptx where unistd.h includes stropts.h. */
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	98	struct _cpp_strbuf
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	99	{
				100	uchar *text;
				101	size_t asize;
				102	size_t len;
				103	};
				104
				105	/* This is enough to hold any string that fits on a single 80-column
				106	line, even if iconv quadruples its size (e.g. conversion from
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	107	ASCII to UTF-32) rounded up to a power of two. */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	108	#define OUTBUF_BLOCK_SIZE 256
				109
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	110	/* Conversions between UTF-8 and UTF-16/32 are implemented by custom
				111	logic. This is because a depressing number of systems lack iconv,
				112	or have iconv libraries that do not do these conversions, so
				113	we need a fallback implementation for them. To ensure the fallback
				114	doesn't break due to neglect, it is used on all systems.
				115
				116	UTF-32 encoding is nice and simple: a four-byte binary number,
				117	constrained to the range 00000000-7FFFFFFF to avoid questions of
				118	signedness. We do have to cope with big- and little-endian
				119	variants.
				120
				121	UTF-16 encoding uses two-byte binary numbers, again in big- and
				122	little-endian variants, for all values in the 00000000-0000FFFF
				123	range. Values in the 00010000-0010FFFF range are encoded as pairs
				124	of two-byte numbers, called "surrogate pairs": given a number S in
				125	this range, it is mapped to a pair (H, L) as follows:
				126
				127	H = (S - 0x10000) / 0x400 + 0xD800
				128	L = (S - 0x10000) % 0x400 + 0xDC00
				129
				130	Two-byte values in the D800...DFFF range are ill-formed except as a
				131	component of a surrogate pair. Even if the encoding within a
				132	two-byte value is little-endian, the H member of the surrogate pair
				133	comes first.
				134
				135	There is no way to encode values in the 00110000-7FFFFFFF range,
				136	which is not currently a problem as there are no assigned code
				137	points in that range; however, the author expects that it will
				138	eventually become necessary to abandon UTF-16 due to this
				139	limitation. Note also that, because of these pairs, UTF-16 does
				140	not meet the requirements of the C standard for a wide character
				141	encoding (see 3.7.3 and 6.4.4.4p11).
				142
				143	UTF-8 encoding looks like this:
				144
				145	value range encoded as
				146	00000000-0000007F 0xxxxxxx
				147	00000080-000007FF 110xxxxx 10xxxxxx
				148	00000800-0000FFFF 1110xxxx 10xxxxxx 10xxxxxx
				149	00010000-001FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
				150	00200000-03FFFFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
				151	04000000-7FFFFFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
				152
				153	Values in the 0000D800 ... 0000DFFF range (surrogates) are invalid,
				154	which means that three-byte sequences ED xx yy, with A0 <= xx <= BF,
				155	never occur. Note also that any value that can be encoded by a
				156	given row of the table can also be encoded by all successive rows,
				157	but this is not done; only the shortest possible encoding for any
				158	given value is valid. For instance, the character 07C0 could be
				159	encoded as any of DF 80, E0 9F 80, F0 80 9F 80, F8 80 80 9F 80, or
				160	FC 80 80 80 9F 80. Only the first is valid.
				161
				162	An implementation note: the transformation from UTF-16 to UTF-8, or
				163	vice versa, is easiest done by using UTF-32 as an intermediary. */
				164
				165	/* Internal primitives which go from an UTF-8 byte stream to native-endian
				166	UTF-32 in a cppchar_t, or vice versa; this avoids an extra marshal/unmarshal
				167	operation in several places below. */
				168	static inline int
				169	one_utf8_to_cppchar (const uchar *inbufp, size_t inbytesleftp,
				170	cppchar_t *cp)
				171	{
Joseph Myers	9e322bc	2009-05-03 12:59:26 +0100	[diff] [blame]	172	static const uchar masks[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x03, 0x01 };
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	173	static const uchar patns[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	174
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	175	cppchar_t c;
				176	const uchar inbuf = inbufp;
				177	size_t nbytes, i;
				178
				179	if (*inbytesleftp < 1)
				180	return EINVAL;
				181
				182	c = *inbuf;
				183	if (c < 0x80)
				184	{
				185	*cp = c;
				186	*inbytesleftp -= 1;
				187	*inbufp += 1;
				188	return 0;
				189	}
				190
				191	/* The number of leading 1-bits in the first byte indicates how many
				192	bytes follow. */
				193	for (nbytes = 2; nbytes < 7; nbytes++)
				194	if ((c & ~masks[nbytes-1]) == patns[nbytes-1])
				195	goto found;
				196	return EILSEQ;
				197	found:
				198
				199	if (*inbytesleftp < nbytes)
				200	return EINVAL;
				201
				202	c = (c & masks[nbytes-1]);
				203	inbuf++;
				204	for (i = 1; i < nbytes; i++)
				205	{
				206	cppchar_t n = *inbuf++;
				207	if ((n & 0xC0) != 0x80)
				208	return EILSEQ;
				209	c = ((c << 6) + (n & 0x3F));
				210	}
				211
				212	/* Make sure the shortest possible encoding was used. */
				213	if (c <= 0x7F && nbytes > 1) return EILSEQ;
				214	if (c <= 0x7FF && nbytes > 2) return EILSEQ;
				215	if (c <= 0xFFFF && nbytes > 3) return EILSEQ;
				216	if (c <= 0x1FFFFF && nbytes > 4) return EILSEQ;
				217	if (c <= 0x3FFFFFF && nbytes > 5) return EILSEQ;
				218
				219	/* Make sure the character is valid. */
				220	if (c > 0x7FFFFFFF \|\| (c >= 0xD800 && c <= 0xDFFF)) return EILSEQ;
				221
				222	*cp = c;
				223	*inbufp = inbuf;
				224	*inbytesleftp -= nbytes;
				225	return 0;
				226	}
				227
				228	static inline int
				229	one_cppchar_to_utf8 (cppchar_t c, uchar *outbufp, size_t outbytesleftp)
				230	{
				231	static const uchar masks[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
				232	static const uchar limits[6] = { 0x80, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE };
				233	size_t nbytes;
				234	uchar buf[6], *p = &buf[6];
				235	uchar outbuf = outbufp;
				236
				237	nbytes = 1;
				238	if (c < 0x80)
				239	*--p = c;
				240	else
				241	{
				242	do
				243	{
				244	*--p = ((c & 0x3F) \| 0x80);
				245	c >>= 6;
				246	nbytes++;
				247	}
				248	while (c >= 0x3F \|\| (c & limits[nbytes-1]));
				249	*--p = (c \| masks[nbytes-1]);
				250	}
				251
				252	if (*outbytesleftp < nbytes)
				253	return E2BIG;
				254
				255	while (p < &buf[6])
				256	outbuf++ = p++;
				257	*outbytesleftp -= nbytes;
				258	*outbufp = outbuf;
				259	return 0;
				260	}
				261
				262	/* The following four functions transform one character between the two
				263	encodings named in the function name. All have the signature
				264	int (*)(iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
				265	uchar **outbufp, size_t *outbytesleftp)
				266
				267	BIGEND must have the value 0 or 1, coerced to (iconv_t); it is
				268	interpreted as a boolean indicating whether big-endian or
				269	little-endian encoding is to be used for the member of the pair
				270	that is not UTF-8.
				271
				272	INBUFP, INBYTESLEFTP, OUTBUFP, OUTBYTESLEFTP work exactly as they
				273	do for iconv.
				274
				275	The return value is either 0 for success, or an errno value for
				276	failure, which may be E2BIG (need more space), EILSEQ (ill-formed
				277	input sequence), or EINVAL (incomplete input sequence). */
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	278
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	279	static inline int
				280	one_utf8_to_utf32 (iconv_t bigend, const uchar *inbufp, size_t inbytesleftp,
				281	uchar *outbufp, size_t outbytesleftp)
				282	{
				283	uchar *outbuf;
Jan Hubicka	bd18496	2003-10-25 17:12:01 +0200	[diff] [blame]	284	cppchar_t s = 0;
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	285	int rval;
				286
				287	/* Check for space first, since we know exactly how much we need. */
				288	if (*outbytesleftp < 4)
				289	return E2BIG;
				290
				291	rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
				292	if (rval)
				293	return rval;
				294
				295	outbuf = *outbufp;
				296	outbuf[bigend ? 3 : 0] = (s & 0x000000FF);
				297	outbuf[bigend ? 2 : 1] = (s & 0x0000FF00) >> 8;
				298	outbuf[bigend ? 1 : 2] = (s & 0x00FF0000) >> 16;
				299	outbuf[bigend ? 0 : 3] = (s & 0xFF000000) >> 24;
				300
				301	*outbufp += 4;
				302	*outbytesleftp -= 4;
				303	return 0;
				304	}
				305
				306	static inline int
				307	one_utf32_to_utf8 (iconv_t bigend, const uchar *inbufp, size_t inbytesleftp,
				308	uchar *outbufp, size_t outbytesleftp)
				309	{
				310	cppchar_t s;
				311	int rval;
				312	const uchar *inbuf;
				313
				314	if (*inbytesleftp < 4)
				315	return EINVAL;
				316
				317	inbuf = *inbufp;
				318
				319	s = inbuf[bigend ? 0 : 3] << 24;
				320	s += inbuf[bigend ? 1 : 2] << 16;
				321	s += inbuf[bigend ? 2 : 1] << 8;
				322	s += inbuf[bigend ? 3 : 0];
				323
				324	if (s >= 0x7FFFFFFF \|\| (s >= 0xD800 && s <= 0xDFFF))
				325	return EILSEQ;
				326
				327	rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
				328	if (rval)
				329	return rval;
				330
				331	*inbufp += 4;
				332	*inbytesleftp -= 4;
				333	return 0;
				334	}
				335
				336	static inline int
				337	one_utf8_to_utf16 (iconv_t bigend, const uchar *inbufp, size_t inbytesleftp,
				338	uchar *outbufp, size_t outbytesleftp)
				339	{
				340	int rval;
Richard Henderson	671ca9e	2003-10-30 08:36:27 -0800	[diff] [blame]	341	cppchar_t s = 0;
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	342	const uchar save_inbuf = inbufp;
				343	size_t save_inbytesleft = *inbytesleftp;
				344	uchar outbuf = outbufp;
				345
				346	rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
				347	if (rval)
				348	return rval;
				349
				350	if (s > 0x0010FFFF)
				351	{
				352	*inbufp = save_inbuf;
				353	*inbytesleftp = save_inbytesleft;
				354	return EILSEQ;
				355	}
				356
				357	if (s < 0xFFFF)
				358	{
				359	if (*outbytesleftp < 2)
				360	{
				361	*inbufp = save_inbuf;
				362	*inbytesleftp = save_inbytesleft;
				363	return E2BIG;
				364	}
				365	outbuf[bigend ? 1 : 0] = (s & 0x00FF);
				366	outbuf[bigend ? 0 : 1] = (s & 0xFF00) >> 8;
				367
				368	*outbufp += 2;
				369	*outbytesleftp -= 2;
				370	return 0;
				371	}
				372	else
				373	{
				374	cppchar_t hi, lo;
				375
				376	if (*outbytesleftp < 4)
				377	{
				378	*inbufp = save_inbuf;
				379	*inbytesleftp = save_inbytesleft;
				380	return E2BIG;
				381	}
				382
				383	hi = (s - 0x10000) / 0x400 + 0xD800;
				384	lo = (s - 0x10000) % 0x400 + 0xDC00;
				385
				386	/* Even if we are little-endian, put the high surrogate first.
				387	??? Matches practice? */
				388	outbuf[bigend ? 1 : 0] = (hi & 0x00FF);
				389	outbuf[bigend ? 0 : 1] = (hi & 0xFF00) >> 8;
				390	outbuf[bigend ? 3 : 2] = (lo & 0x00FF);
				391	outbuf[bigend ? 2 : 3] = (lo & 0xFF00) >> 8;
				392
				393	*outbufp += 4;
				394	*outbytesleftp -= 4;
				395	return 0;
				396	}
				397	}
				398
				399	static inline int
				400	one_utf16_to_utf8 (iconv_t bigend, const uchar *inbufp, size_t inbytesleftp,
				401	uchar *outbufp, size_t outbytesleftp)
				402	{
				403	cppchar_t s;
				404	const uchar inbuf = inbufp;
				405	int rval;
				406
				407	if (*inbytesleftp < 2)
				408	return EINVAL;
				409	s = inbuf[bigend ? 0 : 1] << 8;
				410	s += inbuf[bigend ? 1 : 0];
				411
				412	/* Low surrogate without immediately preceding high surrogate is invalid. */
				413	if (s >= 0xDC00 && s <= 0xDFFF)
				414	return EILSEQ;
				415	/* High surrogate must have a following low surrogate. */
				416	else if (s >= 0xD800 && s <= 0xDBFF)
				417	{
				418	cppchar_t hi = s, lo;
				419	if (*inbytesleftp < 4)
				420	return EINVAL;
				421
				422	lo = inbuf[bigend ? 2 : 3] << 8;
				423	lo += inbuf[bigend ? 3 : 2];
				424
				425	if (lo < 0xDC00 \|\| lo > 0xDFFF)
				426	return EILSEQ;
				427
				428	s = (hi - 0xD800) * 0x400 + (lo - 0xDC00) + 0x10000;
				429	}
				430
				431	rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
				432	if (rval)
				433	return rval;
				434
				435	/* Success - update the input pointers (one_cppchar_to_utf8 has done
				436	the output pointers for us). */
				437	if (s <= 0xFFFF)
				438	{
				439	*inbufp += 2;
				440	*inbytesleftp -= 2;
				441	}
				442	else
				443	{
				444	*inbufp += 4;
				445	*inbytesleftp -= 4;
				446	}
				447	return 0;
				448	}
				449
				450	/* Helper routine for the next few functions. The 'const' on
				451	one_conversion means that we promise not to modify what function is
Kazu Hirata	4ed4321	2003-07-12 22:49:48 +0000	[diff] [blame]	452	pointed to, which lets the inliner see through it. */
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	453
				454	static inline bool
				455	conversion_loop (int (const one_conversion)(iconv_t, const uchar , size_t ,
				456	uchar *, size_t ),
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	457	iconv_t cd, const uchar from, size_t flen, struct _cpp_strbuf to)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	458	{
				459	const uchar *inbuf;
				460	uchar *outbuf;
				461	size_t inbytesleft, outbytesleft;
				462	int rval;
				463
				464	inbuf = from;
				465	inbytesleft = flen;
				466	outbuf = to->text + to->len;
				467	outbytesleft = to->asize - to->len;
				468
				469	for (;;)
				470	{
				471	do
				472	rval = one_conversion (cd, &inbuf, &inbytesleft,
				473	&outbuf, &outbytesleft);
				474	while (inbytesleft && !rval);
				475
				476	if (__builtin_expect (inbytesleft == 0, 1))
				477	{
				478	to->len = to->asize - outbytesleft;
				479	return true;
				480	}
				481	if (rval != E2BIG)
				482	{
				483	errno = rval;
				484	return false;
				485	}
				486
				487	outbytesleft += OUTBUF_BLOCK_SIZE;
				488	to->asize += OUTBUF_BLOCK_SIZE;
Gabriel Dos Reis	c3f829c	2005-05-28 15:52:48 +0000	[diff] [blame]	489	to->text = XRESIZEVEC (uchar, to->text, to->asize);
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	490	outbuf = to->text + to->asize - outbytesleft;
				491	}
				492	}
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	493
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	494
				495	/* These functions convert entire strings between character sets.
				496	They all have the signature
				497
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	498	bool (*)(iconv_t cd, const uchar *from, size_t flen, struct _cpp_strbuf *to);
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	499
				500	The input string FROM is converted as specified by the function
				501	name plus the iconv descriptor CD (which may be fake), and the
				502	result appended to TO. On any error, false is returned, otherwise true. */
				503
				504	/* These four use the custom conversion code above. */
				505	static bool
				506	convert_utf8_utf16 (iconv_t cd, const uchar *from, size_t flen,
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	507	struct _cpp_strbuf *to)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	508	{
				509	return conversion_loop (one_utf8_to_utf16, cd, from, flen, to);
				510	}
				511
				512	static bool
				513	convert_utf8_utf32 (iconv_t cd, const uchar *from, size_t flen,
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	514	struct _cpp_strbuf *to)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	515	{
				516	return conversion_loop (one_utf8_to_utf32, cd, from, flen, to);
				517	}
				518
				519	static bool
				520	convert_utf16_utf8 (iconv_t cd, const uchar *from, size_t flen,
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	521	struct _cpp_strbuf *to)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	522	{
				523	return conversion_loop (one_utf16_to_utf8, cd, from, flen, to);
				524	}
				525
				526	static bool
				527	convert_utf32_utf8 (iconv_t cd, const uchar *from, size_t flen,
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	528	struct _cpp_strbuf *to)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	529	{
				530	return conversion_loop (one_utf32_to_utf8, cd, from, flen, to);
				531	}
				532
				533	/* Identity conversion, used when we have no alternative. */
				534	static bool
				535	convert_no_conversion (iconv_t cd ATTRIBUTE_UNUSED,
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	536	const uchar from, size_t flen, struct _cpp_strbuf to)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	537	{
				538	if (to->len + flen > to->asize)
				539	{
				540	to->asize = to->len + flen;
Gabriel Dos Reis	c3f829c	2005-05-28 15:52:48 +0000	[diff] [blame]	541	to->text = XRESIZEVEC (uchar, to->text, to->asize);
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	542	}
				543	memcpy (to->text + to->len, from, flen);
				544	to->len += flen;
				545	return true;
				546	}
				547
				548	/* And this one uses the system iconv primitive. It's a little
				549	different, since iconv's interface is a little different. */
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	550	#if HAVE_ICONV
DJ Delorie	675575f	2007-12-11 20:38:10 -0500	[diff] [blame]	551
				552	#define CONVERT_ICONV_GROW_BUFFER \
				553	do { \
				554	outbytesleft += OUTBUF_BLOCK_SIZE; \
				555	to->asize += OUTBUF_BLOCK_SIZE; \
				556	to->text = XRESIZEVEC (uchar, to->text, to->asize); \
				557	outbuf = (char *)to->text + to->asize - outbytesleft; \
				558	} while (0)
				559
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	560	static bool
				561	convert_using_iconv (iconv_t cd, const uchar *from, size_t flen,
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	562	struct _cpp_strbuf *to)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	563	{
				564	ICONV_CONST char *inbuf;
				565	char *outbuf;
				566	size_t inbytesleft, outbytesleft;
				567
				568	/* Reset conversion descriptor and check that it is valid. */
				569	if (iconv (cd, 0, 0, 0, 0) == (size_t)-1)
				570	return false;
				571
				572	inbuf = (ICONV_CONST char *)from;
				573	inbytesleft = flen;
				574	outbuf = (char *)to->text + to->len;
				575	outbytesleft = to->asize - to->len;
				576
				577	for (;;)
				578	{
				579	iconv (cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
				580	if (__builtin_expect (inbytesleft == 0, 1))
				581	{
DJ Delorie	675575f	2007-12-11 20:38:10 -0500	[diff] [blame]	582	/* Close out any shift states, returning to the initial state. */
				583	if (iconv (cd, 0, 0, &outbuf, &outbytesleft) == (size_t)-1)
				584	{
				585	if (errno != E2BIG)
				586	return false;
				587
				588	CONVERT_ICONV_GROW_BUFFER;
				589	if (iconv (cd, 0, 0, &outbuf, &outbytesleft) == (size_t)-1)
				590	return false;
				591	}
				592
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	593	to->len = to->asize - outbytesleft;
				594	return true;
				595	}
				596	if (errno != E2BIG)
				597	return false;
				598
DJ Delorie	675575f	2007-12-11 20:38:10 -0500	[diff] [blame]	599	CONVERT_ICONV_GROW_BUFFER;
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	600	}
				601	}
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	602	#else
				603	#define convert_using_iconv 0 /* prevent undefined symbol error below */
				604	#endif
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	605
				606	/* Arrange for the above custom conversion logic to be used automatically
				607	when conversion between a suitable pair of character sets is requested. */
				608
				609	#define APPLY_CONVERSION(CONVERTER, FROM, FLEN, TO) \
				610	CONVERTER.func (CONVERTER.cd, FROM, FLEN, TO)
				611
				612	struct conversion
				613	{
				614	const char *pair;
				615	convert_f func;
				616	iconv_t fake_cd;
				617	};
				618	static const struct conversion conversion_tab[] = {
				619	{ "UTF-8/UTF-32LE", convert_utf8_utf32, (iconv_t)0 },
				620	{ "UTF-8/UTF-32BE", convert_utf8_utf32, (iconv_t)1 },
				621	{ "UTF-8/UTF-16LE", convert_utf8_utf16, (iconv_t)0 },
				622	{ "UTF-8/UTF-16BE", convert_utf8_utf16, (iconv_t)1 },
				623	{ "UTF-32LE/UTF-8", convert_utf32_utf8, (iconv_t)0 },
				624	{ "UTF-32BE/UTF-8", convert_utf32_utf8, (iconv_t)1 },
				625	{ "UTF-16LE/UTF-8", convert_utf16_utf8, (iconv_t)0 },
				626	{ "UTF-16BE/UTF-8", convert_utf16_utf8, (iconv_t)1 },
				627	};
				628
				629	/* Subroutine of cpp_init_iconv: initialize and return a
				630	cset_converter structure for conversion from FROM to TO. If
				631	iconv_open() fails, issue an error and return an identity
				632	converter. Silently return an identity converter if FROM and TO
				633	are identical. */
				634	static struct cset_converter
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	635	init_iconv_desc (cpp_reader pfile, const char to, const char *from)
				636	{
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	637	struct cset_converter ret;
				638	char *pair;
				639	size_t i;
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	640
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	641	if (!strcasecmp (to, from))
				642	{
				643	ret.func = convert_no_conversion;
				644	ret.cd = (iconv_t) -1;
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	645	ret.width = -1;
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	646	return ret;
				647	}
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	648
Gabriel Dos Reis	c3f829c	2005-05-28 15:52:48 +0000	[diff] [blame]	649	pair = (char *) alloca(strlen(to) + strlen(from) + 2);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	650
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	651	strcpy(pair, from);
				652	strcat(pair, "/");
				653	strcat(pair, to);
				654	for (i = 0; i < ARRAY_SIZE (conversion_tab); i++)
				655	if (!strcasecmp (pair, conversion_tab[i].pair))
				656	{
				657	ret.func = conversion_tab[i].func;
				658	ret.cd = conversion_tab[i].fake_cd;
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	659	ret.width = -1;
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	660	return ret;
				661	}
				662
				663	/* No custom converter - try iconv. */
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	664	if (HAVE_ICONV)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	665	{
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	666	ret.func = convert_using_iconv;
				667	ret.cd = iconv_open (to, from);
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	668	ret.width = -1;
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	669
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	670	if (ret.cd == (iconv_t) -1)
				671	{
				672	if (errno == EINVAL)
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	673	cpp_error (pfile, CPP_DL_ERROR, /* FIXME should be DL_SORRY */
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	674	"conversion from %s to %s not supported by iconv",
				675	from, to);
				676	else
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	677	cpp_errno (pfile, CPP_DL_ERROR, "iconv_open");
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	678
				679	ret.func = convert_no_conversion;
				680	}
				681	}
				682	else
				683	{
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	684	cpp_error (pfile, CPP_DL_ERROR, /* FIXME: should be DL_SORRY */
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	685	"no iconv implementation, cannot convert from %s to %s",
				686	from, to);
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	687	ret.func = convert_no_conversion;
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	688	ret.cd = (iconv_t) -1;
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	689	ret.width = -1;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	690	}
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	691	return ret;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	692	}
				693
				694	/* If charset conversion is requested, initialize iconv(3) descriptors
				695	for conversion from the source character set to the execution
				696	character sets. If iconv is not present in the C library, and
				697	conversion is requested, issue an error. */
				698
				699	void
				700	cpp_init_iconv (cpp_reader *pfile)
				701	{
				702	const char *ncset = CPP_OPTION (pfile, narrow_charset);
				703	const char *wcset = CPP_OPTION (pfile, wide_charset);
				704	const char *default_wcset;
				705
				706	bool be = CPP_OPTION (pfile, bytes_big_endian);
				707
				708	if (CPP_OPTION (pfile, wchar_precision) >= 32)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	709	default_wcset = be ? "UTF-32BE" : "UTF-32LE";
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	710	else if (CPP_OPTION (pfile, wchar_precision) >= 16)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	711	default_wcset = be ? "UTF-16BE" : "UTF-16LE";
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	712	else
				713	/* This effectively means that wide strings are not supported,
				714	so don't do any conversion at all. */
				715	default_wcset = SOURCE_CHARSET;
				716
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	717	if (!ncset)
				718	ncset = SOURCE_CHARSET;
				719	if (!wcset)
				720	wcset = default_wcset;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	721
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	722	pfile->narrow_cset_desc = init_iconv_desc (pfile, ncset, SOURCE_CHARSET);
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	723	pfile->narrow_cset_desc.width = CPP_OPTION (pfile, char_precision);
				724	pfile->char16_cset_desc = init_iconv_desc (pfile,
				725	be ? "UTF-16BE" : "UTF-16LE",
				726	SOURCE_CHARSET);
				727	pfile->char16_cset_desc.width = 16;
				728	pfile->char32_cset_desc = init_iconv_desc (pfile,
				729	be ? "UTF-32BE" : "UTF-32LE",
				730	SOURCE_CHARSET);
				731	pfile->char32_cset_desc.width = 32;
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	732	pfile->wide_cset_desc = init_iconv_desc (pfile, wcset, SOURCE_CHARSET);
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	733	pfile->wide_cset_desc.width = CPP_OPTION (pfile, wchar_precision);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	734	}
				735
Zack Weinberg	a29f62d	2004-09-18 00:56:19 +0000	[diff] [blame]	736	/* Destroy iconv(3) descriptors set up by cpp_init_iconv, if necessary. */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	737	void
				738	_cpp_destroy_iconv (cpp_reader *pfile)
				739	{
				740	if (HAVE_ICONV)
				741	{
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	742	if (pfile->narrow_cset_desc.func == convert_using_iconv)
				743	iconv_close (pfile->narrow_cset_desc.cd);
				744	if (pfile->wide_cset_desc.func == convert_using_iconv)
				745	iconv_close (pfile->wide_cset_desc.cd);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	746	}
				747	}
				748
Zack Weinberg	c5ff069	2005-02-20 17:01:32 +0000	[diff] [blame]	749	/* Utility routine for use by a full compiler. C is a character taken
				750	from the basic source character set, encoded in the host's
				751	execution encoding. Convert it to (the target's) execution
				752	encoding, and return that value.
				753
				754	Issues an internal error if C's representation in the narrow
				755	execution character set fails to be a single-byte value (C99
				756	5.2.1p3: "The representation of each member of the source and
				757	execution character sets shall fit in a byte.") May also issue an
				758	internal error if C fails to be a member of the basic source
				759	character set (testing this exactly is too hard, especially when
				760	the host character set is EBCDIC). */
				761	cppchar_t
				762	cpp_host_to_exec_charset (cpp_reader *pfile, cppchar_t c)
				763	{
				764	uchar sbuf[1];
				765	struct _cpp_strbuf tbuf;
				766
				767	/* This test is merely an approximation, but it suffices to catch
				768	the most important thing, which is that we don't get handed a
				769	character outside the unibyte range of the host character set. */
				770	if (c > LAST_POSSIBLY_BASIC_SOURCE_CHAR)
				771	{
				772	cpp_error (pfile, CPP_DL_ICE,
				773	"character 0x%lx is not in the basic source character set\n",
				774	(unsigned long)c);
				775	return 0;
				776	}
				777
				778	/* Being a character in the unibyte range of the host character set,
				779	we can safely splat it into a one-byte buffer and trust that that
				780	is a well-formed string. */
				781	sbuf[0] = c;
				782
				783	/* This should never need to reallocate, but just in case... */
				784	tbuf.asize = 1;
Gabriel Dos Reis	c3f829c	2005-05-28 15:52:48 +0000	[diff] [blame]	785	tbuf.text = XNEWVEC (uchar, tbuf.asize);
Zack Weinberg	c5ff069	2005-02-20 17:01:32 +0000	[diff] [blame]	786	tbuf.len = 0;
				787
				788	if (!APPLY_CONVERSION (pfile->narrow_cset_desc, sbuf, 1, &tbuf))
				789	{
				790	cpp_errno (pfile, CPP_DL_ICE, "converting to execution character set");
				791	return 0;
				792	}
				793	if (tbuf.len != 1)
				794	{
				795	cpp_error (pfile, CPP_DL_ICE,
				796	"character 0x%lx is not unibyte in execution character set",
				797	(unsigned long)c);
				798	return 0;
				799	}
				800	c = tbuf.text[0];
				801	free(tbuf.text);
				802	return c;
				803	}
				804
				805
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	806
				807	/* Utility routine that computes a mask of the form 0000...111... with
				808	WIDTH 1-bits. */
				809	static inline size_t
				810	width_to_mask (size_t width)
				811	{
				812	width = MIN (width, BITS_PER_CPPCHAR_T);
				813	if (width >= CHAR_BIT * sizeof (size_t))
				814	return ~(size_t) 0;
				815	else
				816	return ((size_t) 1 << width) - 1;
				817	}
				818
/* Flag bits used by the large table of unicode character information
   (ucnranges).  Each flag occupies its own bit so that several can be
   combined in one entry.  */
enum {
  /* Valid in a C99 identifier?  */
  C99 = 1 << 0,
  /* Valid in a C99 identifier, but not as the first character?  */
  DIG = 1 << 1,
  /* Valid in a C++ identifier?  */
  CXX = 1 << 2,
  /* NFC representation is not valid in an identifier?  */
  CID = 1 << 3,
  /* Might be valid NFC form?  */
  NFC = 1 << 4,
  /* Might be valid NFKC form?  */
  NKC = 1 << 5,
  /* Certain preceding characters might make it not valid NFC/NFKC form?  */
  CTX = 1 << 6
};
				836
/* Table of unicode character ranges; the entries themselves come from
   ucnid.h.  ucn_valid_in_identifier binary-searches this table on the
   END field, so the entries are presumably sorted by ascending END
   (verify against the generator of ucnid.h).  */
static const struct {
  /* Bitmap of flags above.  */
  unsigned char flags;
  /* Combining class of the character.  */
  unsigned char combine;
  /* Last character in the range described by this entry.  */
  unsigned short end;
} ucnranges[] = {
#include "ucnid.h"
};
				847
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	848	/* Returns 1 if C is valid in an identifier, 2 if C is valid except at
				849	the start of an identifier, and 0 if C is not valid in an
				850	identifier. We assume C has already gone through the checks of
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame]	851	_cpp_valid_ucn. Also update NST for C if returning nonzero. The
				852	algorithm is a simple binary search on the table defined in
				853	ucnid.h. */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	854
				855	static int
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame]	856	ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,
				857	struct normalize_state *nst)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	858	{
				859	int mn, mx, md;
				860
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame]	861	if (c > 0xFFFF)
				862	return 0;
				863
				864	mn = 0;
				865	mx = ARRAY_SIZE (ucnranges) - 1;
				866	while (mx != mn)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	867	{
				868	md = (mn + mx) / 2;
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame]	869	if (c <= ucnranges[md].end)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	870	mx = md;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	871	else
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame]	872	mn = md + 1;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	873	}
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	874
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	875	/* When -pedantic, we require the character to have been listed by
				876	the standard for the current language. Otherwise, we accept the
				877	union of the acceptable sets for C++98 and C99. */
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame]	878	if (! (ucnranges[mn].flags & (C99 \| CXX)))
				879	return 0;
				880
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	881	if (CPP_PEDANTIC (pfile)
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame]	882	&& ((CPP_OPTION (pfile, c99) && !(ucnranges[mn].flags & C99))
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	883	\|\| (CPP_OPTION (pfile, cplusplus)
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame]	884	&& !(ucnranges[mn].flags & CXX))))
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	885	return 0;
				886
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame]	887	/* Update NST. */
				888	if (ucnranges[mn].combine != 0 && ucnranges[mn].combine < nst->prev_class)
				889	nst->level = normalized_none;
				890	else if (ucnranges[mn].flags & CTX)
				891	{
				892	bool safe;
				893	cppchar_t p = nst->previous;
				894
				895	/* Easy cases from Bengali, Oriya, Tamil, Jannada, and Malayalam. */
				896	if (c == 0x09BE)
				897	safe = p != 0x09C7; /* Use 09CB instead of 09C7 09BE. */
				898	else if (c == 0x0B3E)
				899	safe = p != 0x0B47; /* Use 0B4B instead of 0B47 0B3E. */
				900	else if (c == 0x0BBE)
				901	safe = p != 0x0BC6 && p != 0x0BC7; /* Use 0BCA/0BCB instead. */
				902	else if (c == 0x0CC2)
				903	safe = p != 0x0CC6; /* Use 0CCA instead of 0CC6 0CC2. */
				904	else if (c == 0x0D3E)
				905	safe = p != 0x0D46 && p != 0x0D47; /* Use 0D4A/0D4B instead. */
				906	/* For Hangul, characters in the range AC00-D7A3 are NFC/NFKC,
				907	and are combined algorithmically from a sequence of the form
				908	1100-1112 1161-1175 11A8-11C2
				909	(if the third is not present, it is treated as 11A7, which is not
				910	really a valid character).
				911	Unfortunately, C99 allows (only) the NFC form, but C++ allows
				912	only the combining characters. */
				913	else if (c >= 0x1161 && c <= 0x1175)
				914	safe = p < 0x1100 \|\| p > 0x1112;
				915	else if (c >= 0x11A8 && c <= 0x11C2)
				916	safe = (p < 0xAC00 \|\| p > 0xD7A3 \|\| (p - 0xAC00) % 28 != 0);
				917	else
				918	{
				919	/* Uh-oh, someone updated ucnid.h without updating this code. */
				920	cpp_error (pfile, CPP_DL_ICE, "Character %x might not be NFKC", c);
				921	safe = true;
				922	}
				923	if (!safe && c < 0x1161)
				924	nst->level = normalized_none;
				925	else if (!safe)
				926	nst->level = MAX (nst->level, normalized_identifier_C);
				927	}
				928	else if (ucnranges[mn].flags & NKC)
				929	;
				930	else if (ucnranges[mn].flags & NFC)
				931	nst->level = MAX (nst->level, normalized_C);
				932	else if (ucnranges[mn].flags & CID)
				933	nst->level = MAX (nst->level, normalized_identifier_C);
				934	else
				935	nst->level = normalized_none;
				936	nst->previous = c;
				937	nst->prev_class = ucnranges[mn].combine;
				938
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	939	/* In C99, UCN digits may not begin identifiers. */
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame]	940	if (CPP_OPTION (pfile, c99) && (ucnranges[mn].flags & DIG))
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	941	return 2;
				942
				943	return 1;
				944	}
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	945
				946	/* [lex.charset]: The character designated by the universal character
				947	name \UNNNNNNNN is that character whose character short name in
				948	ISO/IEC 10646 is NNNNNNNN; the character designated by the
				949	universal character name \uNNNN is that character whose character
				950	short name in ISO/IEC 10646 is 0000NNNN. If the hexadecimal value
Jason Merrill	30c99a9	2009-10-09 20:39:46 -0400	[diff] [blame]	951	for a universal character name corresponds to a surrogate code point
				952	(in the range 0xD800-0xDFFF, inclusive), the program is ill-formed.
				953	Additionally, if the hexadecimal value for a universal-character-name
				954	outside a character or string literal corresponds to a control character
				955	(in either of the ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a
				956	character in the basic source character set, the program is ill-formed.
				957
				958	C99 6.4.3: A universal character name shall not specify a character
				959	whose short identifier is less than 00A0 other than 0024 ($), 0040 (@),
				960	or 0060 (`), nor one in the range D800 through DFFF inclusive.
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	961
				962	*PSTR must be preceded by "\u" or "\U"; it is assumed that the
Geoffrey Keating	c79e602	2005-03-16 00:59:31 +0000	[diff] [blame]	963	buffer end is delimited by a non-hex digit. Returns zero if the
				964	UCN has not been consumed.
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	965
Kazu Hirata	6356f89	2003-06-12 19:01:08 +0000	[diff] [blame]	966	Otherwise the nonzero value of the UCN, whether valid or invalid,
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	967	is returned. Diagnostics are emitted for invalid values. PSTR
				968	is updated to point one beyond the UCN, or to the syntactically
				969	invalid character.
				970
				971	IDENTIFIER_POS is 0 when not in an identifier, 1 for the start of
Zack Weinberg	a29f62d	2004-09-18 00:56:19 +0000	[diff] [blame]	972	an identifier, or 2 otherwise. */
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	973
				974	cppchar_t
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	975	_cpp_valid_ucn (cpp_reader pfile, const uchar *pstr,
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame]	976	const uchar *limit, int identifier_pos,
				977	struct normalize_state *nst)
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	978	{
				979	cppchar_t result, c;
				980	unsigned int length;
				981	const uchar str = pstr;
				982	const uchar *base = str - 2;
				983
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	984	if (!CPP_OPTION (pfile, cplusplus) && !CPP_OPTION (pfile, c99))
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	985	cpp_error (pfile, CPP_DL_WARNING,
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	986	"universal character names are only valid in C++ and C99");
				987	else if (CPP_WTRADITIONAL (pfile) && identifier_pos == 0)
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	988	cpp_error (pfile, CPP_DL_WARNING,
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	989	"the meaning of '\\%c' is different in traditional C",
				990	(int) str[-1]);
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	991
				992	if (str[-1] == 'u')
				993	length = 4;
				994	else if (str[-1] == 'U')
				995	length = 8;
				996	else
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame]	997	{
				998	cpp_error (pfile, CPP_DL_ICE, "In _cpp_valid_ucn but not a UCN");
				999	length = 4;
				1000	}
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1001
				1002	result = 0;
				1003	do
				1004	{
				1005	c = *str;
				1006	if (!ISXDIGIT (c))
				1007	break;
				1008	str++;
				1009	result = (result << 4) + hex_value (c);
				1010	}
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1011	while (--length && str < limit);
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1012
Geoffrey Keating	c79e602	2005-03-16 00:59:31 +0000	[diff] [blame]	1013	/* Partial UCNs are not valid in strings, but decompose into
				1014	multiple tokens in identifiers, so we can't give a helpful
				1015	error message in that case. */
				1016	if (length && identifier_pos)
				1017	return 0;
				1018
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1019	*pstr = str;
				1020	if (length)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1021	{
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1022	cpp_error (pfile, CPP_DL_ERROR,
				1023	"incomplete universal character name %.*s",
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1024	(int) (str - base), base);
				1025	result = 1;
				1026	}
Jason Merrill	30c99a9	2009-10-09 20:39:46 -0400	[diff] [blame]	1027	/* The C99 standard permits $, @ and ` to be specified as UCNs. We use
				1028	hex escapes so that this also works with EBCDIC hosts.
				1029	C++0x permits everything below 0xa0 within literals;
				1030	ucn_valid_in_identifier will complain about identifiers. */
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1031	else if ((result < 0xa0
Jason Merrill	30c99a9	2009-10-09 20:39:46 -0400	[diff] [blame]	1032	&& !CPP_OPTION (pfile, cplusplus)
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1033	&& (result != 0x24 && result != 0x40 && result != 0x60))
				1034	\|\| (result & 0x80000000)
				1035	\|\| (result >= 0xD800 && result <= 0xDFFF))
				1036	{
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1037	cpp_error (pfile, CPP_DL_ERROR,
				1038	"%.*s is not a valid universal character",
Andreas Jaeger	0e7a8c4	2003-04-21 14:06:12 +0200	[diff] [blame]	1039	(int) (str - base), base);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1040	result = 1;
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1041	}
Geoffrey Keating	47e2049	2005-03-12 10:44:06 +0000	[diff] [blame]	1042	else if (identifier_pos && result == 0x24
				1043	&& CPP_OPTION (pfile, dollars_in_ident))
				1044	{
				1045	if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
				1046	{
				1047	CPP_OPTION (pfile, warn_dollars) = 0;
				1048	cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
				1049	}
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame]	1050	NORMALIZE_STATE_UPDATE_IDNUM (nst);
Geoffrey Keating	47e2049	2005-03-12 10:44:06 +0000	[diff] [blame]	1051	}
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1052	else if (identifier_pos)
				1053	{
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame]	1054	int validity = ucn_valid_in_identifier (pfile, result, nst);
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1055
				1056	if (validity == 0)
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1057	cpp_error (pfile, CPP_DL_ERROR,
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1058	"universal character %.*s is not valid in an identifier",
Andreas Jaeger	0e7a8c4	2003-04-21 14:06:12 +0200	[diff] [blame]	1059	(int) (str - base), base);
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1060	else if (validity == 2 && identifier_pos == 1)
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1061	cpp_error (pfile, CPP_DL_ERROR,
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1062	"universal character %.*s is not valid at the start of an identifier",
Andreas Jaeger	0e7a8c4	2003-04-21 14:06:12 +0200	[diff] [blame]	1063	(int) (str - base), base);
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1064	}
				1065
				1066	if (result == 0)
				1067	result = 1;
				1068
				1069	return result;
				1070	}
				1071
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1072	/* Convert an UCN, pointed to by FROM, to UTF-8 encoding, then translate
				1073	it to the execution character set and write the result into TBUF.
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1074	An advanced pointer is returned. Issues all relevant diagnostics. */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1075	static const uchar *
				1076	convert_ucn (cpp_reader pfile, const uchar from, const uchar *limit,
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1077	struct _cpp_strbuf *tbuf, struct cset_converter cvt)
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1078	{
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1079	cppchar_t ucn;
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1080	uchar buf[6];
				1081	uchar *bufp = buf;
				1082	size_t bytesleft = 6;
				1083	int rval;
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame]	1084	struct normalize_state nst = INITIAL_NORMALIZE_STATE;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1085
Kazu Hirata	e0a21ab	2004-01-16 01:44:06 +0000	[diff] [blame]	1086	from++; /* Skip u/U. */
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame]	1087	ucn = _cpp_valid_ucn (pfile, &from, limit, 0, &nst);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1088
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1089	rval = one_cppchar_to_utf8 (ucn, &bufp, &bytesleft);
				1090	if (rval)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1091	{
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1092	errno = rval;
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1093	cpp_errno (pfile, CPP_DL_ERROR,
				1094	"converting UCN to source character set");
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1095	}
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1096	else if (!APPLY_CONVERSION (cvt, buf, 6 - bytesleft, tbuf))
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1097	cpp_errno (pfile, CPP_DL_ERROR,
				1098	"converting UCN to execution character set");
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1099
				1100	return from;
				1101	}
				1102
Zack Weinberg	a29f62d	2004-09-18 00:56:19 +0000	[diff] [blame]	1103	/* Subroutine of convert_hex and convert_oct. N is the representation
				1104	in the execution character set of a numeric escape; write it into the
				1105	string buffer TBUF and update the end-of-string pointer therein. WIDE
				1106	is true if it's a wide string that's being assembled in TBUF. This
				1107	function issues no diagnostics and never fails. */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1108	static void
				1109	emit_numeric_escape (cpp_reader *pfile, cppchar_t n,
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1110	struct _cpp_strbuf *tbuf, struct cset_converter cvt)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1111	{
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1112	size_t width = cvt.width;
				1113
				1114	if (width != CPP_OPTION (pfile, char_precision))
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1115	{
				1116	/* We have to render this into the target byte order, which may not
				1117	be our byte order. */
				1118	bool bigend = CPP_OPTION (pfile, bytes_big_endian);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1119	size_t cwidth = CPP_OPTION (pfile, char_precision);
				1120	size_t cmask = width_to_mask (cwidth);
				1121	size_t nbwc = width / cwidth;
				1122	size_t i;
				1123	size_t off = tbuf->len;
				1124	cppchar_t c;
				1125
				1126	if (tbuf->len + nbwc > tbuf->asize)
				1127	{
				1128	tbuf->asize += OUTBUF_BLOCK_SIZE;
Gabriel Dos Reis	c3f829c	2005-05-28 15:52:48 +0000	[diff] [blame]	1129	tbuf->text = XRESIZEVEC (uchar, tbuf->text, tbuf->asize);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1130	}
				1131
				1132	for (i = 0; i < nbwc; i++)
				1133	{
				1134	c = n & cmask;
				1135	n >>= cwidth;
				1136	tbuf->text[off + (bigend ? nbwc - i - 1 : i)] = c;
				1137	}
				1138	tbuf->len += nbwc;
				1139	}
				1140	else
				1141	{
Zack Weinberg	a29f62d	2004-09-18 00:56:19 +0000	[diff] [blame]	1142	/* Note: this code does not handle the case where the target
				1143	and host have a different number of bits in a byte. */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1144	if (tbuf->len + 1 > tbuf->asize)
				1145	{
				1146	tbuf->asize += OUTBUF_BLOCK_SIZE;
Gabriel Dos Reis	c3f829c	2005-05-28 15:52:48 +0000	[diff] [blame]	1147	tbuf->text = XRESIZEVEC (uchar, tbuf->text, tbuf->asize);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1148	}
				1149	tbuf->text[tbuf->len++] = n;
				1150	}
				1151	}
				1152
				1153	/* Convert a hexadecimal escape, pointed to by FROM, to the execution
				1154	character set and write it into the string buffer TBUF. Returns an
				1155	advanced pointer, and issues diagnostics as necessary.
				1156	No character set translation occurs; this routine always produces the
				1157	execution-set character with numeric value equal to the given hex
				1158	number. You can, e.g. generate surrogate pairs this way. */
				1159	static const uchar *
				1160	convert_hex (cpp_reader pfile, const uchar from, const uchar *limit,
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1161	struct _cpp_strbuf *tbuf, struct cset_converter cvt)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1162	{
				1163	cppchar_t c, n = 0, overflow = 0;
				1164	int digits_found = 0;
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1165	size_t width = cvt.width;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1166	size_t mask = width_to_mask (width);
				1167
				1168	if (CPP_WTRADITIONAL (pfile))
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1169	cpp_error (pfile, CPP_DL_WARNING,
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1170	"the meaning of '\\x' is different in traditional C");
				1171
Kazu Hirata	e0a21ab	2004-01-16 01:44:06 +0000	[diff] [blame]	1172	from++; /* Skip 'x'. */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1173	while (from < limit)
				1174	{
				1175	c = *from;
				1176	if (! hex_p (c))
				1177	break;
				1178	from++;
				1179	overflow \|= n ^ (n << 4 >> 4);
				1180	n = (n << 4) + hex_value (c);
				1181	digits_found = 1;
				1182	}
				1183
				1184	if (!digits_found)
				1185	{
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1186	cpp_error (pfile, CPP_DL_ERROR,
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1187	"\\x used with no following hex digits");
				1188	return from;
				1189	}
				1190
				1191	if (overflow \| (n != (n & mask)))
				1192	{
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1193	cpp_error (pfile, CPP_DL_PEDWARN,
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1194	"hex escape sequence out of range");
				1195	n &= mask;
				1196	}
				1197
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1198	emit_numeric_escape (pfile, n, tbuf, cvt);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1199
				1200	return from;
				1201	}
				1202
				1203	/* Convert an octal escape, pointed to by FROM, to the execution
				1204	character set and write it into the string buffer TBUF. Returns an
				1205	advanced pointer, and issues diagnostics as necessary.
				1206	No character set translation occurs; this routine always produces the
				1207	execution-set character with numeric value equal to the given octal
				1208	number. */
				1209	static const uchar *
				1210	convert_oct (cpp_reader pfile, const uchar from, const uchar *limit,
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1211	struct _cpp_strbuf *tbuf, struct cset_converter cvt)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1212	{
				1213	size_t count = 0;
				1214	cppchar_t c, n = 0;
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1215	size_t width = cvt.width;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1216	size_t mask = width_to_mask (width);
				1217	bool overflow = false;
				1218
				1219	while (from < limit && count++ < 3)
				1220	{
				1221	c = *from;
				1222	if (c < '0' \|\| c > '7')
				1223	break;
				1224	from++;
				1225	overflow \|= n ^ (n << 3 >> 3);
				1226	n = (n << 3) + c - '0';
				1227	}
				1228
				1229	if (n != (n & mask))
				1230	{
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1231	cpp_error (pfile, CPP_DL_PEDWARN,
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1232	"octal escape sequence out of range");
				1233	n &= mask;
				1234	}
				1235
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1236	emit_numeric_escape (pfile, n, tbuf, cvt);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1237
				1238	return from;
				1239	}
				1240
				1241	/* Convert an escape sequence (pointed to by FROM) to its value on
				1242	the target, and to the execution character set. Do not scan past
				1243	LIMIT. Write the converted value into TBUF. Returns an advanced
				1244	pointer. Handles all relevant diagnostics. */
				1245	static const uchar *
				1246	convert_escape (cpp_reader pfile, const uchar from, const uchar *limit,
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1247	struct _cpp_strbuf *tbuf, struct cset_converter cvt)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1248	{
				1249	/* Values of \a \b \e \f \n \r \t \v respectively. */
				1250	#if HOST_CHARSET == HOST_CHARSET_ASCII
				1251	static const uchar charconsts[] = { 7, 8, 27, 12, 10, 13, 9, 11 };
				1252	#elif HOST_CHARSET == HOST_CHARSET_EBCDIC
				1253	static const uchar charconsts[] = { 47, 22, 39, 12, 21, 13, 5, 11 };
				1254	#else
				1255	#error "unknown host character set"
				1256	#endif
				1257
				1258	uchar c;
				1259
				1260	c = *from;
				1261	switch (c)
				1262	{
				1263	/* UCNs, hex escapes, and octal escapes are processed separately. */
				1264	case 'u': case 'U':
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1265	return convert_ucn (pfile, from, limit, tbuf, cvt);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1266
				1267	case 'x':
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1268	return convert_hex (pfile, from, limit, tbuf, cvt);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1269	break;
				1270
				1271	case '0': case '1': case '2': case '3':
				1272	case '4': case '5': case '6': case '7':
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1273	return convert_oct (pfile, from, limit, tbuf, cvt);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1274
				1275	/* Various letter escapes. Get the appropriate host-charset
				1276	value into C. */
				1277	case '\\': case '\'': case '"': case '?': break;
				1278
				1279	case '(': case '{': case '[': case '%':
				1280	/* '\(', etc, can be used at the beginning of a line in a long
				1281	string split onto multiple lines with \-newline, to prevent
				1282	Emacs or other text editors from getting confused. '\%' can
				1283	be used to prevent SCCS from mangling printf format strings. */
				1284	if (CPP_PEDANTIC (pfile))
				1285	goto unknown;
				1286	break;
				1287
				1288	case 'b': c = charconsts[1]; break;
				1289	case 'f': c = charconsts[3]; break;
				1290	case 'n': c = charconsts[4]; break;
				1291	case 'r': c = charconsts[5]; break;
				1292	case 't': c = charconsts[6]; break;
				1293	case 'v': c = charconsts[7]; break;
				1294
				1295	case 'a':
				1296	if (CPP_WTRADITIONAL (pfile))
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1297	cpp_error (pfile, CPP_DL_WARNING,
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1298	"the meaning of '\\a' is different in traditional C");
				1299	c = charconsts[0];
				1300	break;
				1301
				1302	case 'e': case 'E':
				1303	if (CPP_PEDANTIC (pfile))
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1304	cpp_error (pfile, CPP_DL_PEDWARN,
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1305	"non-ISO-standard escape sequence, '\\%c'", (int) c);
				1306	c = charconsts[2];
				1307	break;
				1308
				1309	default:
				1310	unknown:
				1311	if (ISGRAPH (c))
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1312	cpp_error (pfile, CPP_DL_PEDWARN,
Tom Tromey	709a22d	2009-08-17 17:34:53 +0000	[diff] [blame]	1313	"unknown escape sequence: '\\%c'", (int) c);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1314	else
Joseph Myers	178b58b	2005-11-03 23:08:18 +0000	[diff] [blame]	1315	{
				1316	/* diagnostic.c does not support "%03o". When it does, this
				1317	code can use %03o directly in the diagnostic again. */
				1318	char buf[32];
				1319	sprintf(buf, "%03o", (int) c);
				1320	cpp_error (pfile, CPP_DL_PEDWARN,
				1321	"unknown escape sequence: '\\%s'", buf);
				1322	}
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1323	}
				1324
				1325	/* Now convert what we have to the execution character set. */
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1326	if (!APPLY_CONVERSION (cvt, &c, 1, tbuf))
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1327	cpp_errno (pfile, CPP_DL_ERROR,
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1328	"converting escape sequence to execution character set");
				1329
				1330	return from + 1;
				1331	}
				1332
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1333	/* TYPE is a token type. The return value is the conversion needed to
				1334	convert from source to execution character set for the given type. */
				1335	static struct cset_converter
				1336	converter_for_type (cpp_reader *pfile, enum cpp_ttype type)
				1337	{
				1338	switch (type)
				1339	{
				1340	default:
				1341	return pfile->narrow_cset_desc;
				1342	case CPP_CHAR16:
				1343	case CPP_STRING16:
				1344	return pfile->char16_cset_desc;
				1345	case CPP_CHAR32:
				1346	case CPP_STRING32:
				1347	return pfile->char32_cset_desc;
				1348	case CPP_WCHAR:
				1349	case CPP_WSTRING:
				1350	return pfile->wide_cset_desc;
				1351	}
				1352	}
				1353
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1354	/* FROM is an array of cpp_string structures of length COUNT. These
				1355	are to be converted from the source to the execution character set,
				1356	escape sequences translated, and finally all are to be
				1357	concatenated. WIDE indicates whether or not to produce a wide
				1358	string. The result is written into TO. Returns true for success,
				1359	false for failure. */
				1360	bool
				1361	cpp_interpret_string (cpp_reader pfile, const cpp_string from, size_t count,
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1362	cpp_string *to, enum cpp_ttype type)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1363	{
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	1364	struct _cpp_strbuf tbuf;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1365	const uchar p, base, *limit;
				1366	size_t i;
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1367	struct cset_converter cvt = converter_for_type (pfile, type);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1368
				1369	tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, from->len);
Gabriel Dos Reis	c3f829c	2005-05-28 15:52:48 +0000	[diff] [blame]	1370	tbuf.text = XNEWVEC (uchar, tbuf.asize);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1371	tbuf.len = 0;
				1372
				1373	for (i = 0; i < count; i++)
				1374	{
				1375	p = from[i].text;
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1376	if (p == 'L' \|\| p == 'u' \|\| *p == 'U') p++;
Kazu Hirata	e0a21ab	2004-01-16 01:44:06 +0000	[diff] [blame]	1377	p++; /* Skip leading quote. */
				1378	limit = from[i].text + from[i].len - 1; /* Skip trailing quote. */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1379
				1380	for (;;)
				1381	{
				1382	base = p;
				1383	while (p < limit && *p != '\\')
				1384	p++;
				1385	if (p > base)
				1386	{
				1387	/* We have a run of normal characters; these can be fed
				1388	directly to convert_cset. */
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1389	if (!APPLY_CONVERSION (cvt, base, p - base, &tbuf))
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1390	goto fail;
				1391	}
				1392	if (p == limit)
				1393	break;
				1394
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1395	p = convert_escape (pfile, p + 1, limit, &tbuf, cvt);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1396	}
				1397	}
				1398	/* NUL-terminate the 'to' buffer and translate it to a cpp_string
				1399	structure. */
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1400	emit_numeric_escape (pfile, 0, &tbuf, cvt);
Gabriel Dos Reis	c3f829c	2005-05-28 15:52:48 +0000	[diff] [blame]	1401	tbuf.text = XRESIZEVEC (uchar, tbuf.text, tbuf.len);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1402	to->text = tbuf.text;
				1403	to->len = tbuf.len;
				1404	return true;
				1405
				1406	fail:
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1407	cpp_errno (pfile, CPP_DL_ERROR, "converting to execution character set");
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1408	free (tbuf.text);
				1409	return false;
				1410	}
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1411
				1412	/* Subroutine of do_line and do_linemarker. Convert escape sequences
				1413	in a string, but do not perform character set conversion. */
				1414	bool
Eric Christopher	423e95e	2004-02-12 02:25:03 +0000	[diff] [blame]	1415	cpp_interpret_string_notranslate (cpp_reader pfile, const cpp_string from,
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1416	size_t count, cpp_string *to,
				1417	enum cpp_ttype type ATTRIBUTE_UNUSED)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1418	{
				1419	struct cset_converter save_narrow_cset_desc = pfile->narrow_cset_desc;
				1420	bool retval;
				1421
				1422	pfile->narrow_cset_desc.func = convert_no_conversion;
				1423	pfile->narrow_cset_desc.cd = (iconv_t) -1;
H.J. Lu	0b7c73c	2008-06-12 17:03:41 +0000	[diff] [blame]	1424	pfile->narrow_cset_desc.width = CPP_OPTION (pfile, char_precision);
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1425
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1426	retval = cpp_interpret_string (pfile, from, count, to, CPP_STRING);
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1427
				1428	pfile->narrow_cset_desc = save_narrow_cset_desc;
				1429	return retval;
				1430	}
				1431
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1432
				1433	/* Subroutine of cpp_interpret_charconst which performs the conversion
				1434	to a number, for narrow strings. STR is the string structure returned
				1435	by cpp_interpret_string. PCHARS_SEEN and UNSIGNEDP are as for
				1436	cpp_interpret_charconst. */
				1437	static cppchar_t
				1438	narrow_str_to_charconst (cpp_reader *pfile, cpp_string str,
				1439	unsigned int pchars_seen, int unsignedp)
				1440	{
				1441	size_t width = CPP_OPTION (pfile, char_precision);
				1442	size_t max_chars = CPP_OPTION (pfile, int_precision) / width;
				1443	size_t mask = width_to_mask (width);
				1444	size_t i;
				1445	cppchar_t result, c;
				1446	bool unsigned_p;
				1447
				1448	/* The value of a multi-character character constant, or a
				1449	single-character character constant whose representation in the
				1450	execution character set is more than one byte long, is
				1451	implementation defined. This implementation defines it to be the
				1452	number formed by interpreting the byte sequence in memory as a
				1453	big-endian binary number. If overflow occurs, the high bytes are
				1454	lost, and a warning is issued.
				1455
				1456	We don't want to process the NUL terminator handed back by
				1457	cpp_interpret_string. */
				1458	result = 0;
				1459	for (i = 0; i < str.len - 1; i++)
				1460	{
				1461	c = str.text[i] & mask;
				1462	if (width < BITS_PER_CPPCHAR_T)
				1463	result = (result << width) \| c;
				1464	else
				1465	result = c;
				1466	}
				1467
				1468	if (i > max_chars)
				1469	{
				1470	i = max_chars;
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1471	cpp_error (pfile, CPP_DL_WARNING,
				1472	"character constant too long for its type");
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1473	}
				1474	else if (i > 1 && CPP_OPTION (pfile, warn_multichar))
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1475	cpp_error (pfile, CPP_DL_WARNING, "multi-character character constant");
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1476
				1477	/* Multichar constants are of type int and therefore signed. */
				1478	if (i > 1)
				1479	unsigned_p = 0;
				1480	else
				1481	unsigned_p = CPP_OPTION (pfile, unsigned_char);
				1482
				1483	/* Truncate the constant to its natural width, and simultaneously
				1484	sign- or zero-extend to the full width of cppchar_t.
				1485	For single-character constants, the value is WIDTH bits wide.
				1486	For multi-character constants, the value is INT_PRECISION bits wide. */
				1487	if (i > 1)
				1488	width = CPP_OPTION (pfile, int_precision);
				1489	if (width < BITS_PER_CPPCHAR_T)
				1490	{
				1491	mask = ((cppchar_t) 1 << width) - 1;
				1492	if (unsigned_p \|\| !(result & (1 << (width - 1))))
				1493	result &= mask;
				1494	else
				1495	result \|= ~mask;
				1496	}
				1497	*pchars_seen = i;
				1498	*unsignedp = unsigned_p;
				1499	return result;
				1500	}
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	1501
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1502	/* Subroutine of cpp_interpret_charconst which performs the conversion
				1503	to a number, for wide strings. STR is the string structure returned
				1504	by cpp_interpret_string. PCHARS_SEEN and UNSIGNEDP are as for
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1505	cpp_interpret_charconst. TYPE is the token type. */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1506	static cppchar_t
				1507	wide_str_to_charconst (cpp_reader *pfile, cpp_string str,
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1508	unsigned int pchars_seen, int unsignedp,
				1509	enum cpp_ttype type)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1510	{
				1511	bool bigend = CPP_OPTION (pfile, bytes_big_endian);
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1512	size_t width = converter_for_type (pfile, type).width;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1513	size_t cwidth = CPP_OPTION (pfile, char_precision);
				1514	size_t mask = width_to_mask (width);
				1515	size_t cmask = width_to_mask (cwidth);
				1516	size_t nbwc = width / cwidth;
				1517	size_t off, i;
				1518	cppchar_t result = 0, c;
				1519
				1520	/* This is finicky because the string is in the target's byte order,
				1521	which may not be our byte order. Only the last character, ignoring
				1522	the NUL terminator, is relevant. */
				1523	off = str.len - (nbwc * 2);
				1524	result = 0;
				1525	for (i = 0; i < nbwc; i++)
				1526	{
				1527	c = bigend ? str.text[off + i] : str.text[off + nbwc - i - 1];
				1528	result = (result << cwidth) \| (c & cmask);
				1529	}
				1530
				1531	/* Wide character constants have type wchar_t, and a single
				1532	character exactly fills a wchar_t, so a multi-character wide
				1533	character constant is guaranteed to overflow. */
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1534	if (str.len > nbwc * 2)
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1535	cpp_error (pfile, CPP_DL_WARNING,
				1536	"character constant too long for its type");
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1537
				1538	/* Truncate the constant to its natural width, and simultaneously
				1539	sign- or zero-extend to the full width of cppchar_t. */
				1540	if (width < BITS_PER_CPPCHAR_T)
				1541	{
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1542	if (type == CPP_CHAR16 \|\| type == CPP_CHAR32
				1543	\|\| CPP_OPTION (pfile, unsigned_wchar)
				1544	\|\| !(result & (1 << (width - 1))))
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1545	result &= mask;
				1546	else
				1547	result \|= ~mask;
				1548	}
				1549
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1550	if (type == CPP_CHAR16 \|\| type == CPP_CHAR32
				1551	\|\| CPP_OPTION (pfile, unsigned_wchar))
				1552	*unsignedp = 1;
				1553	else
				1554	*unsignedp = 0;
				1555
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1556	*pchars_seen = 1;
				1557	return result;
				1558	}
				1559
				1560	/* Interpret a (possibly wide) character constant in TOKEN.
				1561	PCHARS_SEEN points to a variable that is filled in with the number
				1562	of characters seen, and UNSIGNEDP to a variable that indicates
				1563	whether the result has signed type. */
				1564	cppchar_t
				1565	cpp_interpret_charconst (cpp_reader pfile, const cpp_token token,
				1566	unsigned int pchars_seen, int unsignedp)
				1567	{
				1568	cpp_string str = { 0, 0 };
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1569	bool wide = (token->type != CPP_CHAR);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1570	cppchar_t result;
				1571
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1572	/* an empty constant will appear as L'', u'', U'' or '' */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1573	if (token->val.str.len == (size_t) (2 + wide))
				1574	{
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1575	cpp_error (pfile, CPP_DL_ERROR, "empty character constant");
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1576	return 0;
				1577	}
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1578	else if (!cpp_interpret_string (pfile, &token->val.str, 1, &str, token->type))
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1579	return 0;
				1580
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1581	if (wide)
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1582	result = wide_str_to_charconst (pfile, str, pchars_seen, unsignedp,
				1583	token->type);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1584	else
				1585	result = narrow_str_to_charconst (pfile, str, pchars_seen, unsignedp);
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1586
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1587	if (str.text != token->val.str.text)
				1588	free ((void *)str.text);
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1589
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1590	return result;
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1591	}
Geoffrey Keating	47e2049	2005-03-12 10:44:06 +0000	[diff] [blame]	1592
				1593	/* Convert an identifier denoted by ID and LEN, which might contain
				1594	UCN escapes, to the source character set, either UTF-8 or
				1595	UTF-EBCDIC. Assumes that the identifier is actually a valid identifier. */
				1596	cpp_hashnode *
				1597	_cpp_interpret_identifier (cpp_reader pfile, const uchar id, size_t len)
				1598	{
				1599	/* It turns out that a UCN escape always turns into fewer characters
				1600	than the escape itself, so we can allocate a temporary in advance. */
Gabriel Dos Reis	c3f829c	2005-05-28 15:52:48 +0000	[diff] [blame]	1601	uchar * buf = (uchar *) alloca (len + 1);
Geoffrey Keating	47e2049	2005-03-12 10:44:06 +0000	[diff] [blame]	1602	uchar * bufp = buf;
				1603	size_t idp;
				1604
				1605	for (idp = 0; idp < len; idp++)
				1606	if (id[idp] != '\\')
				1607	*bufp++ = id[idp];
				1608	else
				1609	{
				1610	unsigned length = id[idp+1] == 'u' ? 4 : 8;
				1611	cppchar_t value = 0;
				1612	size_t bufleft = len - (bufp - buf);
				1613	int rval;
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	1614
Geoffrey Keating	47e2049	2005-03-12 10:44:06 +0000	[diff] [blame]	1615	idp += 2;
				1616	while (length && idp < len && ISXDIGIT (id[idp]))
				1617	{
				1618	value = (value << 4) + hex_value (id[idp]);
				1619	idp++;
				1620	length--;
				1621	}
				1622	idp--;
				1623
				1624	/* Special case for EBCDIC: if the identifier contains
				1625	a '$' specified using a UCN, translate it to EBCDIC. */
				1626	if (value == 0x24)
				1627	{
				1628	*bufp++ = '$';
				1629	continue;
				1630	}
				1631
				1632	rval = one_cppchar_to_utf8 (value, &bufp, &bufleft);
				1633	if (rval)
				1634	{
				1635	errno = rval;
				1636	cpp_errno (pfile, CPP_DL_ERROR,
				1637	"converting UCN to source character set");
				1638	break;
				1639	}
				1640	}
				1641
				1642	return CPP_HASHNODE (ht_lookup (pfile->hash_table,
				1643	buf, bufp - buf, HT_ALLOC));
				1644	}
				1645
Zack Weinberg	a29f62d	2004-09-18 00:56:19 +0000	[diff] [blame]	1646	/* Convert an input buffer (containing the complete contents of one
				1647	source file) from INPUT_CHARSET to the source character set. INPUT
				1648	points to the input buffer, SIZE is its allocated size, and LEN is
				1649	the length of the meaningful data within the buffer. The
Tom Tromey	688e7a5	2008-04-21 14:02:00 +0000	[diff] [blame]	1650	translated buffer is returned, *ST_SIZE is set to the length of
				1651	the meaningful data within the translated buffer, and *BUFFER_START
				1652	is set to the start of the returned buffer. *BUFFER_START may
				1653	differ from the return value in the case of a BOM or other ignored
				1654	marker information.
Zack Weinberg	a29f62d	2004-09-18 00:56:19 +0000	[diff] [blame]	1655
Tom Tromey	688e7a5	2008-04-21 14:02:00 +0000	[diff] [blame]	1656	INPUT is expected to have been allocated with xmalloc. This
				1657	function will either set *BUFFER_START to INPUT, or free it and set
				1658	*BUFFER_START to a pointer to another xmalloc-allocated block of
				1659	memory. */
Zack Weinberg	a29f62d	2004-09-18 00:56:19 +0000	[diff] [blame]	1660	uchar *
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	1661	_cpp_convert_input (cpp_reader pfile, const char input_charset,
Tom Tromey	688e7a5	2008-04-21 14:02:00 +0000	[diff] [blame]	1662	uchar *input, size_t size, size_t len,
				1663	const unsigned char *buffer_start, off_t st_size)
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	1664	{
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	1665	struct cset_converter input_cset;
				1666	struct _cpp_strbuf to;
Tom Tromey	688e7a5	2008-04-21 14:02:00 +0000	[diff] [blame]	1667	unsigned char *buffer;
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	1668
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	1669	input_cset = init_iconv_desc (pfile, SOURCE_CHARSET, input_charset);
				1670	if (input_cset.func == convert_no_conversion)
				1671	{
				1672	to.text = input;
				1673	to.asize = size;
				1674	to.len = len;
				1675	}
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	1676	else
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	1677	{
				1678	to.asize = MAX (65536, len);
Gabriel Dos Reis	c3f829c	2005-05-28 15:52:48 +0000	[diff] [blame]	1679	to.text = XNEWVEC (uchar, to.asize);
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	1680	to.len = 0;
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	1681
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	1682	if (!APPLY_CONVERSION (input_cset, input, len, &to))
				1683	cpp_error (pfile, CPP_DL_ERROR,
				1684	"failure to convert %s to %s",
				1685	CPP_OPTION (pfile, input_charset), SOURCE_CHARSET);
				1686
				1687	free (input);
				1688	}
				1689
				1690	/* Clean up the mess. */
				1691	if (input_cset.func == convert_using_iconv)
				1692	iconv_close (input_cset.cd);
				1693
				1694	/* Resize buffer if we allocated substantially too much, or if we
				1695	haven't enough space for the \n-terminator. */
				1696	if (to.len + 4096 < to.asize \|\| to.len >= to.asize)
Gabriel Dos Reis	c3f829c	2005-05-28 15:52:48 +0000	[diff] [blame]	1697	to.text = XRESIZEVEC (uchar, to.text, to.len + 1);
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	1698
Devang Patel	04c90ee	2005-02-19 11:48:02 -0800	[diff] [blame]	1699	/* If the file is using old-school Mac line endings (\r only),
				1700	terminate with another \r, not an \n, so that we do not mistake
				1701	the \r\n sequence for a single DOS line ending and erroneously
				1702	issue the "No newline at end of file" diagnostic. */
Tom Tromey	30b0edc	2006-12-28 18:45:48 +0000	[diff] [blame]	1703	if (to.len && to.text[to.len - 1] == '\r')
Devang Patel	04c90ee	2005-02-19 11:48:02 -0800	[diff] [blame]	1704	to.text[to.len] = '\r';
				1705	else
				1706	to.text[to.len] = '\n';
				1707
Tom Tromey	688e7a5	2008-04-21 14:02:00 +0000	[diff] [blame]	1708	buffer = to.text;
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	1709	*st_size = to.len;
Tom Tromey	688e7a5	2008-04-21 14:02:00 +0000	[diff] [blame]	1710	#if HOST_CHARSET == HOST_CHARSET_ASCII
				1711	/* The HOST_CHARSET test just above ensures that the source charset
				1712	is UTF-8. So, ignore a UTF-8 BOM if we see one. Note that
				1713	glib'c UTF-8 iconv() provider (as of glibc 2.7) does not ignore a
				1714	BOM -- however, even if it did, we would still need this code due
				1715	to the 'convert_no_conversion' case. */
				1716	if (to.len >= 3 && to.text[0] == 0xef && to.text[1] == 0xbb
				1717	&& to.text[2] == 0xbf)
				1718	{
				1719	*st_size -= 3;
				1720	buffer += 3;
				1721	}
				1722	#endif
				1723
				1724	*buffer_start = to.text;
				1725	return buffer;
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	1726	}
				1727
Zack Weinberg	a29f62d	2004-09-18 00:56:19 +0000	[diff] [blame]	1728	/* Decide on the default encoding to assume for input files. */
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	1729	const char *
				1730	_cpp_default_encoding (void)
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	1731	{
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	1732	const char *current_encoding = NULL;
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	1733
Paolo Bonzini	4f4e53dd	2004-05-24 10:50:45 +0000	[diff] [blame]	1734	/* We disable this because the default codeset is 7-bit ASCII on
				1735	most platforms, and this causes conversion failures on every
				1736	file in GCC that happens to have one of the upper 128 characters
				1737	in it -- most likely, as part of the name of a contributor.
				1738	We should definitely recognize in-band markers of file encoding,
				1739	like:
				1740	- the appropriate Unicode byte-order mark (FE FF) to recognize
				1741	UTF16 and UCS4 (in both big-endian and little-endian flavors)
				1742	and UTF8
Zack Weinberg	c6e8380	2004-06-05 20:58:06 +0000	[diff] [blame]	1743	- a "#i", "#d", "/ *", "//", " #p" or "#p" (for #pragma) to
Paolo Bonzini	4f4e53dd	2004-05-24 10:50:45 +0000	[diff] [blame]	1744	distinguish ASCII and EBCDIC.
				1745	- now we can parse something like "#pragma GCC encoding <xyz>
				1746	on the first line, or even Emacs/VIM's mode line tags (there's
				1747	a problem here in that VIM uses the last line, and Emacs has
Zack Weinberg	a29f62d	2004-09-18 00:56:19 +0000	[diff] [blame]	1748	its more elaborate "local variables" convention).
Paolo Bonzini	4f4e53dd	2004-05-24 10:50:45 +0000	[diff] [blame]	1749	- investigate whether Java has another common convention, which
				1750	would be friendly to support.
				1751	(Zack Weinberg and Paolo Bonzini, May 20th 2004) */
				1752	#if defined (HAVE_LOCALE_H) && defined (HAVE_LANGINFO_CODESET) && 0
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	1753	setlocale (LC_CTYPE, "");
				1754	current_encoding = nl_langinfo (CODESET);
				1755	#endif
				1756	if (current_encoding == NULL \|\| *current_encoding == '\0')
				1757	current_encoding = SOURCE_CHARSET;
				1758
				1759	return current_encoding;
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	1760	}