Blame - libcpp/charset.c - toolchain/gcc

blob: 8e92bc65f90097cb9770bca0c4539087fc94d2d7 [file] [log] [blame]

Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1	/* CPP Library - charsets
Jakub Jelinek	5624e56	2015-01-05 13:33:28 +0100	[diff] [blame]	2	Copyright (C) 1998-2015 Free Software Foundation, Inc.
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	3
				4	Broken out of c-lex.c Apr 2003, adding valid C99 UCN ranges.
				5
				6	This program is free software; you can redistribute it and/or modify it
				7	under the terms of the GNU General Public License as published by the
Jakub Jelinek	748086b	2009-04-09 17:00:19 +0200	[diff] [blame]	8	Free Software Foundation; either version 3, or (at your option) any
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	9	later version.
				10
				11	This program is distributed in the hope that it will be useful,
				12	but WITHOUT ANY WARRANTY; without even the implied warranty of
				13	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
				14	GNU General Public License for more details.
				15
				16	You should have received a copy of the GNU General Public License
Jakub Jelinek	748086b	2009-04-09 17:00:19 +0200	[diff] [blame]	17	along with this program; see the file COPYING3. If not see
				18	<http://www.gnu.org/licenses/>. */
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	19
				20	#include "config.h"
				21	#include "system.h"
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	22	#include "cpplib.h"
Paolo Bonzini	4f4e53dd	2004-05-24 10:50:45 +0000	[diff] [blame]	23	#include "internal.h"
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	24
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	25	/* Character set handling for C-family languages.
				26
				27	Terminological note: In what follows, "charset" or "character set"
				28	will be taken to mean both an abstract set of characters and an
				29	encoding for that set.
				30
				31	The C99 standard discusses two character sets: source and execution.
				32	The source character set is used for internal processing in translation
				33	phases 1 through 4; the execution character set is used thereafter.
				34	Both are required by 5.2.1.2p1 to be multibyte encodings, not wide
				35	character encodings (see 3.7.2, 3.7.3 for the standardese meanings
				36	of these terms). Furthermore, the "basic character set" (listed in
				37	5.2.1p3) is to be encoded in each with values one byte wide, and is
				38	to appear in the initial shift state.
				39
				40	It is not explicitly mentioned, but there is also a "wide execution
				41	character set" used to encode wide character constants and wide
				42	string literals; this is supposed to be the result of applying the
				43	standard library function mbstowcs() to an equivalent narrow string
				44	(6.4.5p5). However, the behavior of hexadecimal and octal
				45	\-escapes is at odds with this; they are supposed to be translated
				46	directly to wchar_t values (6.4.4.4p5,6).
				47
				48	The source character set is not necessarily the character set used
				49	to encode physical source files on disk; translation phase 1 converts
				50	from whatever that encoding is to the source character set.
				51
				52	The presence of universal character names in C99 (6.4.3 et seq.)
				53	forces the source character set to be isomorphic to ISO 10646,
				54	that is, Unicode. There is no such constraint on the execution
				55	character set; note also that the conversion from source to
				56	execution character set does not occur for identifiers (5.1.1.2p1#5).
				57
				58	For convenience of implementation, the source character set's
				59	encoding of the basic character set should be identical to the
				60	execution character set OF THE HOST SYSTEM's encoding of the basic
				61	character set, and it should not be a state-dependent encoding.
				62
				63	cpplib uses UTF-8 or UTF-EBCDIC for the source character set,
				64	depending on whether the host is based on ASCII or EBCDIC (see
				65	respectively Unicode section 2.3/ISO10646 Amendment 2, and Unicode
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	66	Technical Report #16). With limited exceptions, it relies on the
				67	system library's iconv() primitive to do charset conversion
				68	(specified in SUSv2). */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	69
				70	#if !HAVE_ICONV
				71	/* Make certain that the uses of iconv(), iconv_open(), iconv_close()
				72	below, which are guarded only by if statements with compile-time
				73	constant conditions, do not cause link errors. */
/* The iconv_open and iconv stubs set errno to EINVAL and yield the
   failure value of the function they replace; iconv_close is a no-op. */
				74	#define iconv_open(x, y) (errno = EINVAL, (iconv_t)-1)
Zack Weinberg	f1c4bc4	2003-07-05 16:44:29 +0200	[diff] [blame]	75	#define iconv(a,b,c,d,e) (errno = EINVAL, (size_t)-1)
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	76	#define iconv_close(x) (void)0
/* Defined empty so that any use of ICONV_CONST still expands to valid code. */
Andrew Pinski	5beadb3	2003-07-07 04:46:29 +0000	[diff] [blame]	77	#define ICONV_CONST
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	78	#endif
				79
/* Pick the name of the source character set from the host's basic
   character set.  LAST_POSSIBLY_BASIC_SOURCE_CHAR is presumably the
   highest byte value that can still encode a basic source character
   (NOTE(review): inferred from the name; confirm against its users).  */
				80	#if HOST_CHARSET == HOST_CHARSET_ASCII
				81	#define SOURCE_CHARSET "UTF-8"
Zack Weinberg	c5ff069	2005-02-20 17:01:32 +0000	[diff] [blame]	82	#define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0x7e
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	83	#elif HOST_CHARSET == HOST_CHARSET_EBCDIC
				84	#define SOURCE_CHARSET "UTF-EBCDIC"
Zack Weinberg	c5ff069	2005-02-20 17:01:32 +0000	[diff] [blame]	85	#define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0xFF
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	86	#else
				87	#error "Unrecognized basic host character set"
				88	#endif
				89
/* Hosts without EILSEQ report ill-formed input as EINVAL instead. */
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	90	#ifndef EILSEQ
				91	#define EILSEQ EINVAL
				92	#endif
				93
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	94	/* This structure is used for a resizable string buffer throughout. */
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	95	/* Don't call it strbuf, as that conflicts with unistd.h on systems
Kazu Hirata	0ee55ad	2003-10-05 13:09:48 +0000	[diff] [blame]	96	such as DYNIX/ptx where unistd.h includes stropts.h. */
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	97	struct _cpp_strbuf
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	98	{
				99	uchar *text;	/* Start of the buffer; resized with XRESIZEVEC as it grows.  */
				100	size_t asize;	/* Number of bytes currently allocated for TEXT.  */
				101	size_t len;	/* Number of bytes of TEXT already filled; converters append here.  */
				102	};
				103
				104	/* This is enough to hold any string that fits on a single 80-column
				105	line, even if iconv quadruples its size (e.g. conversion from
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	106	ASCII to UTF-32) rounded up to a power of two. */
/* Also the step by which conversion_loop and convert_using_iconv enlarge
   an output buffer each time a converter reports E2BIG.  */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	107	#define OUTBUF_BLOCK_SIZE 256
				108
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	109	/* Conversions between UTF-8 and UTF-16/32 are implemented by custom
				110	logic. This is because a depressing number of systems lack iconv,
				111	or have iconv libraries that do not do these conversions, so
				112	we need a fallback implementation for them. To ensure the fallback
				113	doesn't break due to neglect, it is used on all systems.
				114
				115	UTF-32 encoding is nice and simple: a four-byte binary number,
				116	constrained to the range 00000000-7FFFFFFF to avoid questions of
				117	signedness. We do have to cope with big- and little-endian
				118	variants.
				119
				120	UTF-16 encoding uses two-byte binary numbers, again in big- and
				121	little-endian variants, for all values in the 00000000-0000FFFF
				122	range. Values in the 00010000-0010FFFF range are encoded as pairs
				123	of two-byte numbers, called "surrogate pairs": given a number S in
				124	this range, it is mapped to a pair (H, L) as follows:
				125
				126	H = (S - 0x10000) / 0x400 + 0xD800
				127	L = (S - 0x10000) % 0x400 + 0xDC00
				128
				129	Two-byte values in the D800...DFFF range are ill-formed except as a
				130	component of a surrogate pair. Even if the encoding within a
				131	two-byte value is little-endian, the H member of the surrogate pair
				132	comes first.
				133
				134	There is no way to encode values in the 00110000-7FFFFFFF range,
				135	which is not currently a problem as there are no assigned code
				136	points in that range; however, the author expects that it will
				137	eventually become necessary to abandon UTF-16 due to this
				138	limitation. Note also that, because of these pairs, UTF-16 does
				139	not meet the requirements of the C standard for a wide character
				140	encoding (see 3.7.3 and 6.4.4.4p11).
				141
				142	UTF-8 encoding looks like this:
				143
				144	value range encoded as
				145	00000000-0000007F 0xxxxxxx
				146	00000080-000007FF 110xxxxx 10xxxxxx
				147	00000800-0000FFFF 1110xxxx 10xxxxxx 10xxxxxx
				148	00010000-001FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
				149	00200000-03FFFFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
				150	04000000-7FFFFFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
				151
				152	Values in the 0000D800 ... 0000DFFF range (surrogates) are invalid,
				153	which means that three-byte sequences ED xx yy, with A0 <= xx <= BF,
				154	never occur. Note also that any value that can be encoded by a
				155	given row of the table can also be encoded by all successive rows,
				156	but this is not done; only the shortest possible encoding for any
				157	given value is valid. For instance, the character 07C0 could be
				158	encoded as any of DF 80, E0 9F 80, F0 80 9F 80, F8 80 80 9F 80, or
				159	FC 80 80 80 9F 80. Only the first is valid.
				160
				161	An implementation note: the transformation from UTF-16 to UTF-8, or
				162	vice versa, is easiest done by using UTF-32 as an intermediary. */
				163
				164	/* Internal primitives which go from an UTF-8 byte stream to native-endian
				165	UTF-32 in a cppchar_t, or vice versa; this avoids an extra marshal/unmarshal
				166	operation in several places below. */
				167	static inline int
				168	one_utf8_to_cppchar (const uchar *inbufp, size_t inbytesleftp,
				169	cppchar_t *cp)
				170	{
Joseph Myers	9e322bc	2009-05-03 12:59:26 +0100	[diff] [blame]	171	static const uchar masks[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x03, 0x01 };
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	172	static const uchar patns[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	173
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	174	cppchar_t c;
				175	const uchar inbuf = inbufp;
				176	size_t nbytes, i;
				177
				178	if (*inbytesleftp < 1)
				179	return EINVAL;
				180
				181	c = *inbuf;
				182	if (c < 0x80)
				183	{
				184	*cp = c;
				185	*inbytesleftp -= 1;
				186	*inbufp += 1;
				187	return 0;
				188	}
				189
				190	/* The number of leading 1-bits in the first byte indicates how many
				191	bytes follow. */
				192	for (nbytes = 2; nbytes < 7; nbytes++)
				193	if ((c & ~masks[nbytes-1]) == patns[nbytes-1])
				194	goto found;
				195	return EILSEQ;
				196	found:
				197
				198	if (*inbytesleftp < nbytes)
				199	return EINVAL;
				200
				201	c = (c & masks[nbytes-1]);
				202	inbuf++;
				203	for (i = 1; i < nbytes; i++)
				204	{
				205	cppchar_t n = *inbuf++;
				206	if ((n & 0xC0) != 0x80)
				207	return EILSEQ;
				208	c = ((c << 6) + (n & 0x3F));
				209	}
				210
				211	/* Make sure the shortest possible encoding was used. */
				212	if (c <= 0x7F && nbytes > 1) return EILSEQ;
				213	if (c <= 0x7FF && nbytes > 2) return EILSEQ;
				214	if (c <= 0xFFFF && nbytes > 3) return EILSEQ;
				215	if (c <= 0x1FFFFF && nbytes > 4) return EILSEQ;
				216	if (c <= 0x3FFFFFF && nbytes > 5) return EILSEQ;
				217
				218	/* Make sure the character is valid. */
				219	if (c > 0x7FFFFFFF \|\| (c >= 0xD800 && c <= 0xDFFF)) return EILSEQ;
				220
				221	*cp = c;
				222	*inbufp = inbuf;
				223	*inbytesleftp -= nbytes;
				224	return 0;
				225	}
				226
				227	static inline int
				228	one_cppchar_to_utf8 (cppchar_t c, uchar *outbufp, size_t outbytesleftp)
				229	{
				230	static const uchar masks[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
				231	static const uchar limits[6] = { 0x80, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE };
				232	size_t nbytes;
				233	uchar buf[6], *p = &buf[6];
				234	uchar outbuf = outbufp;
				235
				236	nbytes = 1;
				237	if (c < 0x80)
				238	*--p = c;
				239	else
				240	{
				241	do
				242	{
				243	*--p = ((c & 0x3F) \| 0x80);
				244	c >>= 6;
				245	nbytes++;
				246	}
				247	while (c >= 0x3F \|\| (c & limits[nbytes-1]));
				248	*--p = (c \| masks[nbytes-1]);
				249	}
				250
				251	if (*outbytesleftp < nbytes)
				252	return E2BIG;
				253
				254	while (p < &buf[6])
				255	outbuf++ = p++;
				256	*outbytesleftp -= nbytes;
				257	*outbufp = outbuf;
				258	return 0;
				259	}
				260
				261	/* The following four functions transform one character between the two
				262	encodings named in the function name. All have the signature
				263	int (*)(iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
				264	uchar **outbufp, size_t *outbytesleftp)
				265
				266	BIGEND must have the value 0 or 1, coerced to (iconv_t); it is
				267	interpreted as a boolean indicating whether big-endian or
				268	little-endian encoding is to be used for the member of the pair
				269	that is not UTF-8.
				270
				271	INBUFP, INBYTESLEFTP, OUTBUFP, OUTBYTESLEFTP work exactly as they
				272	do for iconv.
				273
				274	The return value is either 0 for success, or an errno value for
				275	failure, which may be E2BIG (need more space), EILSEQ (ill-formed
				276	input sequence), or EINVAL (incomplete input sequence). */
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	277
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	278	static inline int
				279	one_utf8_to_utf32 (iconv_t bigend, const uchar *inbufp, size_t inbytesleftp,
				280	uchar *outbufp, size_t outbytesleftp)
				281	{
				282	uchar *outbuf;
Jan Hubicka	bd18496	2003-10-25 17:12:01 +0200	[diff] [blame]	283	cppchar_t s = 0;
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	284	int rval;
				285
				286	/* Check for space first, since we know exactly how much we need. */
				287	if (*outbytesleftp < 4)
				288	return E2BIG;
				289
				290	rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
				291	if (rval)
				292	return rval;
				293
				294	outbuf = *outbufp;
				295	outbuf[bigend ? 3 : 0] = (s & 0x000000FF);
				296	outbuf[bigend ? 2 : 1] = (s & 0x0000FF00) >> 8;
				297	outbuf[bigend ? 1 : 2] = (s & 0x00FF0000) >> 16;
				298	outbuf[bigend ? 0 : 3] = (s & 0xFF000000) >> 24;
				299
				300	*outbufp += 4;
				301	*outbytesleftp -= 4;
				302	return 0;
				303	}
				304
				305	static inline int
				306	one_utf32_to_utf8 (iconv_t bigend, const uchar *inbufp, size_t inbytesleftp,
				307	uchar *outbufp, size_t outbytesleftp)
				308	{
				309	cppchar_t s;
				310	int rval;
				311	const uchar *inbuf;
				312
				313	if (*inbytesleftp < 4)
				314	return EINVAL;
				315
				316	inbuf = *inbufp;
				317
				318	s = inbuf[bigend ? 0 : 3] << 24;
				319	s += inbuf[bigend ? 1 : 2] << 16;
				320	s += inbuf[bigend ? 2 : 1] << 8;
				321	s += inbuf[bigend ? 3 : 0];
				322
				323	if (s >= 0x7FFFFFFF \|\| (s >= 0xD800 && s <= 0xDFFF))
				324	return EILSEQ;
				325
				326	rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
				327	if (rval)
				328	return rval;
				329
				330	*inbufp += 4;
				331	*inbytesleftp -= 4;
				332	return 0;
				333	}
				334
				335	static inline int
				336	one_utf8_to_utf16 (iconv_t bigend, const uchar *inbufp, size_t inbytesleftp,
				337	uchar *outbufp, size_t outbytesleftp)
				338	{
				339	int rval;
Richard Henderson	671ca9e	2003-10-30 08:36:27 -0800	[diff] [blame]	340	cppchar_t s = 0;
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	341	const uchar save_inbuf = inbufp;
				342	size_t save_inbytesleft = *inbytesleftp;
				343	uchar outbuf = outbufp;
				344
				345	rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
				346	if (rval)
				347	return rval;
				348
				349	if (s > 0x0010FFFF)
				350	{
				351	*inbufp = save_inbuf;
				352	*inbytesleftp = save_inbytesleft;
				353	return EILSEQ;
				354	}
				355
Joseph Myers	81fee4a	2014-11-29 01:56:06 +0000	[diff] [blame]	356	if (s <= 0xFFFF)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	357	{
				358	if (*outbytesleftp < 2)
				359	{
				360	*inbufp = save_inbuf;
				361	*inbytesleftp = save_inbytesleft;
				362	return E2BIG;
				363	}
				364	outbuf[bigend ? 1 : 0] = (s & 0x00FF);
				365	outbuf[bigend ? 0 : 1] = (s & 0xFF00) >> 8;
				366
				367	*outbufp += 2;
				368	*outbytesleftp -= 2;
				369	return 0;
				370	}
				371	else
				372	{
				373	cppchar_t hi, lo;
				374
				375	if (*outbytesleftp < 4)
				376	{
				377	*inbufp = save_inbuf;
				378	*inbytesleftp = save_inbytesleft;
				379	return E2BIG;
				380	}
				381
				382	hi = (s - 0x10000) / 0x400 + 0xD800;
				383	lo = (s - 0x10000) % 0x400 + 0xDC00;
				384
				385	/* Even if we are little-endian, put the high surrogate first.
				386	??? Matches practice? */
				387	outbuf[bigend ? 1 : 0] = (hi & 0x00FF);
				388	outbuf[bigend ? 0 : 1] = (hi & 0xFF00) >> 8;
				389	outbuf[bigend ? 3 : 2] = (lo & 0x00FF);
				390	outbuf[bigend ? 2 : 3] = (lo & 0xFF00) >> 8;
				391
				392	*outbufp += 4;
				393	*outbytesleftp -= 4;
				394	return 0;
				395	}
				396	}
				397
				398	static inline int
				399	one_utf16_to_utf8 (iconv_t bigend, const uchar *inbufp, size_t inbytesleftp,
				400	uchar *outbufp, size_t outbytesleftp)
				401	{
				402	cppchar_t s;
				403	const uchar inbuf = inbufp;
				404	int rval;
				405
				406	if (*inbytesleftp < 2)
				407	return EINVAL;
				408	s = inbuf[bigend ? 0 : 1] << 8;
				409	s += inbuf[bigend ? 1 : 0];
				410
				411	/* Low surrogate without immediately preceding high surrogate is invalid. */
				412	if (s >= 0xDC00 && s <= 0xDFFF)
				413	return EILSEQ;
				414	/* High surrogate must have a following low surrogate. */
				415	else if (s >= 0xD800 && s <= 0xDBFF)
				416	{
				417	cppchar_t hi = s, lo;
				418	if (*inbytesleftp < 4)
				419	return EINVAL;
				420
				421	lo = inbuf[bigend ? 2 : 3] << 8;
				422	lo += inbuf[bigend ? 3 : 2];
				423
				424	if (lo < 0xDC00 \|\| lo > 0xDFFF)
				425	return EILSEQ;
				426
				427	s = (hi - 0xD800) * 0x400 + (lo - 0xDC00) + 0x10000;
				428	}
				429
				430	rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
				431	if (rval)
				432	return rval;
				433
				434	/* Success - update the input pointers (one_cppchar_to_utf8 has done
				435	the output pointers for us). */
				436	if (s <= 0xFFFF)
				437	{
				438	*inbufp += 2;
				439	*inbytesleftp -= 2;
				440	}
				441	else
				442	{
				443	*inbufp += 4;
				444	*inbytesleftp -= 4;
				445	}
				446	return 0;
				447	}
				448
				449	/* Helper routine for the next few functions. The 'const' on
				450	one_conversion means that we promise not to modify what function is
Kazu Hirata	4ed4321	2003-07-12 22:49:48 +0000	[diff] [blame]	451	pointed to, which lets the inliner see through it. */
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	452
				453	static inline bool
				454	conversion_loop (int (const one_conversion)(iconv_t, const uchar , size_t ,
				455	uchar *, size_t ),
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	456	iconv_t cd, const uchar from, size_t flen, struct _cpp_strbuf to)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	457	{
				458	const uchar *inbuf;
				459	uchar *outbuf;
				460	size_t inbytesleft, outbytesleft;
				461	int rval;
				462
				463	inbuf = from;
				464	inbytesleft = flen;
				465	outbuf = to->text + to->len;
				466	outbytesleft = to->asize - to->len;
				467
				468	for (;;)
				469	{
				470	do
				471	rval = one_conversion (cd, &inbuf, &inbytesleft,
				472	&outbuf, &outbytesleft);
				473	while (inbytesleft && !rval);
				474
				475	if (__builtin_expect (inbytesleft == 0, 1))
				476	{
				477	to->len = to->asize - outbytesleft;
				478	return true;
				479	}
				480	if (rval != E2BIG)
				481	{
				482	errno = rval;
				483	return false;
				484	}
				485
				486	outbytesleft += OUTBUF_BLOCK_SIZE;
				487	to->asize += OUTBUF_BLOCK_SIZE;
Gabriel Dos Reis	c3f829c	2005-05-28 15:52:48 +0000	[diff] [blame]	488	to->text = XRESIZEVEC (uchar, to->text, to->asize);
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	489	outbuf = to->text + to->asize - outbytesleft;
				490	}
				491	}
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	492
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	493
				494	/* These functions convert entire strings between character sets.
				495	They all have the signature
				496
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	497	bool (*)(iconv_t cd, const uchar *from, size_t flen, struct _cpp_strbuf *to);
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	498
				499	The input string FROM is converted as specified by the function
				500	name plus the iconv descriptor CD (which may be fake), and the
				501	result appended to TO. On any error, false is returned, otherwise true. */
				502
				503	/* These four use the custom conversion code above. */
				504	static bool
				505	convert_utf8_utf16 (iconv_t cd, const uchar *from, size_t flen,
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	506	struct _cpp_strbuf *to)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	507	{
				508	return conversion_loop (one_utf8_to_utf16, cd, from, flen, to);
				509	}
				510
				511	static bool
				512	convert_utf8_utf32 (iconv_t cd, const uchar *from, size_t flen,
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	513	struct _cpp_strbuf *to)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	514	{
				515	return conversion_loop (one_utf8_to_utf32, cd, from, flen, to);
				516	}
				517
				518	static bool
				519	convert_utf16_utf8 (iconv_t cd, const uchar *from, size_t flen,
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	520	struct _cpp_strbuf *to)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	521	{
				522	return conversion_loop (one_utf16_to_utf8, cd, from, flen, to);
				523	}
				524
				525	static bool
				526	convert_utf32_utf8 (iconv_t cd, const uchar *from, size_t flen,
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	527	struct _cpp_strbuf *to)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	528	{
				529	return conversion_loop (one_utf32_to_utf8, cd, from, flen, to);
				530	}
				531
				532	/* Identity conversion, used when we have no alternative. */
				533	static bool
				534	convert_no_conversion (iconv_t cd ATTRIBUTE_UNUSED,
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	535	const uchar from, size_t flen, struct _cpp_strbuf to)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	536	{
				537	if (to->len + flen > to->asize)
				538	{
				539	to->asize = to->len + flen;
Bernd Edlinger	dc257367	2014-10-02 00:06:28 +0000	[diff] [blame]	540	to->asize += to->asize / 4;
Gabriel Dos Reis	c3f829c	2005-05-28 15:52:48 +0000	[diff] [blame]	541	to->text = XRESIZEVEC (uchar, to->text, to->asize);
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	542	}
				543	memcpy (to->text + to->len, from, flen);
				544	to->len += flen;
				545	return true;
				546	}
				547
				548	/* And this one uses the system iconv primitive. It's a little
				549	different, since iconv's interface is a little different. */
#if HAVE_ICONV

/* Enlarge TO by OUTBUF_BLOCK_SIZE bytes and re-derive OUTBUF from the
   (possibly moved) buffer.  Relies on the locals of convert_using_iconv.  */
#define CONVERT_ICONV_GROW_BUFFER \
  do { \
    outbytesleft += OUTBUF_BLOCK_SIZE; \
    to->asize += OUTBUF_BLOCK_SIZE; \
    to->text = XRESIZEVEC (uchar, to->text, to->asize); \
    outbuf = (char *)to->text + to->asize - outbytesleft; \
  } while (0)

/* Convert the FLEN bytes at FROM with the iconv descriptor CD and
   append the result to TO, growing TO whenever iconv reports E2BIG.
   Returns false if CD cannot be reset, if iconv fails for any reason
   other than lack of output space, or if the final flush fails even
   after one buffer extension; returns true otherwise.  */
static bool
convert_using_iconv (iconv_t cd, const uchar *from, size_t flen,
                     struct _cpp_strbuf *to)
{
  ICONV_CONST char *inbuf = (ICONV_CONST char *)from;
  size_t inbytesleft = flen;
  char *outbuf;
  size_t outbytesleft;

  /* Reset conversion descriptor and check that it is valid.  */
  if (iconv (cd, 0, 0, 0, 0) == (size_t)-1)
    return false;

  outbuf = (char *)to->text + to->len;
  outbytesleft = to->asize - to->len;

  /* Keep converting, extending the output on E2BIG, until no input
     is left.  */
  iconv (cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
  while (inbytesleft != 0)
    {
      if (errno != E2BIG)
        return false;

      CONVERT_ICONV_GROW_BUFFER;
      iconv (cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
    }

  /* Close out any shift states, returning to the initial state.  One
     retry with a larger buffer is allowed.  */
  if (iconv (cd, 0, 0, &outbuf, &outbytesleft) == (size_t)-1)
    {
      if (errno != E2BIG)
        return false;

      CONVERT_ICONV_GROW_BUFFER;
      if (iconv (cd, 0, 0, &outbuf, &outbytesleft) == (size_t)-1)
        return false;
    }

  to->len = to->asize - outbytesleft;
  return true;
}
#else
#define convert_using_iconv 0 /* prevent undefined symbol error below */
#endif
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	605
				606	/* Arrange for the above custom conversion logic to be used automatically
				607	when conversion between a suitable pair of character sets is requested. */
				608
				609	#define APPLY_CONVERSION(CONVERTER, FROM, FLEN, TO) \
				610	CONVERTER.func (CONVERTER.cd, FROM, FLEN, TO)
				611
/* One row of the built-in converter table.  */
Jan Hubicka	d87fc69	2014-09-22 21:43:02 +0200	[diff] [blame]	612	struct cpp_conversion
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	613	{
				614	const char *pair;	/* "FROM/TO" charset names; init_iconv_desc matches it with strcasecmp.  */
				615	convert_f func;	/* Converter installed when PAIR matches.  */
				616	iconv_t fake_cd;	/* Stored as the converter's cd; the custom converters read it as their big-endian flag.  */
				617	};
/* Charset pairs handled by the custom UTF-8 <-> UTF-16/32 code instead of
   iconv.  Each pair is listed for both byte orders: fake_cd 0 for the LE
   variant, 1 for the BE variant.  */
Jan Hubicka	d87fc69	2014-09-22 21:43:02 +0200	[diff] [blame]	618	static const struct cpp_conversion conversion_tab[] = {
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	619	{ "UTF-8/UTF-32LE", convert_utf8_utf32, (iconv_t)0 },
				620	{ "UTF-8/UTF-32BE", convert_utf8_utf32, (iconv_t)1 },
				621	{ "UTF-8/UTF-16LE", convert_utf8_utf16, (iconv_t)0 },
				622	{ "UTF-8/UTF-16BE", convert_utf8_utf16, (iconv_t)1 },
				623	{ "UTF-32LE/UTF-8", convert_utf32_utf8, (iconv_t)0 },
				624	{ "UTF-32BE/UTF-8", convert_utf32_utf8, (iconv_t)1 },
				625	{ "UTF-16LE/UTF-8", convert_utf16_utf8, (iconv_t)0 },
				626	{ "UTF-16BE/UTF-8", convert_utf16_utf8, (iconv_t)1 },
				627	};
				628
				629	/* Subroutine of cpp_init_iconv: initialize and return a
				630	cset_converter structure for conversion from FROM to TO. If
				631	iconv_open() fails, issue an error and return an identity
				632	converter. Silently return an identity converter if FROM and TO
				633	are identical. */
				634	static struct cset_converter
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	635	init_iconv_desc (cpp_reader pfile, const char to, const char *from)
				636	{
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	637	struct cset_converter ret;
				638	char *pair;
				639	size_t i;
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	640
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	641	if (!strcasecmp (to, from))
				642	{
				643	ret.func = convert_no_conversion;
				644	ret.cd = (iconv_t) -1;
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	645	ret.width = -1;
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	646	return ret;
				647	}
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	648
Gabriel Dos Reis	c3f829c	2005-05-28 15:52:48 +0000	[diff] [blame]	649	pair = (char *) alloca(strlen(to) + strlen(from) + 2);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	650
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	651	strcpy(pair, from);
				652	strcat(pair, "/");
				653	strcat(pair, to);
				654	for (i = 0; i < ARRAY_SIZE (conversion_tab); i++)
				655	if (!strcasecmp (pair, conversion_tab[i].pair))
				656	{
				657	ret.func = conversion_tab[i].func;
				658	ret.cd = conversion_tab[i].fake_cd;
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	659	ret.width = -1;
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	660	return ret;
				661	}
				662
				663	/* No custom converter - try iconv. */
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	664	if (HAVE_ICONV)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	665	{
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	666	ret.func = convert_using_iconv;
				667	ret.cd = iconv_open (to, from);
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	668	ret.width = -1;
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	669
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	670	if (ret.cd == (iconv_t) -1)
				671	{
				672	if (errno == EINVAL)
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	673	cpp_error (pfile, CPP_DL_ERROR, /* FIXME should be DL_SORRY */
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	674	"conversion from %s to %s not supported by iconv",
				675	from, to);
				676	else
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	677	cpp_errno (pfile, CPP_DL_ERROR, "iconv_open");
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	678
				679	ret.func = convert_no_conversion;
				680	}
				681	}
				682	else
				683	{
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	684	cpp_error (pfile, CPP_DL_ERROR, /* FIXME: should be DL_SORRY */
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	685	"no iconv implementation, cannot convert from %s to %s",
				686	from, to);
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	687	ret.func = convert_no_conversion;
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	688	ret.cd = (iconv_t) -1;
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	689	ret.width = -1;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	690	}
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	691	return ret;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	692	}
				693
				694	/* If charset conversion is requested, initialize iconv(3) descriptors
				695	for conversion from the source character set to the execution
				696	character sets. If iconv is not present in the C library, and
				697	conversion is requested, issue an error. */
				698
				699	void
				700	cpp_init_iconv (cpp_reader *pfile)
				701	{
				702	const char *ncset = CPP_OPTION (pfile, narrow_charset);
				703	const char *wcset = CPP_OPTION (pfile, wide_charset);
				704	const char *default_wcset;
				705
				706	bool be = CPP_OPTION (pfile, bytes_big_endian);
				707
				708	if (CPP_OPTION (pfile, wchar_precision) >= 32)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	709	default_wcset = be ? "UTF-32BE" : "UTF-32LE";
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	710	else if (CPP_OPTION (pfile, wchar_precision) >= 16)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	711	default_wcset = be ? "UTF-16BE" : "UTF-16LE";
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	712	else
				713	/* This effectively means that wide strings are not supported,
				714	so don't do any conversion at all. */
				715	default_wcset = SOURCE_CHARSET;
				716
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	717	if (!ncset)
				718	ncset = SOURCE_CHARSET;
				719	if (!wcset)
				720	wcset = default_wcset;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	721
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	722	pfile->narrow_cset_desc = init_iconv_desc (pfile, ncset, SOURCE_CHARSET);
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	723	pfile->narrow_cset_desc.width = CPP_OPTION (pfile, char_precision);
Jakub Jelinek	2c6e3f5	2009-10-19 23:41:15 +0200	[diff] [blame]	724	pfile->utf8_cset_desc = init_iconv_desc (pfile, "UTF-8", SOURCE_CHARSET);
				725	pfile->utf8_cset_desc.width = CPP_OPTION (pfile, char_precision);
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	726	pfile->char16_cset_desc = init_iconv_desc (pfile,
				727	be ? "UTF-16BE" : "UTF-16LE",
				728	SOURCE_CHARSET);
				729	pfile->char16_cset_desc.width = 16;
				730	pfile->char32_cset_desc = init_iconv_desc (pfile,
				731	be ? "UTF-32BE" : "UTF-32LE",
				732	SOURCE_CHARSET);
				733	pfile->char32_cset_desc.width = 32;
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	734	pfile->wide_cset_desc = init_iconv_desc (pfile, wcset, SOURCE_CHARSET);
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	735	pfile->wide_cset_desc.width = CPP_OPTION (pfile, wchar_precision);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	736	}
				737
Zack Weinberg	a29f62d	2004-09-18 00:56:19 +0000	[diff] [blame]	738	/* Destroy iconv(3) descriptors set up by cpp_init_iconv, if necessary. */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	739	void
				740	_cpp_destroy_iconv (cpp_reader *pfile)
				741	{
				742	if (HAVE_ICONV)
				743	{
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	744	if (pfile->narrow_cset_desc.func == convert_using_iconv)
				745	iconv_close (pfile->narrow_cset_desc.cd);
Jakub Jelinek	2c6e3f5	2009-10-19 23:41:15 +0200	[diff] [blame]	746	if (pfile->utf8_cset_desc.func == convert_using_iconv)
				747	iconv_close (pfile->utf8_cset_desc.cd);
				748	if (pfile->char16_cset_desc.func == convert_using_iconv)
				749	iconv_close (pfile->char16_cset_desc.cd);
				750	if (pfile->char32_cset_desc.func == convert_using_iconv)
				751	iconv_close (pfile->char32_cset_desc.cd);
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	752	if (pfile->wide_cset_desc.func == convert_using_iconv)
				753	iconv_close (pfile->wide_cset_desc.cd);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	754	}
				755	}
				756
Zack Weinberg	c5ff069	2005-02-20 17:01:32 +0000	[diff] [blame]	757	/* Utility routine for use by a full compiler. C is a character taken
				758	from the basic source character set, encoded in the host's
				759	execution encoding. Convert it to (the target's) execution
				760	encoding, and return that value.
				761
				762	Issues an internal error if C's representation in the narrow
				763	execution character set fails to be a single-byte value (C99
				764	5.2.1p3: "The representation of each member of the source and
				765	execution character sets shall fit in a byte.") May also issue an
				766	internal error if C fails to be a member of the basic source
				767	character set (testing this exactly is too hard, especially when
				768	the host character set is EBCDIC). */
				769	cppchar_t
				770	cpp_host_to_exec_charset (cpp_reader *pfile, cppchar_t c)
				771	{
				772	uchar sbuf[1];
				773	struct _cpp_strbuf tbuf;
				774
				775	/* This test is merely an approximation, but it suffices to catch
				776	the most important thing, which is that we don't get handed a
				777	character outside the unibyte range of the host character set. */
				778	if (c > LAST_POSSIBLY_BASIC_SOURCE_CHAR)
				779	{
				780	cpp_error (pfile, CPP_DL_ICE,
				781	"character 0x%lx is not in the basic source character set\n",
				782	(unsigned long)c);
				783	return 0;
				784	}
				785
				786	/* Being a character in the unibyte range of the host character set,
				787	we can safely splat it into a one-byte buffer and trust that that
				788	is a well-formed string. */
				789	sbuf[0] = c;
				790
				791	/* This should never need to reallocate, but just in case... */
				792	tbuf.asize = 1;
Gabriel Dos Reis	c3f829c	2005-05-28 15:52:48 +0000	[diff] [blame]	793	tbuf.text = XNEWVEC (uchar, tbuf.asize);
Zack Weinberg	c5ff069	2005-02-20 17:01:32 +0000	[diff] [blame]	794	tbuf.len = 0;
				795
				796	if (!APPLY_CONVERSION (pfile->narrow_cset_desc, sbuf, 1, &tbuf))
				797	{
				798	cpp_errno (pfile, CPP_DL_ICE, "converting to execution character set");
				799	return 0;
				800	}
				801	if (tbuf.len != 1)
				802	{
				803	cpp_error (pfile, CPP_DL_ICE,
				804	"character 0x%lx is not unibyte in execution character set",
				805	(unsigned long)c);
				806	return 0;
				807	}
				808	c = tbuf.text[0];
				809	free(tbuf.text);
				810	return c;
				811	}
				812
				813
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	814
				815	/* Utility routine that computes a mask of the form 0000...111... with
				816	WIDTH 1-bits. */
				817	static inline size_t
				818	width_to_mask (size_t width)
				819	{
				820	width = MIN (width, BITS_PER_CPPCHAR_T);
				821	if (width >= CHAR_BIT * sizeof (size_t))
				822	return ~(size_t) 0;
				823	else
				824	return ((size_t) 1 << width) - 1;
				825	}
				826
/* Flag bits describing each range in the table of Unicode character
   information.  */
enum {
  C99 = 1 << 0,  /* Valid in a C99 identifier?  */
  N99 = 1 << 1,  /* Valid in a C99 identifier, but not as the first
                    character?  */
  CXX = 1 << 2,  /* Valid in a C++ identifier?  */
  C11 = 1 << 3,  /* Valid in a C11/C++11 identifier?  */
  N11 = 1 << 4,  /* Valid in a C11/C++11 identifier, but not as the
                    first character?  */
  CID = 1 << 5,  /* NFC representation is not valid in an identifier?  */
  NFC = 1 << 6,  /* Might be valid NFC form?  */
  NKC = 1 << 7,  /* Might be valid NFKC form?  */
  CTX = 1 << 8   /* Certain preceding characters might make it not
                    valid NFC/NFKC form?  */
};
				848
/* One entry of the Unicode character information table.  The member
   order must not change: positional initializers rely on it.  */
struct ucnrange {
  unsigned short flags;   /* Bitmap of the flag bits C99 ... CTX.  */
  unsigned char combine;  /* Combining class of the character.  */
  unsigned int end;       /* Last character in the range described by
                             this entry.  */
};
Joseph Myers	d3f4ff8	2013-11-16 00:05:08 +0000	[diff] [blame]	857	#include "ucnid.h"
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame]	858
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	859	/* Returns 1 if C is valid in an identifier, 2 if C is valid except at
				860	the start of an identifier, and 0 if C is not valid in an
				861	identifier. We assume C has already gone through the checks of
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame]	862	_cpp_valid_ucn. Also update NST for C if returning nonzero. The
				863	algorithm is a simple binary search on the table defined in
				864	ucnid.h. */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	865
				866	static int
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame]	867	ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,
				868	struct normalize_state *nst)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	869	{
				870	int mn, mx, md;
Joseph Myers	d3f4ff8	2013-11-16 00:05:08 +0000	[diff] [blame]	871	unsigned short valid_flags, invalid_start_flags;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	872
Joseph Myers	d3f4ff8	2013-11-16 00:05:08 +0000	[diff] [blame]	873	if (c > 0x10FFFF)
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame]	874	return 0;
				875
				876	mn = 0;
				877	mx = ARRAY_SIZE (ucnranges) - 1;
				878	while (mx != mn)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	879	{
				880	md = (mn + mx) / 2;
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame]	881	if (c <= ucnranges[md].end)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	882	mx = md;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	883	else
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame]	884	mn = md + 1;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	885	}
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	886
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	887	/* When -pedantic, we require the character to have been listed by
				888	the standard for the current language. Otherwise, we accept the
Joseph Myers	d3f4ff8	2013-11-16 00:05:08 +0000	[diff] [blame]	889	union of the acceptable sets for all supported language versions. */
				890	valid_flags = C99 \| CXX \| C11;
				891	if (CPP_PEDANTIC (pfile))
				892	{
				893	if (CPP_OPTION (pfile, c11_identifiers))
				894	valid_flags = C11;
				895	else if (CPP_OPTION (pfile, c99))
				896	valid_flags = C99;
				897	else if (CPP_OPTION (pfile, cplusplus))
				898	valid_flags = CXX;
				899	}
				900	if (! (ucnranges[mn].flags & valid_flags))
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame]	901	return 0;
Joseph Myers	d3f4ff8	2013-11-16 00:05:08 +0000	[diff] [blame]	902	if (CPP_OPTION (pfile, c11_identifiers))
				903	invalid_start_flags = N11;
				904	else if (CPP_OPTION (pfile, c99))
				905	invalid_start_flags = N99;
				906	else
				907	invalid_start_flags = 0;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	908
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame]	909	/* Update NST. */
				910	if (ucnranges[mn].combine != 0 && ucnranges[mn].combine < nst->prev_class)
				911	nst->level = normalized_none;
				912	else if (ucnranges[mn].flags & CTX)
				913	{
				914	bool safe;
				915	cppchar_t p = nst->previous;
				916
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame]	917	/* For Hangul, characters in the range AC00-D7A3 are NFC/NFKC,
				918	and are combined algorithmically from a sequence of the form
				919	1100-1112 1161-1175 11A8-11C2
				920	(if the third is not present, it is treated as 11A7, which is not
				921	really a valid character).
				922	Unfortunately, C99 allows (only) the NFC form, but C++ allows
				923	only the combining characters. */
Joseph Myers	d3f4ff8	2013-11-16 00:05:08 +0000	[diff] [blame]	924	if (c >= 0x1161 && c <= 0x1175)
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame]	925	safe = p < 0x1100 \|\| p > 0x1112;
				926	else if (c >= 0x11A8 && c <= 0x11C2)
				927	safe = (p < 0xAC00 \|\| p > 0xD7A3 \|\| (p - 0xAC00) % 28 != 0);
				928	else
Joseph Myers	d3f4ff8	2013-11-16 00:05:08 +0000	[diff] [blame]	929	safe = check_nfc (pfile, c, p);
				930	if (!safe)
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame]	931	{
Joseph Myers	d3f4ff8	2013-11-16 00:05:08 +0000	[diff] [blame]	932	if ((c >= 0x1161 && c <= 0x1175) \|\| (c >= 0x11A8 && c <= 0x11C2))
				933	nst->level = MAX (nst->level, normalized_identifier_C);
				934	else
				935	nst->level = normalized_none;
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame]	936	}
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame]	937	}
				938	else if (ucnranges[mn].flags & NKC)
				939	;
				940	else if (ucnranges[mn].flags & NFC)
				941	nst->level = MAX (nst->level, normalized_C);
				942	else if (ucnranges[mn].flags & CID)
				943	nst->level = MAX (nst->level, normalized_identifier_C);
				944	else
				945	nst->level = normalized_none;
Joseph Myers	d3f4ff8	2013-11-16 00:05:08 +0000	[diff] [blame]	946	if (ucnranges[mn].combine == 0)
				947	nst->previous = c;
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame]	948	nst->prev_class = ucnranges[mn].combine;
				949
Joseph Myers	d3f4ff8	2013-11-16 00:05:08 +0000	[diff] [blame]	950	/* In C99, UCN digits may not begin identifiers. In C11 and C++11,
				951	UCN combining characters may not begin identifiers. */
				952	if (ucnranges[mn].flags & invalid_start_flags)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	953	return 2;
				954
				955	return 1;
				956	}
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	957
				958	/* [lex.charset]: The character designated by the universal character
				959	name \UNNNNNNNN is that character whose character short name in
				960	ISO/IEC 10646 is NNNNNNNN; the character designated by the
				961	universal character name \uNNNN is that character whose character
				962	short name in ISO/IEC 10646 is 0000NNNN. If the hexadecimal value
Jason Merrill	30c99a9	2009-10-09 20:39:46 -0400	[diff] [blame]	963	for a universal character name corresponds to a surrogate code point
				964	(in the range 0xD800-0xDFFF, inclusive), the program is ill-formed.
				965	Additionally, if the hexadecimal value for a universal-character-name
				966	outside a character or string literal corresponds to a control character
				967	(in either of the ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a
				968	character in the basic source character set, the program is ill-formed.
				969
				970	C99 6.4.3: A universal character name shall not specify a character
				971	whose short identifier is less than 00A0 other than 0024 ($), 0040 (@),
				972	or 0060 (`), nor one in the range D800 through DFFF inclusive.
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	973
				974	*PSTR must be preceded by "\u" or "\U"; it is assumed that the
Geoffrey Keating	c79e602	2005-03-16 00:59:31 +0000	[diff] [blame]	975	buffer end is delimited by a non-hex digit. Returns zero if the
				976	UCN has not been consumed.
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	977
Kazu Hirata	6356f89	2003-06-12 19:01:08 +0000	[diff] [blame]	978	Otherwise the nonzero value of the UCN, whether valid or invalid,
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	979	is returned. Diagnostics are emitted for invalid values. PSTR
				980	is updated to point one beyond the UCN, or to the syntactically
				981	invalid character.
				982
				983	IDENTIFIER_POS is 0 when not in an identifier, 1 for the start of
Zack Weinberg	a29f62d	2004-09-18 00:56:19 +0000	[diff] [blame]	984	an identifier, or 2 otherwise. */
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	985
				986	cppchar_t
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	987	_cpp_valid_ucn (cpp_reader pfile, const uchar *pstr,
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame]	988	const uchar *limit, int identifier_pos,
				989	struct normalize_state *nst)
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	990	{
				991	cppchar_t result, c;
				992	unsigned int length;
				993	const uchar str = pstr;
				994	const uchar *base = str - 2;
				995
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	996	if (!CPP_OPTION (pfile, cplusplus) && !CPP_OPTION (pfile, c99))
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	997	cpp_error (pfile, CPP_DL_WARNING,
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	998	"universal character names are only valid in C++ and C99");
Marek Polacek	177cce4	2014-08-19 05:34:31 +0000	[diff] [blame]	999	else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
Marek Polacek	f3bede7	2014-08-10 06:10:49 +0000	[diff] [blame]	1000	&& !CPP_OPTION (pfile, cplusplus))
				1001	cpp_error (pfile, CPP_DL_WARNING,
				1002	"C99's universal character names are incompatible with C90");
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1003	else if (CPP_WTRADITIONAL (pfile) && identifier_pos == 0)
Simon Baldwin	87cf065	2010-04-07 17:18:10 +0000	[diff] [blame]	1004	cpp_warning (pfile, CPP_W_TRADITIONAL,
				1005	"the meaning of '\\%c' is different in traditional C",
				1006	(int) str[-1]);
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1007
				1008	if (str[-1] == 'u')
				1009	length = 4;
				1010	else if (str[-1] == 'U')
				1011	length = 8;
				1012	else
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame]	1013	{
				1014	cpp_error (pfile, CPP_DL_ICE, "In _cpp_valid_ucn but not a UCN");
				1015	length = 4;
				1016	}
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1017
				1018	result = 0;
				1019	do
				1020	{
				1021	c = *str;
				1022	if (!ISXDIGIT (c))
				1023	break;
				1024	str++;
				1025	result = (result << 4) + hex_value (c);
				1026	}
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1027	while (--length && str < limit);
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1028
Geoffrey Keating	c79e602	2005-03-16 00:59:31 +0000	[diff] [blame]	1029	/* Partial UCNs are not valid in strings, but decompose into
				1030	multiple tokens in identifiers, so we can't give a helpful
				1031	error message in that case. */
				1032	if (length && identifier_pos)
				1033	return 0;
				1034
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1035	*pstr = str;
				1036	if (length)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1037	{
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1038	cpp_error (pfile, CPP_DL_ERROR,
				1039	"incomplete universal character name %.*s",
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1040	(int) (str - base), base);
				1041	result = 1;
				1042	}
Jason Merrill	30c99a9	2009-10-09 20:39:46 -0400	[diff] [blame]	1043	/* The C99 standard permits $, @ and ` to be specified as UCNs. We use
				1044	hex escapes so that this also works with EBCDIC hosts.
				1045	C++0x permits everything below 0xa0 within literals;
				1046	ucn_valid_in_identifier will complain about identifiers. */
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1047	else if ((result < 0xa0
Jason Merrill	30c99a9	2009-10-09 20:39:46 -0400	[diff] [blame]	1048	&& !CPP_OPTION (pfile, cplusplus)
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1049	&& (result != 0x24 && result != 0x40 && result != 0x60))
				1050	\|\| (result & 0x80000000)
				1051	\|\| (result >= 0xD800 && result <= 0xDFFF))
				1052	{
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1053	cpp_error (pfile, CPP_DL_ERROR,
				1054	"%.*s is not a valid universal character",
Andreas Jaeger	0e7a8c4	2003-04-21 14:06:12 +0200	[diff] [blame]	1055	(int) (str - base), base);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1056	result = 1;
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1057	}
Geoffrey Keating	47e2049	2005-03-12 10:44:06 +0000	[diff] [blame]	1058	else if (identifier_pos && result == 0x24
				1059	&& CPP_OPTION (pfile, dollars_in_ident))
				1060	{
				1061	if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
				1062	{
				1063	CPP_OPTION (pfile, warn_dollars) = 0;
				1064	cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
				1065	}
Joseph Myers	d3f4ff8	2013-11-16 00:05:08 +0000	[diff] [blame]	1066	NORMALIZE_STATE_UPDATE_IDNUM (nst, result);
Geoffrey Keating	47e2049	2005-03-12 10:44:06 +0000	[diff] [blame]	1067	}
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1068	else if (identifier_pos)
				1069	{
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame]	1070	int validity = ucn_valid_in_identifier (pfile, result, nst);
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1071
				1072	if (validity == 0)
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1073	cpp_error (pfile, CPP_DL_ERROR,
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1074	"universal character %.*s is not valid in an identifier",
Andreas Jaeger	0e7a8c4	2003-04-21 14:06:12 +0200	[diff] [blame]	1075	(int) (str - base), base);
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1076	else if (validity == 2 && identifier_pos == 1)
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1077	cpp_error (pfile, CPP_DL_ERROR,
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1078	"universal character %.*s is not valid at the start of an identifier",
Andreas Jaeger	0e7a8c4	2003-04-21 14:06:12 +0200	[diff] [blame]	1079	(int) (str - base), base);
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1080	}
				1081
				1082	if (result == 0)
				1083	result = 1;
				1084
				1085	return result;
				1086	}
				1087
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1088	/* Convert an UCN, pointed to by FROM, to UTF-8 encoding, then translate
				1089	it to the execution character set and write the result into TBUF.
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1090	An advanced pointer is returned. Issues all relevant diagnostics. */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1091	static const uchar *
				1092	convert_ucn (cpp_reader pfile, const uchar from, const uchar *limit,
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1093	struct _cpp_strbuf *tbuf, struct cset_converter cvt)
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1094	{
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1095	cppchar_t ucn;
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1096	uchar buf[6];
				1097	uchar *bufp = buf;
				1098	size_t bytesleft = 6;
				1099	int rval;
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame]	1100	struct normalize_state nst = INITIAL_NORMALIZE_STATE;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1101
Kazu Hirata	e0a21ab	2004-01-16 01:44:06 +0000	[diff] [blame]	1102	from++; /* Skip u/U. */
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame]	1103	ucn = _cpp_valid_ucn (pfile, &from, limit, 0, &nst);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1104
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1105	rval = one_cppchar_to_utf8 (ucn, &bufp, &bytesleft);
				1106	if (rval)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1107	{
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1108	errno = rval;
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1109	cpp_errno (pfile, CPP_DL_ERROR,
				1110	"converting UCN to source character set");
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1111	}
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1112	else if (!APPLY_CONVERSION (cvt, buf, 6 - bytesleft, tbuf))
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1113	cpp_errno (pfile, CPP_DL_ERROR,
				1114	"converting UCN to execution character set");
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1115
				1116	return from;
				1117	}
				1118
Zack Weinberg	a29f62d	2004-09-18 00:56:19 +0000	[diff] [blame]	1119	/* Subroutine of convert_hex and convert_oct. N is the representation
				1120	in the execution character set of a numeric escape; write it into the
				1121	string buffer TBUF and update the end-of-string pointer therein. WIDE
				1122	is true if it's a wide string that's being assembled in TBUF. This
				1123	function issues no diagnostics and never fails. */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1124	static void
				1125	emit_numeric_escape (cpp_reader *pfile, cppchar_t n,
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1126	struct _cpp_strbuf *tbuf, struct cset_converter cvt)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1127	{
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1128	size_t width = cvt.width;
				1129
				1130	if (width != CPP_OPTION (pfile, char_precision))
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1131	{
				1132	/* We have to render this into the target byte order, which may not
				1133	be our byte order. */
				1134	bool bigend = CPP_OPTION (pfile, bytes_big_endian);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1135	size_t cwidth = CPP_OPTION (pfile, char_precision);
				1136	size_t cmask = width_to_mask (cwidth);
				1137	size_t nbwc = width / cwidth;
				1138	size_t i;
				1139	size_t off = tbuf->len;
				1140	cppchar_t c;
				1141
				1142	if (tbuf->len + nbwc > tbuf->asize)
				1143	{
				1144	tbuf->asize += OUTBUF_BLOCK_SIZE;
Gabriel Dos Reis	c3f829c	2005-05-28 15:52:48 +0000	[diff] [blame]	1145	tbuf->text = XRESIZEVEC (uchar, tbuf->text, tbuf->asize);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1146	}
				1147
				1148	for (i = 0; i < nbwc; i++)
				1149	{
				1150	c = n & cmask;
				1151	n >>= cwidth;
				1152	tbuf->text[off + (bigend ? nbwc - i - 1 : i)] = c;
				1153	}
				1154	tbuf->len += nbwc;
				1155	}
				1156	else
				1157	{
Zack Weinberg	a29f62d	2004-09-18 00:56:19 +0000	[diff] [blame]	1158	/* Note: this code does not handle the case where the target
				1159	and host have a different number of bits in a byte. */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1160	if (tbuf->len + 1 > tbuf->asize)
				1161	{
				1162	tbuf->asize += OUTBUF_BLOCK_SIZE;
Gabriel Dos Reis	c3f829c	2005-05-28 15:52:48 +0000	[diff] [blame]	1163	tbuf->text = XRESIZEVEC (uchar, tbuf->text, tbuf->asize);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1164	}
				1165	tbuf->text[tbuf->len++] = n;
				1166	}
				1167	}
				1168
				1169	/* Convert a hexadecimal escape, pointed to by FROM, to the execution
				1170	character set and write it into the string buffer TBUF. Returns an
				1171	advanced pointer, and issues diagnostics as necessary.
				1172	No character set translation occurs; this routine always produces the
				1173	execution-set character with numeric value equal to the given hex
				1174	number. You can, e.g. generate surrogate pairs this way. */
				1175	static const uchar *
				1176	convert_hex (cpp_reader pfile, const uchar from, const uchar *limit,
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1177	struct _cpp_strbuf *tbuf, struct cset_converter cvt)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1178	{
				1179	cppchar_t c, n = 0, overflow = 0;
				1180	int digits_found = 0;
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1181	size_t width = cvt.width;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1182	size_t mask = width_to_mask (width);
				1183
				1184	if (CPP_WTRADITIONAL (pfile))
Simon Baldwin	87cf065	2010-04-07 17:18:10 +0000	[diff] [blame]	1185	cpp_warning (pfile, CPP_W_TRADITIONAL,
				1186	"the meaning of '\\x' is different in traditional C");
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1187
Kazu Hirata	e0a21ab	2004-01-16 01:44:06 +0000	[diff] [blame]	1188	from++; /* Skip 'x'. */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1189	while (from < limit)
				1190	{
				1191	c = *from;
				1192	if (! hex_p (c))
				1193	break;
				1194	from++;
				1195	overflow \|= n ^ (n << 4 >> 4);
				1196	n = (n << 4) + hex_value (c);
				1197	digits_found = 1;
				1198	}
				1199
				1200	if (!digits_found)
				1201	{
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1202	cpp_error (pfile, CPP_DL_ERROR,
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1203	"\\x used with no following hex digits");
				1204	return from;
				1205	}
				1206
				1207	if (overflow \| (n != (n & mask)))
				1208	{
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1209	cpp_error (pfile, CPP_DL_PEDWARN,
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1210	"hex escape sequence out of range");
				1211	n &= mask;
				1212	}
				1213
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1214	emit_numeric_escape (pfile, n, tbuf, cvt);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1215
				1216	return from;
				1217	}
				1218
				1219	/* Convert an octal escape, pointed to by FROM, to the execution
				1220	character set and write it into the string buffer TBUF. Returns an
				1221	advanced pointer, and issues diagnostics as necessary.
				1222	No character set translation occurs; this routine always produces the
				1223	execution-set character with numeric value equal to the given octal
				1224	number. */
				1225	static const uchar *
				1226	convert_oct (cpp_reader pfile, const uchar from, const uchar *limit,
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1227	struct _cpp_strbuf *tbuf, struct cset_converter cvt)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1228	{
				1229	size_t count = 0;
				1230	cppchar_t c, n = 0;
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1231	size_t width = cvt.width;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1232	size_t mask = width_to_mask (width);
				1233	bool overflow = false;
				1234
				1235	while (from < limit && count++ < 3)
				1236	{
				1237	c = *from;
				1238	if (c < '0' \|\| c > '7')
				1239	break;
				1240	from++;
				1241	overflow \|= n ^ (n << 3 >> 3);
				1242	n = (n << 3) + c - '0';
				1243	}
				1244
				1245	if (n != (n & mask))
				1246	{
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1247	cpp_error (pfile, CPP_DL_PEDWARN,
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1248	"octal escape sequence out of range");
				1249	n &= mask;
				1250	}
				1251
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1252	emit_numeric_escape (pfile, n, tbuf, cvt);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1253
				1254	return from;
				1255	}
				1256
				1257	/* Convert an escape sequence (pointed to by FROM) to its value on
				1258	the target, and to the execution character set. Do not scan past
				1259	LIMIT. Write the converted value into TBUF. Returns an advanced
				1260	pointer. Handles all relevant diagnostics. */
				1261	static const uchar *
				1262	convert_escape (cpp_reader pfile, const uchar from, const uchar *limit,
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1263	struct _cpp_strbuf *tbuf, struct cset_converter cvt)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1264	{
				1265	/* Values of \a \b \e \f \n \r \t \v respectively. */
				1266	#if HOST_CHARSET == HOST_CHARSET_ASCII
				1267	static const uchar charconsts[] = { 7, 8, 27, 12, 10, 13, 9, 11 };
				1268	#elif HOST_CHARSET == HOST_CHARSET_EBCDIC
				1269	static const uchar charconsts[] = { 47, 22, 39, 12, 21, 13, 5, 11 };
				1270	#else
				1271	#error "unknown host character set"
				1272	#endif
				1273
				1274	uchar c;
				1275
				1276	c = *from;
				1277	switch (c)
				1278	{
				1279	/* UCNs, hex escapes, and octal escapes are processed separately. */
				1280	case 'u': case 'U':
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1281	return convert_ucn (pfile, from, limit, tbuf, cvt);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1282
				1283	case 'x':
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1284	return convert_hex (pfile, from, limit, tbuf, cvt);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1285	break;
				1286
				1287	case '0': case '1': case '2': case '3':
				1288	case '4': case '5': case '6': case '7':
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1289	return convert_oct (pfile, from, limit, tbuf, cvt);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1290
				1291	/* Various letter escapes. Get the appropriate host-charset
				1292	value into C. */
				1293	case '\\': case '\'': case '"': case '?': break;
				1294
				1295	case '(': case '{': case '[': case '%':
				1296	/* '\(', etc, can be used at the beginning of a line in a long
				1297	string split onto multiple lines with \-newline, to prevent
				1298	Emacs or other text editors from getting confused. '\%' can
				1299	be used to prevent SCCS from mangling printf format strings. */
				1300	if (CPP_PEDANTIC (pfile))
				1301	goto unknown;
				1302	break;
				1303
				1304	case 'b': c = charconsts[1]; break;
				1305	case 'f': c = charconsts[3]; break;
				1306	case 'n': c = charconsts[4]; break;
				1307	case 'r': c = charconsts[5]; break;
				1308	case 't': c = charconsts[6]; break;
				1309	case 'v': c = charconsts[7]; break;
				1310
				1311	case 'a':
				1312	if (CPP_WTRADITIONAL (pfile))
Simon Baldwin	87cf065	2010-04-07 17:18:10 +0000	[diff] [blame]	1313	cpp_warning (pfile, CPP_W_TRADITIONAL,
				1314	"the meaning of '\\a' is different in traditional C");
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1315	c = charconsts[0];
				1316	break;
				1317
				1318	case 'e': case 'E':
				1319	if (CPP_PEDANTIC (pfile))
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1320	cpp_error (pfile, CPP_DL_PEDWARN,
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1321	"non-ISO-standard escape sequence, '\\%c'", (int) c);
				1322	c = charconsts[2];
				1323	break;
				1324
				1325	default:
				1326	unknown:
				1327	if (ISGRAPH (c))
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1328	cpp_error (pfile, CPP_DL_PEDWARN,
Tom Tromey	709a22d	2009-08-17 17:34:53 +0000	[diff] [blame]	1329	"unknown escape sequence: '\\%c'", (int) c);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1330	else
Joseph Myers	178b58b	2005-11-03 23:08:18 +0000	[diff] [blame]	1331	{
				1332	/* diagnostic.c does not support "%03o". When it does, this
				1333	code can use %03o directly in the diagnostic again. */
				1334	char buf[32];
				1335	sprintf(buf, "%03o", (int) c);
				1336	cpp_error (pfile, CPP_DL_PEDWARN,
				1337	"unknown escape sequence: '\\%s'", buf);
				1338	}
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1339	}
				1340
				1341	/* Now convert what we have to the execution character set. */
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1342	if (!APPLY_CONVERSION (cvt, &c, 1, tbuf))
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1343	cpp_errno (pfile, CPP_DL_ERROR,
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1344	"converting escape sequence to execution character set");
				1345
				1346	return from + 1;
				1347	}
				1348
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1349	/* TYPE is a token type. The return value is the conversion needed to
				1350	convert from source to execution character set for the given type. */
				1351	static struct cset_converter
				1352	converter_for_type (cpp_reader *pfile, enum cpp_ttype type)
				1353	{
				1354	switch (type)
				1355	{
				1356	default:
				1357	return pfile->narrow_cset_desc;
Edward Smith-Rowland	fe95b03	2015-06-30 12:58:48 +0000	[diff] [blame^]	1358	case CPP_UTF8CHAR:
Jakub Jelinek	2c6e3f5	2009-10-19 23:41:15 +0200	[diff] [blame]	1359	case CPP_UTF8STRING:
				1360	return pfile->utf8_cset_desc;
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1361	case CPP_CHAR16:
				1362	case CPP_STRING16:
				1363	return pfile->char16_cset_desc;
				1364	case CPP_CHAR32:
				1365	case CPP_STRING32:
				1366	return pfile->char32_cset_desc;
				1367	case CPP_WCHAR:
				1368	case CPP_WSTRING:
				1369	return pfile->wide_cset_desc;
				1370	}
				1371	}
				1372
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1373	/* FROM is an array of cpp_string structures of length COUNT. These
				1374	are to be converted from the source to the execution character set,
				1375	escape sequences translated, and finally all are to be
				1376	concatenated. WIDE indicates whether or not to produce a wide
				1377	string. The result is written into TO. Returns true for success,
				1378	false for failure. */
				1379	bool
				1380	cpp_interpret_string (cpp_reader pfile, const cpp_string from, size_t count,
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1381	cpp_string *to, enum cpp_ttype type)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1382	{
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	1383	struct _cpp_strbuf tbuf;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1384	const uchar p, base, *limit;
				1385	size_t i;
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1386	struct cset_converter cvt = converter_for_type (pfile, type);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1387
				1388	tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, from->len);
Gabriel Dos Reis	c3f829c	2005-05-28 15:52:48 +0000	[diff] [blame]	1389	tbuf.text = XNEWVEC (uchar, tbuf.asize);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1390	tbuf.len = 0;
				1391
				1392	for (i = 0; i < count; i++)
				1393	{
				1394	p = from[i].text;
Jakub Jelinek	2c6e3f5	2009-10-19 23:41:15 +0200	[diff] [blame]	1395	if (*p == 'u')
				1396	{
				1397	if (*++p == '8')
				1398	p++;
				1399	}
				1400	else if (p == 'L' \|\| p == 'U') p++;
				1401	if (*p == 'R')
				1402	{
				1403	const uchar *prefix;
				1404
				1405	/* Skip over 'R"'. */
				1406	p += 2;
				1407	prefix = p;
Jason Merrill	5215062	2010-03-29 11:00:43 -0400	[diff] [blame]	1408	while (*p != '(')
Jakub Jelinek	2c6e3f5	2009-10-19 23:41:15 +0200	[diff] [blame]	1409	p++;
				1410	p++;
				1411	limit = from[i].text + from[i].len;
				1412	if (limit >= p + (p - prefix) + 1)
				1413	limit -= (p - prefix) + 1;
				1414
Jason Merrill	00a81b8	2010-03-29 16:07:29 -0400	[diff] [blame]	1415	/* Raw strings are all normal characters; these can be fed
				1416	directly to convert_cset. */
				1417	if (!APPLY_CONVERSION (cvt, p, limit - p, &tbuf))
				1418	goto fail;
Jakub Jelinek	2c6e3f5	2009-10-19 23:41:15 +0200	[diff] [blame]	1419
				1420	continue;
				1421	}
				1422
Kazu Hirata	e0a21ab	2004-01-16 01:44:06 +0000	[diff] [blame]	1423	p++; /* Skip leading quote. */
				1424	limit = from[i].text + from[i].len - 1; /* Skip trailing quote. */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1425
				1426	for (;;)
				1427	{
				1428	base = p;
				1429	while (p < limit && *p != '\\')
				1430	p++;
				1431	if (p > base)
				1432	{
				1433	/* We have a run of normal characters; these can be fed
				1434	directly to convert_cset. */
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1435	if (!APPLY_CONVERSION (cvt, base, p - base, &tbuf))
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1436	goto fail;
				1437	}
				1438	if (p == limit)
				1439	break;
				1440
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1441	p = convert_escape (pfile, p + 1, limit, &tbuf, cvt);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1442	}
				1443	}
				1444	/* NUL-terminate the 'to' buffer and translate it to a cpp_string
				1445	structure. */
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1446	emit_numeric_escape (pfile, 0, &tbuf, cvt);
Gabriel Dos Reis	c3f829c	2005-05-28 15:52:48 +0000	[diff] [blame]	1447	tbuf.text = XRESIZEVEC (uchar, tbuf.text, tbuf.len);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1448	to->text = tbuf.text;
				1449	to->len = tbuf.len;
				1450	return true;
				1451
				1452	fail:
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1453	cpp_errno (pfile, CPP_DL_ERROR, "converting to execution character set");
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1454	free (tbuf.text);
				1455	return false;
				1456	}
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1457
				1458	/* Subroutine of do_line and do_linemarker. Convert escape sequences
				1459	in a string, but do not perform character set conversion. */
				1460	bool
Eric Christopher	423e95e	2004-02-12 02:25:03 +0000	[diff] [blame]	1461	cpp_interpret_string_notranslate (cpp_reader pfile, const cpp_string from,
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1462	size_t count, cpp_string *to,
				1463	enum cpp_ttype type ATTRIBUTE_UNUSED)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1464	{
				1465	struct cset_converter save_narrow_cset_desc = pfile->narrow_cset_desc;
				1466	bool retval;
				1467
				1468	pfile->narrow_cset_desc.func = convert_no_conversion;
				1469	pfile->narrow_cset_desc.cd = (iconv_t) -1;
H.J. Lu	0b7c73c	2008-06-12 17:03:41 +0000	[diff] [blame]	1470	pfile->narrow_cset_desc.width = CPP_OPTION (pfile, char_precision);
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1471
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1472	retval = cpp_interpret_string (pfile, from, count, to, CPP_STRING);
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1473
				1474	pfile->narrow_cset_desc = save_narrow_cset_desc;
				1475	return retval;
				1476	}
				1477
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1478
				1479	/* Subroutine of cpp_interpret_charconst which performs the conversion
				1480	to a number, for narrow strings. STR is the string structure returned
				1481	by cpp_interpret_string. PCHARS_SEEN and UNSIGNEDP are as for
				1482	cpp_interpret_charconst. */
				1483	static cppchar_t
				1484	narrow_str_to_charconst (cpp_reader *pfile, cpp_string str,
				1485	unsigned int pchars_seen, int unsignedp)
				1486	{
				1487	size_t width = CPP_OPTION (pfile, char_precision);
				1488	size_t max_chars = CPP_OPTION (pfile, int_precision) / width;
				1489	size_t mask = width_to_mask (width);
				1490	size_t i;
				1491	cppchar_t result, c;
				1492	bool unsigned_p;
				1493
				1494	/* The value of a multi-character character constant, or a
				1495	single-character character constant whose representation in the
				1496	execution character set is more than one byte long, is
				1497	implementation defined. This implementation defines it to be the
				1498	number formed by interpreting the byte sequence in memory as a
				1499	big-endian binary number. If overflow occurs, the high bytes are
				1500	lost, and a warning is issued.
				1501
				1502	We don't want to process the NUL terminator handed back by
				1503	cpp_interpret_string. */
				1504	result = 0;
				1505	for (i = 0; i < str.len - 1; i++)
				1506	{
				1507	c = str.text[i] & mask;
				1508	if (width < BITS_PER_CPPCHAR_T)
				1509	result = (result << width) \| c;
				1510	else
				1511	result = c;
				1512	}
				1513
				1514	if (i > max_chars)
				1515	{
				1516	i = max_chars;
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1517	cpp_error (pfile, CPP_DL_WARNING,
				1518	"character constant too long for its type");
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1519	}
				1520	else if (i > 1 && CPP_OPTION (pfile, warn_multichar))
Simon Baldwin	87cf065	2010-04-07 17:18:10 +0000	[diff] [blame]	1521	cpp_warning (pfile, CPP_W_MULTICHAR, "multi-character character constant");
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1522
				1523	/* Multichar constants are of type int and therefore signed. */
				1524	if (i > 1)
				1525	unsigned_p = 0;
				1526	else
				1527	unsigned_p = CPP_OPTION (pfile, unsigned_char);
				1528
				1529	/* Truncate the constant to its natural width, and simultaneously
				1530	sign- or zero-extend to the full width of cppchar_t.
				1531	For single-character constants, the value is WIDTH bits wide.
				1532	For multi-character constants, the value is INT_PRECISION bits wide. */
				1533	if (i > 1)
				1534	width = CPP_OPTION (pfile, int_precision);
				1535	if (width < BITS_PER_CPPCHAR_T)
				1536	{
				1537	mask = ((cppchar_t) 1 << width) - 1;
				1538	if (unsigned_p \|\| !(result & (1 << (width - 1))))
				1539	result &= mask;
				1540	else
				1541	result \|= ~mask;
				1542	}
				1543	*pchars_seen = i;
				1544	*unsignedp = unsigned_p;
				1545	return result;
				1546	}
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	1547
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1548	/* Subroutine of cpp_interpret_charconst which performs the conversion
				1549	to a number, for wide strings. STR is the string structure returned
				1550	by cpp_interpret_string. PCHARS_SEEN and UNSIGNEDP are as for
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1551	cpp_interpret_charconst. TYPE is the token type. */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1552	static cppchar_t
				1553	wide_str_to_charconst (cpp_reader *pfile, cpp_string str,
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1554	unsigned int pchars_seen, int unsignedp,
				1555	enum cpp_ttype type)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1556	{
				1557	bool bigend = CPP_OPTION (pfile, bytes_big_endian);
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1558	size_t width = converter_for_type (pfile, type).width;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1559	size_t cwidth = CPP_OPTION (pfile, char_precision);
				1560	size_t mask = width_to_mask (width);
				1561	size_t cmask = width_to_mask (cwidth);
				1562	size_t nbwc = width / cwidth;
				1563	size_t off, i;
				1564	cppchar_t result = 0, c;
				1565
				1566	/* This is finicky because the string is in the target's byte order,
				1567	which may not be our byte order. Only the last character, ignoring
				1568	the NUL terminator, is relevant. */
				1569	off = str.len - (nbwc * 2);
				1570	result = 0;
				1571	for (i = 0; i < nbwc; i++)
				1572	{
				1573	c = bigend ? str.text[off + i] : str.text[off + nbwc - i - 1];
				1574	result = (result << cwidth) \| (c & cmask);
				1575	}
				1576
				1577	/* Wide character constants have type wchar_t, and a single
				1578	character exactly fills a wchar_t, so a multi-character wide
				1579	character constant is guaranteed to overflow. */
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1580	if (str.len > nbwc * 2)
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1581	cpp_error (pfile, CPP_DL_WARNING,
				1582	"character constant too long for its type");
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1583
				1584	/* Truncate the constant to its natural width, and simultaneously
				1585	sign- or zero-extend to the full width of cppchar_t. */
				1586	if (width < BITS_PER_CPPCHAR_T)
				1587	{
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1588	if (type == CPP_CHAR16 \|\| type == CPP_CHAR32
				1589	\|\| CPP_OPTION (pfile, unsigned_wchar)
				1590	\|\| !(result & (1 << (width - 1))))
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1591	result &= mask;
				1592	else
				1593	result \|= ~mask;
				1594	}
				1595
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1596	if (type == CPP_CHAR16 \|\| type == CPP_CHAR32
				1597	\|\| CPP_OPTION (pfile, unsigned_wchar))
				1598	*unsignedp = 1;
				1599	else
				1600	*unsignedp = 0;
				1601
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1602	*pchars_seen = 1;
				1603	return result;
				1604	}
				1605
				1606	/* Interpret a (possibly wide) character constant in TOKEN.
				1607	PCHARS_SEEN points to a variable that is filled in with the number
				1608	of characters seen, and UNSIGNEDP to a variable that indicates
				1609	whether the result has signed type. */
				1610	cppchar_t
				1611	cpp_interpret_charconst (cpp_reader pfile, const cpp_token token,
				1612	unsigned int pchars_seen, int unsignedp)
				1613	{
				1614	cpp_string str = { 0, 0 };
Edward Smith-Rowland	fe95b03	2015-06-30 12:58:48 +0000	[diff] [blame^]	1615	bool wide = (token->type != CPP_CHAR && token->type != CPP_UTF8CHAR);
				1616	int u8 = 2 * int(token->type == CPP_UTF8CHAR);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1617	cppchar_t result;
				1618
Edward Smith-Rowland	fe95b03	2015-06-30 12:58:48 +0000	[diff] [blame^]	1619	/* An empty constant will appear as L'', u'', U'', u8'', or '' */
				1620	if (token->val.str.len == (size_t) (2 + wide + u8))
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1621	{
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1622	cpp_error (pfile, CPP_DL_ERROR, "empty character constant");
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1623	return 0;
				1624	}
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1625	else if (!cpp_interpret_string (pfile, &token->val.str, 1, &str, token->type))
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1626	return 0;
				1627
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1628	if (wide)
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1629	result = wide_str_to_charconst (pfile, str, pchars_seen, unsignedp,
				1630	token->type);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1631	else
				1632	result = narrow_str_to_charconst (pfile, str, pchars_seen, unsignedp);
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1633
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1634	if (str.text != token->val.str.text)
				1635	free ((void *)str.text);
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1636
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1637	return result;
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1638	}
Geoffrey Keating	47e2049	2005-03-12 10:44:06 +0000	[diff] [blame]	1639
				1640	/* Convert an identifier denoted by ID and LEN, which might contain
				1641	UCN escapes, to the source character set, either UTF-8 or
				1642	UTF-EBCDIC. Assumes that the identifier is actually a valid identifier. */
				1643	cpp_hashnode *
				1644	_cpp_interpret_identifier (cpp_reader pfile, const uchar id, size_t len)
				1645	{
				1646	/* It turns out that a UCN escape always turns into fewer characters
				1647	than the escape itself, so we can allocate a temporary in advance. */
Gabriel Dos Reis	c3f829c	2005-05-28 15:52:48 +0000	[diff] [blame]	1648	uchar * buf = (uchar *) alloca (len + 1);
Geoffrey Keating	47e2049	2005-03-12 10:44:06 +0000	[diff] [blame]	1649	uchar * bufp = buf;
				1650	size_t idp;
				1651
				1652	for (idp = 0; idp < len; idp++)
				1653	if (id[idp] != '\\')
				1654	*bufp++ = id[idp];
				1655	else
				1656	{
				1657	unsigned length = id[idp+1] == 'u' ? 4 : 8;
				1658	cppchar_t value = 0;
				1659	size_t bufleft = len - (bufp - buf);
				1660	int rval;
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	1661
Geoffrey Keating	47e2049	2005-03-12 10:44:06 +0000	[diff] [blame]	1662	idp += 2;
				1663	while (length && idp < len && ISXDIGIT (id[idp]))
				1664	{
				1665	value = (value << 4) + hex_value (id[idp]);
				1666	idp++;
				1667	length--;
				1668	}
				1669	idp--;
				1670
				1671	/* Special case for EBCDIC: if the identifier contains
				1672	a '$' specified using a UCN, translate it to EBCDIC. */
				1673	if (value == 0x24)
				1674	{
				1675	*bufp++ = '$';
				1676	continue;
				1677	}
				1678
				1679	rval = one_cppchar_to_utf8 (value, &bufp, &bufleft);
				1680	if (rval)
				1681	{
				1682	errno = rval;
				1683	cpp_errno (pfile, CPP_DL_ERROR,
				1684	"converting UCN to source character set");
				1685	break;
				1686	}
				1687	}
				1688
				1689	return CPP_HASHNODE (ht_lookup (pfile->hash_table,
				1690	buf, bufp - buf, HT_ALLOC));
				1691	}
				1692
Zack Weinberg	a29f62d	2004-09-18 00:56:19 +0000	[diff] [blame]	1693	/* Convert an input buffer (containing the complete contents of one
				1694	source file) from INPUT_CHARSET to the source character set. INPUT
				1695	points to the input buffer, SIZE is its allocated size, and LEN is
				1696	the length of the meaningful data within the buffer. The
Tom Tromey	688e7a5	2008-04-21 14:02:00 +0000	[diff] [blame]	1697	translated buffer is returned, *ST_SIZE is set to the length of
				1698	the meaningful data within the translated buffer, and *BUFFER_START
				1699	is set to the start of the returned buffer. *BUFFER_START may
				1700	differ from the return value in the case of a BOM or other ignored
				1701	marker information.
Zack Weinberg	a29f62d	2004-09-18 00:56:19 +0000	[diff] [blame]	1702
Tom Tromey	688e7a5	2008-04-21 14:02:00 +0000	[diff] [blame]	1703	INPUT is expected to have been allocated with xmalloc. This
				1704	function will either set *BUFFER_START to INPUT, or free it and set
				1705	*BUFFER_START to a pointer to another xmalloc-allocated block of
				1706	memory. */
Zack Weinberg	a29f62d	2004-09-18 00:56:19 +0000	[diff] [blame]	1707	uchar *
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	1708	_cpp_convert_input (cpp_reader pfile, const char input_charset,
Tom Tromey	688e7a5	2008-04-21 14:02:00 +0000	[diff] [blame]	1709	uchar *input, size_t size, size_t len,
				1710	const unsigned char *buffer_start, off_t st_size)
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	1711	{
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	1712	struct cset_converter input_cset;
				1713	struct _cpp_strbuf to;
Tom Tromey	688e7a5	2008-04-21 14:02:00 +0000	[diff] [blame]	1714	unsigned char *buffer;
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	1715
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	1716	input_cset = init_iconv_desc (pfile, SOURCE_CHARSET, input_charset);
				1717	if (input_cset.func == convert_no_conversion)
				1718	{
				1719	to.text = input;
				1720	to.asize = size;
				1721	to.len = len;
				1722	}
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	1723	else
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	1724	{
				1725	to.asize = MAX (65536, len);
Gabriel Dos Reis	c3f829c	2005-05-28 15:52:48 +0000	[diff] [blame]	1726	to.text = XNEWVEC (uchar, to.asize);
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	1727	to.len = 0;
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	1728
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	1729	if (!APPLY_CONVERSION (input_cset, input, len, &to))
				1730	cpp_error (pfile, CPP_DL_ERROR,
				1731	"failure to convert %s to %s",
				1732	CPP_OPTION (pfile, input_charset), SOURCE_CHARSET);
				1733
				1734	free (input);
				1735	}
				1736
				1737	/* Clean up the mess. */
				1738	if (input_cset.func == convert_using_iconv)
				1739	iconv_close (input_cset.cd);
				1740
				1741	/* Resize buffer if we allocated substantially too much, or if we
Jakub Jelinek	f41e5bd	2012-12-03 18:19:47 +0100	[diff] [blame]	1742	haven't enough space for the \n-terminator or following
				1743	15 bytes of padding (used to quiet warnings from valgrind or
				1744	Address Sanitizer, when the optimized lexer accesses aligned
				1745	16-byte memory chunks, including the bytes after the malloced,
				1746	area, and stops lexing on '\n'). */
				1747	if (to.len + 4096 < to.asize \|\| to.len + 16 > to.asize)
				1748	to.text = XRESIZEVEC (uchar, to.text, to.len + 16);
				1749
				1750	memset (to.text + to.len, '\0', 16);
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	1751
Devang Patel	04c90ee	2005-02-19 11:48:02 -0800	[diff] [blame]	1752	/* If the file is using old-school Mac line endings (\r only),
				1753	terminate with another \r, not an \n, so that we do not mistake
				1754	the \r\n sequence for a single DOS line ending and erroneously
				1755	issue the "No newline at end of file" diagnostic. */
Tom Tromey	30b0edc	2006-12-28 18:45:48 +0000	[diff] [blame]	1756	if (to.len && to.text[to.len - 1] == '\r')
Devang Patel	04c90ee	2005-02-19 11:48:02 -0800	[diff] [blame]	1757	to.text[to.len] = '\r';
				1758	else
				1759	to.text[to.len] = '\n';
				1760
Tom Tromey	688e7a5	2008-04-21 14:02:00 +0000	[diff] [blame]	1761	buffer = to.text;
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	1762	*st_size = to.len;
Tom Tromey	688e7a5	2008-04-21 14:02:00 +0000	[diff] [blame]	1763	#if HOST_CHARSET == HOST_CHARSET_ASCII
				1764	/* The HOST_CHARSET test just above ensures that the source charset
				1765	is UTF-8. So, ignore a UTF-8 BOM if we see one. Note that
				1766	glib'c UTF-8 iconv() provider (as of glibc 2.7) does not ignore a
				1767	BOM -- however, even if it did, we would still need this code due
				1768	to the 'convert_no_conversion' case. */
				1769	if (to.len >= 3 && to.text[0] == 0xef && to.text[1] == 0xbb
				1770	&& to.text[2] == 0xbf)
				1771	{
				1772	*st_size -= 3;
				1773	buffer += 3;
				1774	}
				1775	#endif
				1776
				1777	*buffer_start = to.text;
				1778	return buffer;
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	1779	}
				1780
Zack Weinberg	a29f62d	2004-09-18 00:56:19 +0000	[diff] [blame]	1781	/* Decide on the default encoding to assume for input files. */
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	1782	const char *
				1783	_cpp_default_encoding (void)
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	1784	{
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	1785	const char *current_encoding = NULL;
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	1786
Paolo Bonzini	4f4e53dd	2004-05-24 10:50:45 +0000	[diff] [blame]	1787	/* We disable this because the default codeset is 7-bit ASCII on
				1788	most platforms, and this causes conversion failures on every
				1789	file in GCC that happens to have one of the upper 128 characters
				1790	in it -- most likely, as part of the name of a contributor.
				1791	We should definitely recognize in-band markers of file encoding,
				1792	like:
				1793	- the appropriate Unicode byte-order mark (FE FF) to recognize
				1794	UTF16 and UCS4 (in both big-endian and little-endian flavors)
				1795	and UTF8
Zack Weinberg	c6e8380	2004-06-05 20:58:06 +0000	[diff] [blame]	1796	- a "#i", "#d", "/ *", "//", " #p" or "#p" (for #pragma) to
Paolo Bonzini	4f4e53dd	2004-05-24 10:50:45 +0000	[diff] [blame]	1797	distinguish ASCII and EBCDIC.
				1798	- now we can parse something like "#pragma GCC encoding <xyz>
				1799	on the first line, or even Emacs/VIM's mode line tags (there's
				1800	a problem here in that VIM uses the last line, and Emacs has
Zack Weinberg	a29f62d	2004-09-18 00:56:19 +0000	[diff] [blame]	1801	its more elaborate "local variables" convention).
Paolo Bonzini	4f4e53dd	2004-05-24 10:50:45 +0000	[diff] [blame]	1802	- investigate whether Java has another common convention, which
				1803	would be friendly to support.
				1804	(Zack Weinberg and Paolo Bonzini, May 20th 2004) */
				1805	#if defined (HAVE_LOCALE_H) && defined (HAVE_LANGINFO_CODESET) && 0
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	1806	setlocale (LC_CTYPE, "");
				1807	current_encoding = nl_langinfo (CODESET);
				1808	#endif
				1809	if (current_encoding == NULL \|\| *current_encoding == '\0')
				1810	current_encoding = SOURCE_CHARSET;
				1811
				1812	return current_encoding;
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	1813	}