Blame - libcpp/charset.c - toolchain/gcc

blob: c6dce0d063e952438e85f1f8fee249be69b41ad2 [file] [log] [blame]

Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1	/* CPP Library - charsets
Jakub Jelinek	85ec4fe	2018-01-03 11:03:58 +0100	[diff] [blame]	2	Copyright (C) 1998-2018 Free Software Foundation, Inc.
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	3
				4	Broken out of c-lex.c Apr 2003, adding valid C99 UCN ranges.
				5
				6	This program is free software; you can redistribute it and/or modify it
				7	under the terms of the GNU General Public License as published by the
Jakub Jelinek	748086b	2009-04-09 17:00:19 +0200	[diff] [blame]	8	Free Software Foundation; either version 3, or (at your option) any
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	9	later version.
				10
				11	This program is distributed in the hope that it will be useful,
				12	but WITHOUT ANY WARRANTY; without even the implied warranty of
				13	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
				14	GNU General Public License for more details.
				15
				16	You should have received a copy of the GNU General Public License
Jakub Jelinek	748086b	2009-04-09 17:00:19 +0200	[diff] [blame]	17	along with this program; see the file COPYING3. If not see
				18	<http://www.gnu.org/licenses/>. */
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	19
				20	#include "config.h"
				21	#include "system.h"
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	22	#include "cpplib.h"
Paolo Bonzini	4f4e53dd	2004-05-24 10:50:45 +0000	[diff] [blame]	23	#include "internal.h"
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	24
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	25	/* Character set handling for C-family languages.
				26
				27	Terminological note: In what follows, "charset" or "character set"
				28	will be taken to mean both an abstract set of characters and an
				29	encoding for that set.
				30
				31	The C99 standard discusses two character sets: source and execution.
				32	The source character set is used for internal processing in translation
				33	phases 1 through 4; the execution character set is used thereafter.
				34	Both are required by 5.2.1.2p1 to be multibyte encodings, not wide
				35	character encodings (see 3.7.2, 3.7.3 for the standardese meanings
				36	of these terms). Furthermore, the "basic character set" (listed in
				37	5.2.1p3) is to be encoded in each with values one byte wide, and is
				38	to appear in the initial shift state.
				39
				40	It is not explicitly mentioned, but there is also a "wide execution
				41	character set" used to encode wide character constants and wide
				42	string literals; this is supposed to be the result of applying the
				43	standard library function mbstowcs() to an equivalent narrow string
				44	(6.4.5p5). However, the behavior of hexadecimal and octal
				45	\-escapes is at odds with this; they are supposed to be translated
				46	directly to wchar_t values (6.4.4.4p5,6).
				47
				48	The source character set is not necessarily the character set used
				49	to encode physical source files on disk; translation phase 1 converts
				50	from whatever that encoding is to the source character set.
				51
				52	The presence of universal character names in C99 (6.4.3 et seq.)
				53	forces the source character set to be isomorphic to ISO 10646,
				54	that is, Unicode. There is no such constraint on the execution
				55	character set; note also that the conversion from source to
				56	execution character set does not occur for identifiers (5.1.1.2p1#5).
				57
				58	For convenience of implementation, the source character set's
				59	encoding of the basic character set should be identical to the
				60	execution character set OF THE HOST SYSTEM's encoding of the basic
				61	character set, and it should not be a state-dependent encoding.
				62
				63	cpplib uses UTF-8 or UTF-EBCDIC for the source character set,
				64	depending on whether the host is based on ASCII or EBCDIC (see
				65	respectively Unicode section 2.3/ISO10646 Amendment 2, and Unicode
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	66	Technical Report #16). With limited exceptions, it relies on the
				67	system library's iconv() primitive to do charset conversion
				68	(specified in SUSv2). */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	69
				70	#if !HAVE_ICONV
				71	/* Make certain that the uses of iconv(), iconv_open(), iconv_close()
				72	below, which are guarded only by if statements with compile-time
				73	constant conditions, do not cause link errors. */
				74	#define iconv_open(x, y) (errno = EINVAL, (iconv_t)-1)
Zack Weinberg	f1c4bc4	2003-07-05 16:44:29 +0200	[diff] [blame]	75	#define iconv(a,b,c,d,e) (errno = EINVAL, (size_t)-1)
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	76	#define iconv_close(x) (void)0
Andrew Pinski	5beadb3	2003-07-07 04:46:29 +0000	[diff] [blame]	77	#define ICONV_CONST
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	78	#endif
				79
				80	#if HOST_CHARSET == HOST_CHARSET_ASCII
				81	#define SOURCE_CHARSET "UTF-8"
Zack Weinberg	c5ff069	2005-02-20 17:01:32 +0000	[diff] [blame]	82	#define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0x7e
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	83	#elif HOST_CHARSET == HOST_CHARSET_EBCDIC
				84	#define SOURCE_CHARSET "UTF-EBCDIC"
Zack Weinberg	c5ff069	2005-02-20 17:01:32 +0000	[diff] [blame]	85	#define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0xFF
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	86	#else
				87	#error "Unrecognized basic host character set"
				88	#endif
				89
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	90	#ifndef EILSEQ
				91	#define EILSEQ EINVAL
				92	#endif
				93
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	94	/* This structure is used for a resizable string buffer throughout. */
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	95	/* Don't call it strbuf, as that conflicts with unistd.h on systems
Kazu Hirata	0ee55ad	2003-10-05 13:09:48 +0000	[diff] [blame]	96	such as DYNIX/ptx where unistd.h includes stropts.h. */
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	97	struct _cpp_strbuf
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	98	{
				99	uchar *text;	/* Start of the buffer; the converters below grow it with XRESIZEVEC. */
				100	size_t asize;	/* Number of bytes currently allocated for TEXT. */
				101	size_t len;	/* Number of bytes of TEXT already filled; converters append at TEXT + LEN. */
				102	};
				103
				104	/* This is enough to hold any string that fits on a single 80-column
				105	line, even if iconv quadruples its size (e.g. conversion from
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	106	ASCII to UTF-32) rounded up to a power of two. */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	107	#define OUTBUF_BLOCK_SIZE 256
				108
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	109	/* Conversions between UTF-8 and UTF-16/32 are implemented by custom
				110	logic. This is because a depressing number of systems lack iconv,
				111	or have iconv libraries that do not do these conversions, so
				112	we need a fallback implementation for them. To ensure the fallback
				113	doesn't break due to neglect, it is used on all systems.
				114
				115	UTF-32 encoding is nice and simple: a four-byte binary number,
				116	constrained to the range 00000000-7FFFFFFF to avoid questions of
				117	signedness. We do have to cope with big- and little-endian
				118	variants.
				119
				120	UTF-16 encoding uses two-byte binary numbers, again in big- and
				121	little-endian variants, for all values in the 00000000-0000FFFF
				122	range. Values in the 00010000-0010FFFF range are encoded as pairs
				123	of two-byte numbers, called "surrogate pairs": given a number S in
				124	this range, it is mapped to a pair (H, L) as follows:
				125
				126	H = (S - 0x10000) / 0x400 + 0xD800
				127	L = (S - 0x10000) % 0x400 + 0xDC00
				128
				129	Two-byte values in the D800...DFFF range are ill-formed except as a
				130	component of a surrogate pair. Even if the encoding within a
				131	two-byte value is little-endian, the H member of the surrogate pair
				132	comes first.
				133
				134	There is no way to encode values in the 00110000-7FFFFFFF range,
				135	which is not currently a problem as there are no assigned code
				136	points in that range; however, the author expects that it will
				137	eventually become necessary to abandon UTF-16 due to this
				138	limitation. Note also that, because of these pairs, UTF-16 does
				139	not meet the requirements of the C standard for a wide character
				140	encoding (see 3.7.3 and 6.4.4.4p11).
				141
				142	UTF-8 encoding looks like this:
				143
				144	value range encoded as
				145	00000000-0000007F 0xxxxxxx
				146	00000080-000007FF 110xxxxx 10xxxxxx
				147	00000800-0000FFFF 1110xxxx 10xxxxxx 10xxxxxx
				148	00010000-001FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
				149	00200000-03FFFFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
				150	04000000-7FFFFFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
				151
				152	Values in the 0000D800 ... 0000DFFF range (surrogates) are invalid,
				153	which means that three-byte sequences ED xx yy, with A0 <= xx <= BF,
				154	never occur. Note also that any value that can be encoded by a
				155	given row of the table can also be encoded by all successive rows,
				156	but this is not done; only the shortest possible encoding for any
				157	given value is valid. For instance, the character 07C0 could be
				158	encoded as any of DF 80, E0 9F 80, F0 80 9F 80, F8 80 80 9F 80, or
				159	FC 80 80 80 9F 80. Only the first is valid.
				160
				161	An implementation note: the transformation from UTF-16 to UTF-8, or
				162	vice versa, is easiest done by using UTF-32 as an intermediary. */
				163
				164	/* Internal primitives which go from an UTF-8 byte stream to native-endian
				165	UTF-32 in a cppchar_t, or vice versa; this avoids an extra marshal/unmarshal
				166	operation in several places below. */
				167	static inline int
				168	one_utf8_to_cppchar (const uchar *inbufp, size_t inbytesleftp,
				169	cppchar_t *cp)
				170	{
Joseph Myers	9e322bc	2009-05-03 12:59:26 +0100	[diff] [blame]	171	static const uchar masks[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x03, 0x01 };
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	172	static const uchar patns[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	173
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	174	cppchar_t c;
				175	const uchar inbuf = inbufp;
				176	size_t nbytes, i;
				177
				178	if (*inbytesleftp < 1)
				179	return EINVAL;
				180
				181	c = *inbuf;
				182	if (c < 0x80)
				183	{
				184	*cp = c;
				185	*inbytesleftp -= 1;
				186	*inbufp += 1;
				187	return 0;
				188	}
				189
				190	/* The number of leading 1-bits in the first byte indicates how many
				191	bytes follow. */
				192	for (nbytes = 2; nbytes < 7; nbytes++)
				193	if ((c & ~masks[nbytes-1]) == patns[nbytes-1])
				194	goto found;
				195	return EILSEQ;
				196	found:
				197
				198	if (*inbytesleftp < nbytes)
				199	return EINVAL;
				200
				201	c = (c & masks[nbytes-1]);
				202	inbuf++;
				203	for (i = 1; i < nbytes; i++)
				204	{
				205	cppchar_t n = *inbuf++;
				206	if ((n & 0xC0) != 0x80)
				207	return EILSEQ;
				208	c = ((c << 6) + (n & 0x3F));
				209	}
				210
				211	/* Make sure the shortest possible encoding was used. */
				212	if (c <= 0x7F && nbytes > 1) return EILSEQ;
				213	if (c <= 0x7FF && nbytes > 2) return EILSEQ;
				214	if (c <= 0xFFFF && nbytes > 3) return EILSEQ;
				215	if (c <= 0x1FFFFF && nbytes > 4) return EILSEQ;
				216	if (c <= 0x3FFFFFF && nbytes > 5) return EILSEQ;
				217
				218	/* Make sure the character is valid. */
				219	if (c > 0x7FFFFFFF \|\| (c >= 0xD800 && c <= 0xDFFF)) return EILSEQ;
				220
				221	*cp = c;
				222	*inbufp = inbuf;
				223	*inbytesleftp -= nbytes;
				224	return 0;
				225	}
				226
				227	static inline int
				228	one_cppchar_to_utf8 (cppchar_t c, uchar *outbufp, size_t outbytesleftp)
				229	{
				230	static const uchar masks[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
				231	static const uchar limits[6] = { 0x80, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE };
				232	size_t nbytes;
				233	uchar buf[6], *p = &buf[6];
				234	uchar outbuf = outbufp;
				235
				236	nbytes = 1;
				237	if (c < 0x80)
				238	*--p = c;
				239	else
				240	{
				241	do
				242	{
				243	*--p = ((c & 0x3F) \| 0x80);
				244	c >>= 6;
				245	nbytes++;
				246	}
				247	while (c >= 0x3F \|\| (c & limits[nbytes-1]));
				248	*--p = (c \| masks[nbytes-1]);
				249	}
				250
				251	if (*outbytesleftp < nbytes)
				252	return E2BIG;
				253
				254	while (p < &buf[6])
				255	outbuf++ = p++;
				256	*outbytesleftp -= nbytes;
				257	*outbufp = outbuf;
				258	return 0;
				259	}
				260
				261	/* The following four functions transform one character between the two
				262	encodings named in the function name. All have the signature
				263	int (*)(iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
				264	uchar **outbufp, size_t *outbytesleftp)
				265
				266	BIGEND must have the value 0 or 1, coerced to (iconv_t); it is
				267	interpreted as a boolean indicating whether big-endian or
				268	little-endian encoding is to be used for the member of the pair
				269	that is not UTF-8.
				270
				271	INBUFP, INBYTESLEFTP, OUTBUFP, OUTBYTESLEFTP work exactly as they
				272	do for iconv.
				273
				274	The return value is either 0 for success, or an errno value for
				275	failure, which may be E2BIG (need more space), EILSEQ (ill-formed
				276	input sequence), or EINVAL (incomplete input sequence). */
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	277
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	278	static inline int
				279	one_utf8_to_utf32 (iconv_t bigend, const uchar *inbufp, size_t inbytesleftp,
				280	uchar *outbufp, size_t outbytesleftp)
				281	{
				282	uchar *outbuf;
Jan Hubicka	bd18496	2003-10-25 17:12:01 +0200	[diff] [blame]	283	cppchar_t s = 0;
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	284	int rval;
				285
				286	/* Check for space first, since we know exactly how much we need. */
				287	if (*outbytesleftp < 4)
				288	return E2BIG;
				289
				290	rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
				291	if (rval)
				292	return rval;
				293
				294	outbuf = *outbufp;
				295	outbuf[bigend ? 3 : 0] = (s & 0x000000FF);
				296	outbuf[bigend ? 2 : 1] = (s & 0x0000FF00) >> 8;
				297	outbuf[bigend ? 1 : 2] = (s & 0x00FF0000) >> 16;
				298	outbuf[bigend ? 0 : 3] = (s & 0xFF000000) >> 24;
				299
				300	*outbufp += 4;
				301	*outbytesleftp -= 4;
				302	return 0;
				303	}
				304
				305	static inline int
				306	one_utf32_to_utf8 (iconv_t bigend, const uchar *inbufp, size_t inbytesleftp,
				307	uchar *outbufp, size_t outbytesleftp)
				308	{
				309	cppchar_t s;
				310	int rval;
				311	const uchar *inbuf;
				312
				313	if (*inbytesleftp < 4)
				314	return EINVAL;
				315
				316	inbuf = *inbufp;
				317
				318	s = inbuf[bigend ? 0 : 3] << 24;
				319	s += inbuf[bigend ? 1 : 2] << 16;
				320	s += inbuf[bigend ? 2 : 1] << 8;
				321	s += inbuf[bigend ? 3 : 0];
				322
				323	if (s >= 0x7FFFFFFF \|\| (s >= 0xD800 && s <= 0xDFFF))
				324	return EILSEQ;
				325
				326	rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
				327	if (rval)
				328	return rval;
				329
				330	*inbufp += 4;
				331	*inbytesleftp -= 4;
				332	return 0;
				333	}
				334
				335	static inline int
				336	one_utf8_to_utf16 (iconv_t bigend, const uchar *inbufp, size_t inbytesleftp,
				337	uchar *outbufp, size_t outbytesleftp)
				338	{
				339	int rval;
Richard Henderson	671ca9e	2003-10-30 08:36:27 -0800	[diff] [blame]	340	cppchar_t s = 0;
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	341	const uchar save_inbuf = inbufp;
				342	size_t save_inbytesleft = *inbytesleftp;
				343	uchar outbuf = outbufp;
				344
				345	rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
				346	if (rval)
				347	return rval;
				348
				349	if (s > 0x0010FFFF)
				350	{
				351	*inbufp = save_inbuf;
				352	*inbytesleftp = save_inbytesleft;
				353	return EILSEQ;
				354	}
				355
Joseph Myers	81fee4a	2014-11-29 01:56:06 +0000	[diff] [blame]	356	if (s <= 0xFFFF)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	357	{
				358	if (*outbytesleftp < 2)
				359	{
				360	*inbufp = save_inbuf;
				361	*inbytesleftp = save_inbytesleft;
				362	return E2BIG;
				363	}
				364	outbuf[bigend ? 1 : 0] = (s & 0x00FF);
				365	outbuf[bigend ? 0 : 1] = (s & 0xFF00) >> 8;
				366
				367	*outbufp += 2;
				368	*outbytesleftp -= 2;
				369	return 0;
				370	}
				371	else
				372	{
				373	cppchar_t hi, lo;
				374
				375	if (*outbytesleftp < 4)
				376	{
				377	*inbufp = save_inbuf;
				378	*inbytesleftp = save_inbytesleft;
				379	return E2BIG;
				380	}
				381
				382	hi = (s - 0x10000) / 0x400 + 0xD800;
				383	lo = (s - 0x10000) % 0x400 + 0xDC00;
				384
				385	/* Even if we are little-endian, put the high surrogate first.
				386	??? Matches practice? */
				387	outbuf[bigend ? 1 : 0] = (hi & 0x00FF);
				388	outbuf[bigend ? 0 : 1] = (hi & 0xFF00) >> 8;
				389	outbuf[bigend ? 3 : 2] = (lo & 0x00FF);
				390	outbuf[bigend ? 2 : 3] = (lo & 0xFF00) >> 8;
				391
				392	*outbufp += 4;
				393	*outbytesleftp -= 4;
				394	return 0;
				395	}
				396	}
				397
				398	static inline int
				399	one_utf16_to_utf8 (iconv_t bigend, const uchar *inbufp, size_t inbytesleftp,
				400	uchar *outbufp, size_t outbytesleftp)
				401	{
				402	cppchar_t s;
				403	const uchar inbuf = inbufp;
				404	int rval;
				405
				406	if (*inbytesleftp < 2)
				407	return EINVAL;
				408	s = inbuf[bigend ? 0 : 1] << 8;
				409	s += inbuf[bigend ? 1 : 0];
				410
				411	/* Low surrogate without immediately preceding high surrogate is invalid. */
				412	if (s >= 0xDC00 && s <= 0xDFFF)
				413	return EILSEQ;
				414	/* High surrogate must have a following low surrogate. */
				415	else if (s >= 0xD800 && s <= 0xDBFF)
				416	{
				417	cppchar_t hi = s, lo;
				418	if (*inbytesleftp < 4)
				419	return EINVAL;
				420
				421	lo = inbuf[bigend ? 2 : 3] << 8;
				422	lo += inbuf[bigend ? 3 : 2];
				423
				424	if (lo < 0xDC00 \|\| lo > 0xDFFF)
				425	return EILSEQ;
				426
				427	s = (hi - 0xD800) * 0x400 + (lo - 0xDC00) + 0x10000;
				428	}
				429
				430	rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
				431	if (rval)
				432	return rval;
				433
				434	/* Success - update the input pointers (one_cppchar_to_utf8 has done
				435	the output pointers for us). */
				436	if (s <= 0xFFFF)
				437	{
				438	*inbufp += 2;
				439	*inbytesleftp -= 2;
				440	}
				441	else
				442	{
				443	*inbufp += 4;
				444	*inbytesleftp -= 4;
				445	}
				446	return 0;
				447	}
				448
				449	/* Helper routine for the next few functions. The 'const' on
				450	one_conversion means that we promise not to modify what function is
Kazu Hirata	4ed4321	2003-07-12 22:49:48 +0000	[diff] [blame]	451	pointed to, which lets the inliner see through it. */
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	452
				453	static inline bool
				454	conversion_loop (int (const one_conversion)(iconv_t, const uchar , size_t ,
				455	uchar *, size_t ),
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	456	iconv_t cd, const uchar from, size_t flen, struct _cpp_strbuf to)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	457	{
				458	const uchar *inbuf;
				459	uchar *outbuf;
				460	size_t inbytesleft, outbytesleft;
				461	int rval;
				462
				463	inbuf = from;
				464	inbytesleft = flen;
				465	outbuf = to->text + to->len;
				466	outbytesleft = to->asize - to->len;
				467
				468	for (;;)
				469	{
				470	do
				471	rval = one_conversion (cd, &inbuf, &inbytesleft,
				472	&outbuf, &outbytesleft);
				473	while (inbytesleft && !rval);
				474
				475	if (__builtin_expect (inbytesleft == 0, 1))
				476	{
				477	to->len = to->asize - outbytesleft;
				478	return true;
				479	}
				480	if (rval != E2BIG)
				481	{
				482	errno = rval;
				483	return false;
				484	}
				485
				486	outbytesleft += OUTBUF_BLOCK_SIZE;
				487	to->asize += OUTBUF_BLOCK_SIZE;
Gabriel Dos Reis	c3f829c	2005-05-28 15:52:48 +0000	[diff] [blame]	488	to->text = XRESIZEVEC (uchar, to->text, to->asize);
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	489	outbuf = to->text + to->asize - outbytesleft;
				490	}
				491	}
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	492
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	493
				494	/* These functions convert entire strings between character sets.
				495	They all have the signature
				496
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	497	bool (*)(iconv_t cd, const uchar *from, size_t flen, struct _cpp_strbuf *to);
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	498
				499	The input string FROM is converted as specified by the function
				500	name plus the iconv descriptor CD (which may be fake), and the
				501	result appended to TO. On any error, false is returned, otherwise true. */
				502
				503	/* These four use the custom conversion code above. */
				504	static bool
				505	convert_utf8_utf16 (iconv_t cd, const uchar *from, size_t flen,
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	506	struct _cpp_strbuf *to)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	507	{
					/* Append the FLEN bytes at FROM to TO, converting UTF-8 to UTF-16.
					   CD is handed to one_utf8_to_utf16 as its BIGEND flag.  The
					   result is whatever conversion_loop returns.  */
				508	return conversion_loop (one_utf8_to_utf16, cd, from, flen, to);
				509	}
				510
				511	static bool
				512	convert_utf8_utf32 (iconv_t cd, const uchar *from, size_t flen,
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	513	struct _cpp_strbuf *to)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	514	{
					/* Append the FLEN bytes at FROM to TO, converting UTF-8 to UTF-32.
					   CD is handed to one_utf8_to_utf32 as its BIGEND flag.  The
					   result is whatever conversion_loop returns.  */
				515	return conversion_loop (one_utf8_to_utf32, cd, from, flen, to);
				516	}
				517
				518	static bool
				519	convert_utf16_utf8 (iconv_t cd, const uchar *from, size_t flen,
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	520	struct _cpp_strbuf *to)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	521	{
					/* Append the FLEN bytes at FROM to TO, converting UTF-16 to UTF-8.
					   CD is handed to one_utf16_to_utf8 as its BIGEND flag.  The
					   result is whatever conversion_loop returns.  */
				522	return conversion_loop (one_utf16_to_utf8, cd, from, flen, to);
				523	}
				524
				525	static bool
				526	convert_utf32_utf8 (iconv_t cd, const uchar *from, size_t flen,
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	527	struct _cpp_strbuf *to)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	528	{
					/* Append the FLEN bytes at FROM to TO, converting UTF-32 to UTF-8.
					   CD is handed to one_utf32_to_utf8 as its BIGEND flag.  The
					   result is whatever conversion_loop returns.  */
				529	return conversion_loop (one_utf32_to_utf8, cd, from, flen, to);
				530	}
				531
				532	/* Identity conversion, used when we have no alternative. */
				533	static bool
				534	convert_no_conversion (iconv_t cd ATTRIBUTE_UNUSED,
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	535	const uchar from, size_t flen, struct _cpp_strbuf to)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	536	{
				537	if (to->len + flen > to->asize)
				538	{
				539	to->asize = to->len + flen;
Bernd Edlinger	dc257367	2014-10-02 00:06:28 +0000	[diff] [blame]	540	to->asize += to->asize / 4;
Gabriel Dos Reis	c3f829c	2005-05-28 15:52:48 +0000	[diff] [blame]	541	to->text = XRESIZEVEC (uchar, to->text, to->asize);
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	542	}
				543	memcpy (to->text + to->len, from, flen);
				544	to->len += flen;
				545	return true;
				546	}
				547
				548	/* And this one uses the system iconv primitive. It's a little
				549	different, since iconv's interface is a little different. */
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	550	#if HAVE_ICONV
DJ Delorie	675575f	2007-12-11 20:38:10 -0500	[diff] [blame]	551	
					/* Enlarge TO by OUTBUF_BLOCK_SIZE bytes and recompute OUTBUF from
					   the (possibly moved) TO->text.  Only usable inside
					   convert_using_iconv: it refers to that function's locals OUTBUF
					   and OUTBYTESLEFT and its parameter TO by name.  */
				552	#define CONVERT_ICONV_GROW_BUFFER \
				553	do { \
				554	outbytesleft += OUTBUF_BLOCK_SIZE; \
				555	to->asize += OUTBUF_BLOCK_SIZE; \
				556	to->text = XRESIZEVEC (uchar, to->text, to->asize); \
				557	outbuf = (char *)to->text + to->asize - outbytesleft; \
				558	} while (0)
				559
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	560	static bool
				561	convert_using_iconv (iconv_t cd, const uchar *from, size_t flen,
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	562	struct _cpp_strbuf *to)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	563	{
					/* Convert the FLEN bytes at FROM with the iconv descriptor CD and
					   append the result to TO, growing TO as needed.  Returns true once
					   all input is consumed and the final shift state has been flushed,
					   with TO->len updated; returns false on any iconv failure other
					   than E2BIG, leaving TO->len as it was.  */
				564	ICONV_CONST char *inbuf;
				565	char *outbuf;
				566	size_t inbytesleft, outbytesleft;
				567
				568	/* Reset conversion descriptor and check that it is valid. */
				569	if (iconv (cd, 0, 0, 0, 0) == (size_t)-1)
				570	return false;
				571
				572	inbuf = (ICONV_CONST char *)from;
				573	inbytesleft = flen;
				574	outbuf = (char *)to->text + to->len;
				575	outbytesleft = to->asize - to->len;
				576
				577	for (;;)
				578	{
					/* The return value is deliberately not examined here: completion
					   is detected by INBYTESLEFT reaching zero, and anything else is
					   classified through errno below.  */
				579	iconv (cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
				580	if (__builtin_expect (inbytesleft == 0, 1))
				581	{
DJ Delorie	675575f	2007-12-11 20:38:10 -0500	[diff] [blame]	582	/* Close out any shift states, returning to the initial state. */
				583	if (iconv (cd, 0, 0, &outbuf, &outbytesleft) == (size_t)-1)
				584	{
				585	if (errno != E2BIG)
				586	return false;
				587
					/* The flush needed more room: grow once and retry; a second
					   failure is treated as fatal.  */
				588	CONVERT_ICONV_GROW_BUFFER;
				589	if (iconv (cd, 0, 0, &outbuf, &outbytesleft) == (size_t)-1)
				590	return false;
				591	}
				592
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	593	to->len = to->asize - outbytesleft;
				594	return true;
				595	}
					/* Input remains: only running out of output space is recoverable.  */
				596	if (errno != E2BIG)
				597	return false;
				598
DJ Delorie	675575f	2007-12-11 20:38:10 -0500	[diff] [blame]	599	CONVERT_ICONV_GROW_BUFFER;
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	600	}
				601	}
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	602	#else
				603	#define convert_using_iconv 0 /* prevent undefined symbol error below */
				604	#endif
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	605
				606	/* Arrange for the above custom conversion logic to be used automatically
				607	when conversion between a suitable pair of character sets is requested. */
				608
				609	#define APPLY_CONVERSION(CONVERTER, FROM, FLEN, TO) \
				610	CONVERTER.func (CONVERTER.cd, FROM, FLEN, TO)
				611
Jan Hubicka	d87fc69	2014-09-22 21:43:02 +0200	[diff] [blame]	612	struct cpp_conversion
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	613	{
				614	const char *pair;	/* "FROM/TO" charset names; init_iconv_desc compares this with strcasecmp. */
				615	convert_f func;	/* Custom converter to use for this pair. */
				616	iconv_t fake_cd;	/* Not a real descriptor: 0 or 1, read by the converter as its BIGEND flag. */
				617	};
Jan Hubicka	d87fc69	2014-09-22 21:43:02 +0200	[diff] [blame]	618	static const struct cpp_conversion conversion_tab[] = {
					/* Pairs handled by the custom UTF-8 <-> UTF-16/32 code above.  The
					   last field is 0 for the little-endian and 1 for the big-endian
					   variant of the non-UTF-8 side.  */
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	619	{ "UTF-8/UTF-32LE", convert_utf8_utf32, (iconv_t)0 },
				620	{ "UTF-8/UTF-32BE", convert_utf8_utf32, (iconv_t)1 },
				621	{ "UTF-8/UTF-16LE", convert_utf8_utf16, (iconv_t)0 },
				622	{ "UTF-8/UTF-16BE", convert_utf8_utf16, (iconv_t)1 },
				623	{ "UTF-32LE/UTF-8", convert_utf32_utf8, (iconv_t)0 },
				624	{ "UTF-32BE/UTF-8", convert_utf32_utf8, (iconv_t)1 },
				625	{ "UTF-16LE/UTF-8", convert_utf16_utf8, (iconv_t)0 },
				626	{ "UTF-16BE/UTF-8", convert_utf16_utf8, (iconv_t)1 },
				627	};
				628
				629	/* Subroutine of cpp_init_iconv: initialize and return a
				630	cset_converter structure for conversion from FROM to TO. If
				631	iconv_open() fails, issue an error and return an identity
				632	converter. Silently return an identity converter if FROM and TO
				633	are identical. */
				634	static struct cset_converter
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	635	init_iconv_desc (cpp_reader pfile, const char to, const char *from)
				636	{
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	637	struct cset_converter ret;
				638	char *pair;
				639	size_t i;
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	640
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	641	if (!strcasecmp (to, from))
				642	{
				643	ret.func = convert_no_conversion;
				644	ret.cd = (iconv_t) -1;
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	645	ret.width = -1;
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	646	return ret;
				647	}
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	648
Gabriel Dos Reis	c3f829c	2005-05-28 15:52:48 +0000	[diff] [blame]	649	pair = (char *) alloca(strlen(to) + strlen(from) + 2);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	650
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	651	strcpy(pair, from);
				652	strcat(pair, "/");
				653	strcat(pair, to);
				654	for (i = 0; i < ARRAY_SIZE (conversion_tab); i++)
				655	if (!strcasecmp (pair, conversion_tab[i].pair))
				656	{
				657	ret.func = conversion_tab[i].func;
				658	ret.cd = conversion_tab[i].fake_cd;
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	659	ret.width = -1;
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	660	return ret;
				661	}
				662
				663	/* No custom converter - try iconv. */
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	664	if (HAVE_ICONV)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	665	{
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	666	ret.func = convert_using_iconv;
				667	ret.cd = iconv_open (to, from);
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	668	ret.width = -1;
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	669
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	670	if (ret.cd == (iconv_t) -1)
				671	{
				672	if (errno == EINVAL)
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	673	cpp_error (pfile, CPP_DL_ERROR, /* FIXME should be DL_SORRY */
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	674	"conversion from %s to %s not supported by iconv",
				675	from, to);
				676	else
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	677	cpp_errno (pfile, CPP_DL_ERROR, "iconv_open");
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	678
				679	ret.func = convert_no_conversion;
				680	}
				681	}
				682	else
				683	{
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	684	cpp_error (pfile, CPP_DL_ERROR, /* FIXME: should be DL_SORRY */
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	685	"no iconv implementation, cannot convert from %s to %s",
				686	from, to);
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	687	ret.func = convert_no_conversion;
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	688	ret.cd = (iconv_t) -1;
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	689	ret.width = -1;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	690	}
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	691	return ret;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	692	}
				693
				694	/* If charset conversion is requested, initialize iconv(3) descriptors
				695	for conversion from the source character set to the execution
				696	character sets. If iconv is not present in the C library, and
				697	conversion is requested, issue an error. */
				698
				699	void
				700	cpp_init_iconv (cpp_reader *pfile)
				701	{
				702	const char *ncset = CPP_OPTION (pfile, narrow_charset);
				703	const char *wcset = CPP_OPTION (pfile, wide_charset);
				704	const char *default_wcset;
				705
				706	bool be = CPP_OPTION (pfile, bytes_big_endian);
				707
				708	if (CPP_OPTION (pfile, wchar_precision) >= 32)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	709	default_wcset = be ? "UTF-32BE" : "UTF-32LE";
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	710	else if (CPP_OPTION (pfile, wchar_precision) >= 16)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	711	default_wcset = be ? "UTF-16BE" : "UTF-16LE";
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	712	else
				713	/* This effectively means that wide strings are not supported,
				714	so don't do any conversion at all. */
				715	default_wcset = SOURCE_CHARSET;
				716
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	717	if (!ncset)
				718	ncset = SOURCE_CHARSET;
				719	if (!wcset)
				720	wcset = default_wcset;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	721
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	722	pfile->narrow_cset_desc = init_iconv_desc (pfile, ncset, SOURCE_CHARSET);
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	723	pfile->narrow_cset_desc.width = CPP_OPTION (pfile, char_precision);
Jakub Jelinek	2c6e3f5	2009-10-19 23:41:15 +0200	[diff] [blame]	724	pfile->utf8_cset_desc = init_iconv_desc (pfile, "UTF-8", SOURCE_CHARSET);
				725	pfile->utf8_cset_desc.width = CPP_OPTION (pfile, char_precision);
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	726	pfile->char16_cset_desc = init_iconv_desc (pfile,
				727	be ? "UTF-16BE" : "UTF-16LE",
				728	SOURCE_CHARSET);
				729	pfile->char16_cset_desc.width = 16;
				730	pfile->char32_cset_desc = init_iconv_desc (pfile,
				731	be ? "UTF-32BE" : "UTF-32LE",
				732	SOURCE_CHARSET);
				733	pfile->char32_cset_desc.width = 32;
Zack Weinberg	dea55da	2003-07-11 05:49:47 +0000	[diff] [blame]	734	pfile->wide_cset_desc = init_iconv_desc (pfile, wcset, SOURCE_CHARSET);
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	735	pfile->wide_cset_desc.width = CPP_OPTION (pfile, wchar_precision);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	736	}
				737
Zack Weinberg	a29f62d	2004-09-18 00:56:19 +0000	[diff] [blame]	738	/* Destroy iconv(3) descriptors set up by cpp_init_iconv, if necessary. */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	739	void
				740	_cpp_destroy_iconv (cpp_reader *pfile)
				741	{
				742	if (HAVE_ICONV)
				743	{
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	744	if (pfile->narrow_cset_desc.func == convert_using_iconv)
				745	iconv_close (pfile->narrow_cset_desc.cd);
Jakub Jelinek	2c6e3f5	2009-10-19 23:41:15 +0200	[diff] [blame]	746	if (pfile->utf8_cset_desc.func == convert_using_iconv)
				747	iconv_close (pfile->utf8_cset_desc.cd);
				748	if (pfile->char16_cset_desc.func == convert_using_iconv)
				749	iconv_close (pfile->char16_cset_desc.cd);
				750	if (pfile->char32_cset_desc.func == convert_using_iconv)
				751	iconv_close (pfile->char32_cset_desc.cd);
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	752	if (pfile->wide_cset_desc.func == convert_using_iconv)
				753	iconv_close (pfile->wide_cset_desc.cd);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	754	}
				755	}
				756
Zack Weinberg	c5ff069	2005-02-20 17:01:32 +0000	[diff] [blame]	757	/* Utility routine for use by a full compiler. C is a character taken
				758	from the basic source character set, encoded in the host's
				759	execution encoding. Convert it to (the target's) execution
				760	encoding, and return that value.
				761
				762	Issues an internal error if C's representation in the narrow
				763	execution character set fails to be a single-byte value (C99
				764	5.2.1p3: "The representation of each member of the source and
				765	execution character sets shall fit in a byte.") May also issue an
				766	internal error if C fails to be a member of the basic source
				767	character set (testing this exactly is too hard, especially when
				768	the host character set is EBCDIC). */
				769	cppchar_t
				770	cpp_host_to_exec_charset (cpp_reader *pfile, cppchar_t c)
				771	{
				772	uchar sbuf[1];
				773	struct _cpp_strbuf tbuf;
				774
				775	/* This test is merely an approximation, but it suffices to catch
				776	the most important thing, which is that we don't get handed a
				777	character outside the unibyte range of the host character set. */
				778	if (c > LAST_POSSIBLY_BASIC_SOURCE_CHAR)
				779	{
				780	cpp_error (pfile, CPP_DL_ICE,
				781	"character 0x%lx is not in the basic source character set\n",
				782	(unsigned long)c);
				783	return 0;
				784	}
				785
				786	/* Being a character in the unibyte range of the host character set,
				787	we can safely splat it into a one-byte buffer and trust that that
				788	is a well-formed string. */
				789	sbuf[0] = c;
				790
				791	/* This should never need to reallocate, but just in case... */
				792	tbuf.asize = 1;
Gabriel Dos Reis	c3f829c	2005-05-28 15:52:48 +0000	[diff] [blame]	793	tbuf.text = XNEWVEC (uchar, tbuf.asize);
Zack Weinberg	c5ff069	2005-02-20 17:01:32 +0000	[diff] [blame]	794	tbuf.len = 0;
				795
				796	if (!APPLY_CONVERSION (pfile->narrow_cset_desc, sbuf, 1, &tbuf))
				797	{
				798	cpp_errno (pfile, CPP_DL_ICE, "converting to execution character set");
				799	return 0;
				800	}
				801	if (tbuf.len != 1)
				802	{
				803	cpp_error (pfile, CPP_DL_ICE,
				804	"character 0x%lx is not unibyte in execution character set",
				805	(unsigned long)c);
				806	return 0;
				807	}
				808	c = tbuf.text[0];
				809	free(tbuf.text);
				810	return c;
				811	}
				812
				813
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	814
David Malcolm	88fa555	2016-08-05 18:08:33 +0000	[diff] [blame]	815	/* cpp_substring_ranges's constructor. */
				816
				817	cpp_substring_ranges::cpp_substring_ranges () :
				818	m_ranges (NULL),
				819	m_num_ranges (0),
				820	m_alloc_ranges (8)
				821	{
				822	m_ranges = XNEWVEC (source_range, m_alloc_ranges);
				823	}
				824
				825	/* cpp_substring_ranges's destructor. */
				826
				827	cpp_substring_ranges::~cpp_substring_ranges ()
				828	{
				829	free (m_ranges);
				830	}
				831
				832	/* Add RANGE to the vector of source_range information. */
				833
				834	void
				835	cpp_substring_ranges::add_range (source_range range)
				836	{
				837	if (m_num_ranges >= m_alloc_ranges)
				838	{
				839	m_alloc_ranges *= 2;
				840	m_ranges
				841	= (source_range *)xrealloc (m_ranges,
				842	sizeof (source_range) * m_alloc_ranges);
				843	}
				844	m_ranges[m_num_ranges++] = range;
				845	}
				846
				847	/* Read NUM ranges from LOC_READER, adding them to the vector of source_range
				848	information. */
				849
				850	void
				851	cpp_substring_ranges::add_n_ranges (int num,
				852	cpp_string_location_reader &loc_reader)
				853	{
				854	for (int i = 0; i < num; i++)
				855	add_range (loc_reader.get_next ());
				856	}
				857
				858
				859
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	860	/* Utility routine that computes a mask of the form 0000...111... with
				861	WIDTH 1-bits. */
				862	static inline size_t
				863	width_to_mask (size_t width)
				864	{
				865	width = MIN (width, BITS_PER_CPPCHAR_T);
				866	if (width >= CHAR_BIT * sizeof (size_t))
				867	return ~(size_t) 0;
				868	else
				869	return ((size_t) 1 << width) - 1;
				870	}
				871
/* Types behind the large table of Unicode character information.
   Each flag below occupies its own bit so that flags can be OR-ed
   together in ucnrange::flags.  */
enum {
  /* Valid in a C99 identifier?  */
  C99 = 1 << 0,
  /* Valid in a C99 identifier, but not as the first character?  */
  N99 = 1 << 1,
  /* Valid in a C++ identifier?  */
  CXX = 1 << 2,
  /* Valid in a C11/C++11 identifier?  */
  C11 = 1 << 3,
  /* Valid in a C11/C++11 identifier, but not as the first character?  */
  N11 = 1 << 4,
  /* NFC representation is not valid in an identifier?  */
  CID = 1 << 5,
  /* Might be valid NFC form?  */
  NFC = 1 << 6,
  /* Might be valid NFKC form?  */
  NKC = 1 << 7,
  /* Certain preceding characters might make it not valid NFC/NFKC form?  */
  CTX = 1 << 8
};

/* One table entry, describing a run of consecutive code points that
   share the same properties.  Only the last code point of the run is
   stored.  */
struct ucnrange {
  /* Bitmap of the flags above.  */
  unsigned short flags;
  /* Combining class of the characters in the range.  */
  unsigned char combine;
  /* Last character in the range described by this entry.  */
  unsigned int end;
};
Joseph Myers	d3f4ff8	2013-11-16 00:05:08 +0000	[diff] [blame]	902	#include "ucnid.h"
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame]	903
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	904	/* Returns 1 if C is valid in an identifier, 2 if C is valid except at
				905	the start of an identifier, and 0 if C is not valid in an
				906	identifier. We assume C has already gone through the checks of
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame]	907	_cpp_valid_ucn. Also update NST for C if returning nonzero. The
				908	algorithm is a simple binary search on the table defined in
				909	ucnid.h. */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	910
				911	static int
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame]	912	ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,
				913	struct normalize_state *nst)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	914	{
				915	int mn, mx, md;
Joseph Myers	d3f4ff8	2013-11-16 00:05:08 +0000	[diff] [blame]	916	unsigned short valid_flags, invalid_start_flags;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	917
Joseph Myers	d3f4ff8	2013-11-16 00:05:08 +0000	[diff] [blame]	918	if (c > 0x10FFFF)
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame]	919	return 0;
				920
				921	mn = 0;
				922	mx = ARRAY_SIZE (ucnranges) - 1;
				923	while (mx != mn)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	924	{
				925	md = (mn + mx) / 2;
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame]	926	if (c <= ucnranges[md].end)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	927	mx = md;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	928	else
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame]	929	mn = md + 1;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	930	}
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	931
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	932	/* When -pedantic, we require the character to have been listed by
				933	the standard for the current language. Otherwise, we accept the
Joseph Myers	d3f4ff8	2013-11-16 00:05:08 +0000	[diff] [blame]	934	union of the acceptable sets for all supported language versions. */
				935	valid_flags = C99 \| CXX \| C11;
				936	if (CPP_PEDANTIC (pfile))
				937	{
				938	if (CPP_OPTION (pfile, c11_identifiers))
				939	valid_flags = C11;
				940	else if (CPP_OPTION (pfile, c99))
				941	valid_flags = C99;
				942	else if (CPP_OPTION (pfile, cplusplus))
				943	valid_flags = CXX;
				944	}
				945	if (! (ucnranges[mn].flags & valid_flags))
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame]	946	return 0;
Joseph Myers	d3f4ff8	2013-11-16 00:05:08 +0000	[diff] [blame]	947	if (CPP_OPTION (pfile, c11_identifiers))
				948	invalid_start_flags = N11;
				949	else if (CPP_OPTION (pfile, c99))
				950	invalid_start_flags = N99;
				951	else
				952	invalid_start_flags = 0;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	953
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame]	954	/* Update NST. */
				955	if (ucnranges[mn].combine != 0 && ucnranges[mn].combine < nst->prev_class)
				956	nst->level = normalized_none;
				957	else if (ucnranges[mn].flags & CTX)
				958	{
				959	bool safe;
				960	cppchar_t p = nst->previous;
				961
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame]	962	/* For Hangul, characters in the range AC00-D7A3 are NFC/NFKC,
				963	and are combined algorithmically from a sequence of the form
				964	1100-1112 1161-1175 11A8-11C2
				965	(if the third is not present, it is treated as 11A7, which is not
				966	really a valid character).
				967	Unfortunately, C99 allows (only) the NFC form, but C++ allows
				968	only the combining characters. */
Joseph Myers	d3f4ff8	2013-11-16 00:05:08 +0000	[diff] [blame]	969	if (c >= 0x1161 && c <= 0x1175)
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame]	970	safe = p < 0x1100 \|\| p > 0x1112;
				971	else if (c >= 0x11A8 && c <= 0x11C2)
				972	safe = (p < 0xAC00 \|\| p > 0xD7A3 \|\| (p - 0xAC00) % 28 != 0);
				973	else
Joseph Myers	d3f4ff8	2013-11-16 00:05:08 +0000	[diff] [blame]	974	safe = check_nfc (pfile, c, p);
				975	if (!safe)
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame]	976	{
Joseph Myers	d3f4ff8	2013-11-16 00:05:08 +0000	[diff] [blame]	977	if ((c >= 0x1161 && c <= 0x1175) \|\| (c >= 0x11A8 && c <= 0x11C2))
				978	nst->level = MAX (nst->level, normalized_identifier_C);
				979	else
				980	nst->level = normalized_none;
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame]	981	}
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame]	982	}
				983	else if (ucnranges[mn].flags & NKC)
				984	;
				985	else if (ucnranges[mn].flags & NFC)
				986	nst->level = MAX (nst->level, normalized_C);
				987	else if (ucnranges[mn].flags & CID)
				988	nst->level = MAX (nst->level, normalized_identifier_C);
				989	else
				990	nst->level = normalized_none;
Joseph Myers	d3f4ff8	2013-11-16 00:05:08 +0000	[diff] [blame]	991	if (ucnranges[mn].combine == 0)
				992	nst->previous = c;
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame]	993	nst->prev_class = ucnranges[mn].combine;
				994
Joseph Myers	d3f4ff8	2013-11-16 00:05:08 +0000	[diff] [blame]	995	/* In C99, UCN digits may not begin identifiers. In C11 and C++11,
				996	UCN combining characters may not begin identifiers. */
				997	if (ucnranges[mn].flags & invalid_start_flags)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	998	return 2;
				999
				1000	return 1;
				1001	}
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1002
				1003	/* [lex.charset]: The character designated by the universal character
				1004	name \UNNNNNNNN is that character whose character short name in
				1005	ISO/IEC 10646 is NNNNNNNN; the character designated by the
				1006	universal character name \uNNNN is that character whose character
				1007	short name in ISO/IEC 10646 is 0000NNNN. If the hexadecimal value
Jason Merrill	30c99a9	2009-10-09 20:39:46 -0400	[diff] [blame]	1008	for a universal character name corresponds to a surrogate code point
				1009	(in the range 0xD800-0xDFFF, inclusive), the program is ill-formed.
				1010	Additionally, if the hexadecimal value for a universal-character-name
				1011	outside a character or string literal corresponds to a control character
				1012	(in either of the ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a
				1013	character in the basic source character set, the program is ill-formed.
				1014
				1015	C99 6.4.3: A universal character name shall not specify a character
				1016	whose short identifier is less than 00A0 other than 0024 ($), 0040 (@),
				1017	or 0060 (`), nor one in the range D800 through DFFF inclusive.
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1018
				1019	*PSTR must be preceded by "\u" or "\U"; it is assumed that the
Paolo Carlini	fbb2291	2015-07-02 18:54:41 +0000	[diff] [blame]	1020	buffer end is delimited by a non-hex digit. Returns false if the
				1021	UCN has not been consumed, true otherwise.
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1022
Paolo Carlini	fbb2291	2015-07-02 18:54:41 +0000	[diff] [blame]	1023	The value of the UCN, whether valid or invalid, is returned in *CP.
				1024	Diagnostics are emitted for invalid values. PSTR is updated to point
				1025	one beyond the UCN, or to the syntactically invalid character.
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1026
				1027	IDENTIFIER_POS is 0 when not in an identifier, 1 for the start of
David Malcolm	88fa555	2016-08-05 18:08:33 +0000	[diff] [blame]	1028	an identifier, or 2 otherwise.
				1029
David Malcolm	e7864d6	2016-08-06 18:06:30 +0000	[diff] [blame]	1030	If LOC_READER is non-NULL, then position information is
David Malcolm	88fa555	2016-08-05 18:08:33 +0000	[diff] [blame]	1031	read from LOC_READER and CHAR_RANGE->m_finish is updated accordingly. /
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1032
Paolo Carlini	fbb2291	2015-07-02 18:54:41 +0000	[diff] [blame]	1033	bool
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1034	_cpp_valid_ucn (cpp_reader pfile, const uchar *pstr,
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame]	1035	const uchar *limit, int identifier_pos,
David Malcolm	88fa555	2016-08-05 18:08:33 +0000	[diff] [blame]	1036	struct normalize_state nst, cppchar_t cp,
				1037	source_range *char_range,
				1038	cpp_string_location_reader *loc_reader)
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1039	{
				1040	cppchar_t result, c;
				1041	unsigned int length;
				1042	const uchar str = pstr;
				1043	const uchar *base = str - 2;
				1044
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1045	if (!CPP_OPTION (pfile, cplusplus) && !CPP_OPTION (pfile, c99))
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1046	cpp_error (pfile, CPP_DL_WARNING,
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1047	"universal character names are only valid in C++ and C99");
Marek Polacek	177cce4	2014-08-19 05:34:31 +0000	[diff] [blame]	1048	else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
Marek Polacek	f3bede7	2014-08-10 06:10:49 +0000	[diff] [blame]	1049	&& !CPP_OPTION (pfile, cplusplus))
				1050	cpp_error (pfile, CPP_DL_WARNING,
				1051	"C99's universal character names are incompatible with C90");
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1052	else if (CPP_WTRADITIONAL (pfile) && identifier_pos == 0)
Simon Baldwin	87cf065	2010-04-07 17:18:10 +0000	[diff] [blame]	1053	cpp_warning (pfile, CPP_W_TRADITIONAL,
				1054	"the meaning of '\\%c' is different in traditional C",
				1055	(int) str[-1]);
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1056
				1057	if (str[-1] == 'u')
				1058	length = 4;
				1059	else if (str[-1] == 'U')
				1060	length = 8;
				1061	else
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame]	1062	{
				1063	cpp_error (pfile, CPP_DL_ICE, "In _cpp_valid_ucn but not a UCN");
				1064	length = 4;
				1065	}
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1066
				1067	result = 0;
				1068	do
				1069	{
				1070	c = *str;
				1071	if (!ISXDIGIT (c))
				1072	break;
				1073	str++;
David Malcolm	88fa555	2016-08-05 18:08:33 +0000	[diff] [blame]	1074	if (loc_reader)
David Malcolm	e7864d6	2016-08-06 18:06:30 +0000	[diff] [blame]	1075	{
				1076	gcc_assert (char_range);
				1077	char_range->m_finish = loc_reader->get_next ().m_finish;
				1078	}
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1079	result = (result << 4) + hex_value (c);
				1080	}
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1081	while (--length && str < limit);
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1082
Geoffrey Keating	c79e602	2005-03-16 00:59:31 +0000	[diff] [blame]	1083	/* Partial UCNs are not valid in strings, but decompose into
				1084	multiple tokens in identifiers, so we can't give a helpful
				1085	error message in that case. */
				1086	if (length && identifier_pos)
Paolo Carlini	fbb2291	2015-07-02 18:54:41 +0000	[diff] [blame]	1087	{
				1088	*cp = 0;
				1089	return false;
				1090	}
				1091
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1092	*pstr = str;
				1093	if (length)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1094	{
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1095	cpp_error (pfile, CPP_DL_ERROR,
				1096	"incomplete universal character name %.*s",
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1097	(int) (str - base), base);
				1098	result = 1;
				1099	}
Jason Merrill	30c99a9	2009-10-09 20:39:46 -0400	[diff] [blame]	1100	/* The C99 standard permits $, @ and ` to be specified as UCNs. We use
				1101	hex escapes so that this also works with EBCDIC hosts.
				1102	C++0x permits everything below 0xa0 within literals;
				1103	ucn_valid_in_identifier will complain about identifiers. */
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1104	else if ((result < 0xa0
Jason Merrill	30c99a9	2009-10-09 20:39:46 -0400	[diff] [blame]	1105	&& !CPP_OPTION (pfile, cplusplus)
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1106	&& (result != 0x24 && result != 0x40 && result != 0x60))
				1107	\|\| (result & 0x80000000)
				1108	\|\| (result >= 0xD800 && result <= 0xDFFF))
				1109	{
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1110	cpp_error (pfile, CPP_DL_ERROR,
				1111	"%.*s is not a valid universal character",
Andreas Jaeger	0e7a8c4	2003-04-21 14:06:12 +0200	[diff] [blame]	1112	(int) (str - base), base);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1113	result = 1;
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1114	}
Geoffrey Keating	47e2049	2005-03-12 10:44:06 +0000	[diff] [blame]	1115	else if (identifier_pos && result == 0x24
				1116	&& CPP_OPTION (pfile, dollars_in_ident))
				1117	{
				1118	if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
				1119	{
				1120	CPP_OPTION (pfile, warn_dollars) = 0;
				1121	cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
				1122	}
Joseph Myers	d3f4ff8	2013-11-16 00:05:08 +0000	[diff] [blame]	1123	NORMALIZE_STATE_UPDATE_IDNUM (nst, result);
Geoffrey Keating	47e2049	2005-03-12 10:44:06 +0000	[diff] [blame]	1124	}
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1125	else if (identifier_pos)
				1126	{
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame]	1127	int validity = ucn_valid_in_identifier (pfile, result, nst);
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1128
				1129	if (validity == 0)
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1130	cpp_error (pfile, CPP_DL_ERROR,
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1131	"universal character %.*s is not valid in an identifier",
Andreas Jaeger	0e7a8c4	2003-04-21 14:06:12 +0200	[diff] [blame]	1132	(int) (str - base), base);
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1133	else if (validity == 2 && identifier_pos == 1)
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1134	cpp_error (pfile, CPP_DL_ERROR,
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1135	"universal character %.*s is not valid at the start of an identifier",
Andreas Jaeger	0e7a8c4	2003-04-21 14:06:12 +0200	[diff] [blame]	1136	(int) (str - base), base);
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1137	}
				1138
Paolo Carlini	fbb2291	2015-07-02 18:54:41 +0000	[diff] [blame]	1139	*cp = result;
				1140	return true;
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1141	}
				1142
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1143	/* Convert an UCN, pointed to by FROM, to UTF-8 encoding, then translate
David Malcolm	88fa555	2016-08-05 18:08:33 +0000	[diff] [blame]	1144	it to the execution character set and write the result into TBUF,
				1145	if TBUF is non-NULL.
				1146	An advanced pointer is returned. Issues all relevant diagnostics.
				1147	If LOC_READER is non-NULL, then RANGES must be non-NULL and CHAR_RANGE
				1148	contains the location of the character so far: location information
				1149	is read from LOC_READER, and RANGES is updated accordingly. */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1150	static const uchar *
				1151	convert_ucn (cpp_reader pfile, const uchar from, const uchar *limit,
David Malcolm	88fa555	2016-08-05 18:08:33 +0000	[diff] [blame]	1152	struct _cpp_strbuf *tbuf, struct cset_converter cvt,
				1153	source_range char_range,
				1154	cpp_string_location_reader *loc_reader,
				1155	cpp_substring_ranges *ranges)
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1156	{
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1157	cppchar_t ucn;
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1158	uchar buf[6];
				1159	uchar *bufp = buf;
				1160	size_t bytesleft = 6;
				1161	int rval;
Geoffrey Keating	50668cf	2005-03-15 00:36:33 +0000	[diff] [blame]	1162	struct normalize_state nst = INITIAL_NORMALIZE_STATE;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1163
David Malcolm	88fa555	2016-08-05 18:08:33 +0000	[diff] [blame]	1164	/* loc_reader and ranges must either be both NULL, or both be non-NULL. */
				1165	gcc_assert ((loc_reader != NULL) == (ranges != NULL));
				1166
Kazu Hirata	e0a21ab	2004-01-16 01:44:06 +0000	[diff] [blame]	1167	from++; /* Skip u/U. */
David Malcolm	88fa555	2016-08-05 18:08:33 +0000	[diff] [blame]	1168
				1169	if (loc_reader)
				1170	/* The u/U is part of the spelling of this character. */
				1171	char_range.m_finish = loc_reader->get_next ().m_finish;
				1172
				1173	_cpp_valid_ucn (pfile, &from, limit, 0, &nst,
				1174	&ucn, &char_range, loc_reader);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1175
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1176	rval = one_cppchar_to_utf8 (ucn, &bufp, &bytesleft);
				1177	if (rval)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1178	{
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1179	errno = rval;
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1180	cpp_errno (pfile, CPP_DL_ERROR,
				1181	"converting UCN to source character set");
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1182	}
David Malcolm	88fa555	2016-08-05 18:08:33 +0000	[diff] [blame]	1183	else
				1184	{
				1185	if (tbuf)
				1186	if (!APPLY_CONVERSION (cvt, buf, 6 - bytesleft, tbuf))
				1187	cpp_errno (pfile, CPP_DL_ERROR,
				1188	"converting UCN to execution character set");
				1189
				1190	if (loc_reader)
				1191	{
				1192	int num_encoded_bytes = 6 - bytesleft;
				1193	for (int i = 0; i < num_encoded_bytes; i++)
				1194	ranges->add_range (char_range);
				1195	}
				1196	}
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1197
				1198	return from;
				1199	}
				1200
Zack Weinberg	a29f62d	2004-09-18 00:56:19 +0000	[diff] [blame]	1201	/* Subroutine of convert_hex and convert_oct. N is the representation
				1202	in the execution character set of a numeric escape; write it into the
				1203	string buffer TBUF and update the end-of-string pointer therein. WIDE
				1204	is true if it's a wide string that's being assembled in TBUF. This
				1205	function issues no diagnostics and never fails. */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1206	static void
				1207	emit_numeric_escape (cpp_reader *pfile, cppchar_t n,
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1208	struct _cpp_strbuf *tbuf, struct cset_converter cvt)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1209	{
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1210	size_t width = cvt.width;
				1211
				1212	if (width != CPP_OPTION (pfile, char_precision))
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1213	{
				1214	/* We have to render this into the target byte order, which may not
				1215	be our byte order. */
				1216	bool bigend = CPP_OPTION (pfile, bytes_big_endian);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1217	size_t cwidth = CPP_OPTION (pfile, char_precision);
				1218	size_t cmask = width_to_mask (cwidth);
				1219	size_t nbwc = width / cwidth;
				1220	size_t i;
				1221	size_t off = tbuf->len;
				1222	cppchar_t c;
				1223
				1224	if (tbuf->len + nbwc > tbuf->asize)
				1225	{
				1226	tbuf->asize += OUTBUF_BLOCK_SIZE;
Gabriel Dos Reis	c3f829c	2005-05-28 15:52:48 +0000	[diff] [blame]	1227	tbuf->text = XRESIZEVEC (uchar, tbuf->text, tbuf->asize);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1228	}
				1229
				1230	for (i = 0; i < nbwc; i++)
				1231	{
				1232	c = n & cmask;
				1233	n >>= cwidth;
				1234	tbuf->text[off + (bigend ? nbwc - i - 1 : i)] = c;
				1235	}
				1236	tbuf->len += nbwc;
				1237	}
				1238	else
				1239	{
Zack Weinberg	a29f62d	2004-09-18 00:56:19 +0000	[diff] [blame]	1240	/* Note: this code does not handle the case where the target
				1241	and host have a different number of bits in a byte. */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1242	if (tbuf->len + 1 > tbuf->asize)
				1243	{
				1244	tbuf->asize += OUTBUF_BLOCK_SIZE;
Gabriel Dos Reis	c3f829c	2005-05-28 15:52:48 +0000	[diff] [blame]	1245	tbuf->text = XRESIZEVEC (uchar, tbuf->text, tbuf->asize);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1246	}
				1247	tbuf->text[tbuf->len++] = n;
				1248	}
				1249	}
				1250
				1251	/* Convert a hexadecimal escape, pointed to by FROM, to the execution
David Malcolm	88fa555	2016-08-05 18:08:33 +0000	[diff] [blame]	1252	character set and write it into the string buffer TBUF (if non-NULL).
				1253	Returns an advanced pointer, and issues diagnostics as necessary.
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1254	No character set translation occurs; this routine always produces the
				1255	execution-set character with numeric value equal to the given hex
David Malcolm	88fa555	2016-08-05 18:08:33 +0000	[diff] [blame]	1256	number. You can, e.g. generate surrogate pairs this way.
				1257	If LOC_READER is non-NULL, then RANGES must be non-NULL and CHAR_RANGE
				1258	contains the location of the character so far: location information
				1259	is read from LOC_READER, and RANGES is updated accordingly. */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1260	static const uchar *
				1261	convert_hex (cpp_reader pfile, const uchar from, const uchar *limit,
David Malcolm	88fa555	2016-08-05 18:08:33 +0000	[diff] [blame]	1262	struct _cpp_strbuf *tbuf, struct cset_converter cvt,
				1263	source_range char_range,
				1264	cpp_string_location_reader *loc_reader,
				1265	cpp_substring_ranges *ranges)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1266	{
				1267	cppchar_t c, n = 0, overflow = 0;
				1268	int digits_found = 0;
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1269	size_t width = cvt.width;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1270	size_t mask = width_to_mask (width);
				1271
David Malcolm	88fa555	2016-08-05 18:08:33 +0000	[diff] [blame]	1272	/* loc_reader and ranges must either be both NULL, or both be non-NULL. */
				1273	gcc_assert ((loc_reader != NULL) == (ranges != NULL));
				1274
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1275	if (CPP_WTRADITIONAL (pfile))
Simon Baldwin	87cf065	2010-04-07 17:18:10 +0000	[diff] [blame]	1276	cpp_warning (pfile, CPP_W_TRADITIONAL,
				1277	"the meaning of '\\x' is different in traditional C");
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1278
David Malcolm	88fa555	2016-08-05 18:08:33 +0000	[diff] [blame]	1279	/* Skip 'x'. */
				1280	from++;
				1281
				1282	/* The 'x' is part of the spelling of this character. */
				1283	if (loc_reader)
				1284	char_range.m_finish = loc_reader->get_next ().m_finish;
				1285
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1286	while (from < limit)
				1287	{
				1288	c = *from;
				1289	if (! hex_p (c))
				1290	break;
				1291	from++;
David Malcolm	88fa555	2016-08-05 18:08:33 +0000	[diff] [blame]	1292	if (loc_reader)
				1293	char_range.m_finish = loc_reader->get_next ().m_finish;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1294	overflow \|= n ^ (n << 4 >> 4);
				1295	n = (n << 4) + hex_value (c);
				1296	digits_found = 1;
				1297	}
				1298
				1299	if (!digits_found)
				1300	{
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1301	cpp_error (pfile, CPP_DL_ERROR,
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1302	"\\x used with no following hex digits");
				1303	return from;
				1304	}
				1305
				1306	if (overflow \| (n != (n & mask)))
				1307	{
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1308	cpp_error (pfile, CPP_DL_PEDWARN,
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1309	"hex escape sequence out of range");
				1310	n &= mask;
				1311	}
				1312
David Malcolm	88fa555	2016-08-05 18:08:33 +0000	[diff] [blame]	1313	if (tbuf)
				1314	emit_numeric_escape (pfile, n, tbuf, cvt);
				1315	if (ranges)
				1316	ranges->add_range (char_range);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1317
				1318	return from;
				1319	}
				1320
				1321	/* Convert an octal escape, pointed to by FROM, to the execution
				1322	character set and write it into the string buffer TBUF. Returns an
				1323	advanced pointer, and issues diagnostics as necessary.
				1324	No character set translation occurs; this routine always produces the
				1325	execution-set character with numeric value equal to the given octal
David Malcolm	88fa555	2016-08-05 18:08:33 +0000	[diff] [blame]	1326	number.
				1327	If LOC_READER is non-NULL, then RANGES must be non-NULL and CHAR_RANGE
				1328	contains the location of the character so far: location information
				1329	is read from LOC_READER, and RANGES is updated accordingly. */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1330	static const uchar *
				1331	convert_oct (cpp_reader pfile, const uchar from, const uchar *limit,
David Malcolm	88fa555	2016-08-05 18:08:33 +0000	[diff] [blame]	1332	struct _cpp_strbuf *tbuf, struct cset_converter cvt,
				1333	source_range char_range,
				1334	cpp_string_location_reader *loc_reader,
				1335	cpp_substring_ranges *ranges)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1336	{
				1337	size_t count = 0;
				1338	cppchar_t c, n = 0;
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1339	size_t width = cvt.width;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1340	size_t mask = width_to_mask (width);
				1341	bool overflow = false;
				1342
David Malcolm	88fa555	2016-08-05 18:08:33 +0000	[diff] [blame]	1343	/* loc_reader and ranges must either be both NULL, or both be non-NULL. */
				1344	gcc_assert ((loc_reader != NULL) == (ranges != NULL));
				1345
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1346	while (from < limit && count++ < 3)
				1347	{
				1348	c = *from;
				1349	if (c < '0' \|\| c > '7')
				1350	break;
				1351	from++;
David Malcolm	88fa555	2016-08-05 18:08:33 +0000	[diff] [blame]	1352	if (loc_reader)
				1353	char_range.m_finish = loc_reader->get_next ().m_finish;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1354	overflow \|= n ^ (n << 3 >> 3);
				1355	n = (n << 3) + c - '0';
				1356	}
				1357
				1358	if (n != (n & mask))
				1359	{
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1360	cpp_error (pfile, CPP_DL_PEDWARN,
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1361	"octal escape sequence out of range");
				1362	n &= mask;
				1363	}
				1364
David Malcolm	88fa555	2016-08-05 18:08:33 +0000	[diff] [blame]	1365	if (tbuf)
				1366	emit_numeric_escape (pfile, n, tbuf, cvt);
				1367	if (ranges)
				1368	ranges->add_range (char_range);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1369
				1370	return from;
				1371	}
				1372
				1373	/* Convert an escape sequence (pointed to by FROM) to its value on
				1374	the target, and to the execution character set. Do not scan past
David Malcolm	88fa555	2016-08-05 18:08:33 +0000	[diff] [blame]	1375	LIMIT. Write the converted value into TBUF, if TBUF is non-NULL.
				1376	Returns an advanced pointer. Handles all relevant diagnostics.
				1377	If LOC_READER is non-NULL, then RANGES must be non-NULL: location
				1378	information is read from LOC_READER, and RANGES is updated
				1379	accordingly. */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1380	static const uchar *
				1381	convert_escape (cpp_reader pfile, const uchar from, const uchar *limit,
David Malcolm	88fa555	2016-08-05 18:08:33 +0000	[diff] [blame]	1382	struct _cpp_strbuf *tbuf, struct cset_converter cvt,
				1383	cpp_string_location_reader *loc_reader,
				1384	cpp_substring_ranges *ranges)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1385	{
				1386	/* Values of \a \b \e \f \n \r \t \v respectively. */
				1387	#if HOST_CHARSET == HOST_CHARSET_ASCII
				1388	static const uchar charconsts[] = { 7, 8, 27, 12, 10, 13, 9, 11 };
				1389	#elif HOST_CHARSET == HOST_CHARSET_EBCDIC
				1390	static const uchar charconsts[] = { 47, 22, 39, 12, 21, 13, 5, 11 };
				1391	#else
				1392	#error "unknown host character set"
				1393	#endif
				1394
				1395	uchar c;
				1396
David Malcolm	88fa555	2016-08-05 18:08:33 +0000	[diff] [blame]	1397	/* Record the location of the backslash. */
				1398	source_range char_range;
				1399	if (loc_reader)
				1400	char_range = loc_reader->get_next ();
				1401
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1402	c = *from;
				1403	switch (c)
				1404	{
				1405	/* UCNs, hex escapes, and octal escapes are processed separately. */
				1406	case 'u': case 'U':
David Malcolm	88fa555	2016-08-05 18:08:33 +0000	[diff] [blame]	1407	return convert_ucn (pfile, from, limit, tbuf, cvt,
				1408	char_range, loc_reader, ranges);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1409
				1410	case 'x':
David Malcolm	88fa555	2016-08-05 18:08:33 +0000	[diff] [blame]	1411	return convert_hex (pfile, from, limit, tbuf, cvt,
				1412	char_range, loc_reader, ranges);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1413	break;
				1414
				1415	case '0': case '1': case '2': case '3':
				1416	case '4': case '5': case '6': case '7':
David Malcolm	88fa555	2016-08-05 18:08:33 +0000	[diff] [blame]	1417	return convert_oct (pfile, from, limit, tbuf, cvt,
				1418	char_range, loc_reader, ranges);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1419
				1420	/* Various letter escapes. Get the appropriate host-charset
				1421	value into C. */
				1422	case '\\': case '\'': case '"': case '?': break;
				1423
				1424	case '(': case '{': case '[': case '%':
				1425	/* '\(', etc, can be used at the beginning of a line in a long
				1426	string split onto multiple lines with \-newline, to prevent
				1427	Emacs or other text editors from getting confused. '\%' can
				1428	be used to prevent SCCS from mangling printf format strings. */
				1429	if (CPP_PEDANTIC (pfile))
				1430	goto unknown;
				1431	break;
				1432
				1433	case 'b': c = charconsts[1]; break;
				1434	case 'f': c = charconsts[3]; break;
				1435	case 'n': c = charconsts[4]; break;
				1436	case 'r': c = charconsts[5]; break;
				1437	case 't': c = charconsts[6]; break;
				1438	case 'v': c = charconsts[7]; break;
				1439
				1440	case 'a':
				1441	if (CPP_WTRADITIONAL (pfile))
Simon Baldwin	87cf065	2010-04-07 17:18:10 +0000	[diff] [blame]	1442	cpp_warning (pfile, CPP_W_TRADITIONAL,
				1443	"the meaning of '\\a' is different in traditional C");
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1444	c = charconsts[0];
				1445	break;
				1446
				1447	case 'e': case 'E':
				1448	if (CPP_PEDANTIC (pfile))
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1449	cpp_error (pfile, CPP_DL_PEDWARN,
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1450	"non-ISO-standard escape sequence, '\\%c'", (int) c);
				1451	c = charconsts[2];
				1452	break;
				1453
				1454	default:
				1455	unknown:
				1456	if (ISGRAPH (c))
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1457	cpp_error (pfile, CPP_DL_PEDWARN,
Tom Tromey	709a22d	2009-08-17 17:34:53 +0000	[diff] [blame]	1458	"unknown escape sequence: '\\%c'", (int) c);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1459	else
Joseph Myers	178b58b	2005-11-03 23:08:18 +0000	[diff] [blame]	1460	{
				1461	/* diagnostic.c does not support "%03o". When it does, this
				1462	code can use %03o directly in the diagnostic again. */
				1463	char buf[32];
				1464	sprintf(buf, "%03o", (int) c);
				1465	cpp_error (pfile, CPP_DL_PEDWARN,
				1466	"unknown escape sequence: '\\%s'", buf);
				1467	}
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1468	}
				1469
David Malcolm	88fa555	2016-08-05 18:08:33 +0000	[diff] [blame]	1470	if (tbuf)
				1471	/* Now convert what we have to the execution character set. */
				1472	if (!APPLY_CONVERSION (cvt, &c, 1, tbuf))
				1473	cpp_errno (pfile, CPP_DL_ERROR,
				1474	"converting escape sequence to execution character set");
				1475
				1476	if (loc_reader)
				1477	{
				1478	char_range.m_finish = loc_reader->get_next ().m_finish;
				1479	ranges->add_range (char_range);
				1480	}
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1481
				1482	return from + 1;
				1483	}
				1484
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1485	/* TYPE is a token type. The return value is the conversion needed to
				1486	convert from source to execution character set for the given type. */
				1487	static struct cset_converter
				1488	converter_for_type (cpp_reader *pfile, enum cpp_ttype type)
				1489	{
				1490	switch (type)
				1491	{
				1492	default:
				1493	return pfile->narrow_cset_desc;
Edward Smith-Rowland	fe95b03	2015-06-30 12:58:48 +0000	[diff] [blame]	1494	case CPP_UTF8CHAR:
Jakub Jelinek	2c6e3f5	2009-10-19 23:41:15 +0200	[diff] [blame]	1495	case CPP_UTF8STRING:
				1496	return pfile->utf8_cset_desc;
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1497	case CPP_CHAR16:
				1498	case CPP_STRING16:
				1499	return pfile->char16_cset_desc;
				1500	case CPP_CHAR32:
				1501	case CPP_STRING32:
				1502	return pfile->char32_cset_desc;
				1503	case CPP_WCHAR:
				1504	case CPP_WSTRING:
				1505	return pfile->wide_cset_desc;
				1506	}
				1507	}
				1508
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1509	/* FROM is an array of cpp_string structures of length COUNT. These
				1510	are to be converted from the source to the execution character set,
				1511	escape sequences translated, and finally all are to be
				1512	concatenated. WIDE indicates whether or not to produce a wide
David Malcolm	88fa555	2016-08-05 18:08:33 +0000	[diff] [blame]	1513	string. If TO is non-NULL, the result is written into TO.
				1514	If LOC_READERS and OUT are non-NULL, then location information
				1515	is read from LOC_READERS (which must be an array of length COUNT),
				1516	and location information is written to *RANGES.
				1517
				1518	Returns true for success, false for failure. */
				1519
				1520	static bool
				1521	cpp_interpret_string_1 (cpp_reader pfile, const cpp_string from, size_t count,
				1522	cpp_string *to, enum cpp_ttype type,
				1523	cpp_string_location_reader *loc_readers,
				1524	cpp_substring_ranges *out)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1525	{
Nathanael Nerode	a801686	2003-09-26 05:52:43 +0000	[diff] [blame]	1526	struct _cpp_strbuf tbuf;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1527	const uchar p, base, *limit;
				1528	size_t i;
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1529	struct cset_converter cvt = converter_for_type (pfile, type);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1530
David Malcolm	88fa555	2016-08-05 18:08:33 +0000	[diff] [blame]	1531	/* loc_readers and out must either be both NULL, or both be non-NULL. */
				1532	gcc_assert ((loc_readers != NULL) == (out != NULL));
				1533
				1534	if (to)
				1535	{
				1536	tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, from->len);
				1537	tbuf.text = XNEWVEC (uchar, tbuf.asize);
				1538	tbuf.len = 0;
				1539	}
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1540
David Malcolm	bbd6fcf	2016-09-23 14:14:52 +0000	[diff] [blame]	1541	cpp_string_location_reader *loc_reader = NULL;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1542	for (i = 0; i < count; i++)
				1543	{
David Malcolm	88fa555	2016-08-05 18:08:33 +0000	[diff] [blame]	1544	if (loc_readers)
				1545	loc_reader = &loc_readers[i];
				1546
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1547	p = from[i].text;
Jakub Jelinek	2c6e3f5	2009-10-19 23:41:15 +0200	[diff] [blame]	1548	if (*p == 'u')
				1549	{
David Malcolm	88fa555	2016-08-05 18:08:33 +0000	[diff] [blame]	1550	p++;
				1551	if (loc_reader)
				1552	loc_reader->get_next ();
				1553	if (*p == '8')
				1554	{
				1555	p++;
				1556	if (loc_reader)
				1557	loc_reader->get_next ();
				1558	}
Jakub Jelinek	2c6e3f5	2009-10-19 23:41:15 +0200	[diff] [blame]	1559	}
				1560	else if (p == 'L' \|\| p == 'U') p++;
				1561	if (*p == 'R')
				1562	{
				1563	const uchar *prefix;
				1564
				1565	/* Skip over 'R"'. */
				1566	p += 2;
David Malcolm	b8f5641	2016-11-17 15:55:26 +0000	[diff] [blame]	1567	if (loc_reader)
				1568	{
				1569	loc_reader->get_next ();
				1570	loc_reader->get_next ();
				1571	}
Jakub Jelinek	2c6e3f5	2009-10-19 23:41:15 +0200	[diff] [blame]	1572	prefix = p;
Jason Merrill	5215062	2010-03-29 11:00:43 -0400	[diff] [blame]	1573	while (*p != '(')
David Malcolm	b8f5641	2016-11-17 15:55:26 +0000	[diff] [blame]	1574	{
				1575	p++;
				1576	if (loc_reader)
				1577	loc_reader->get_next ();
				1578	}
Jakub Jelinek	2c6e3f5	2009-10-19 23:41:15 +0200	[diff] [blame]	1579	p++;
David Malcolm	b8f5641	2016-11-17 15:55:26 +0000	[diff] [blame]	1580	if (loc_reader)
				1581	loc_reader->get_next ();
Jakub Jelinek	2c6e3f5	2009-10-19 23:41:15 +0200	[diff] [blame]	1582	limit = from[i].text + from[i].len;
				1583	if (limit >= p + (p - prefix) + 1)
				1584	limit -= (p - prefix) + 1;
				1585
Jason Merrill	00a81b8	2010-03-29 16:07:29 -0400	[diff] [blame]	1586	/* Raw strings are all normal characters; these can be fed
				1587	directly to convert_cset. */
David Malcolm	88fa555	2016-08-05 18:08:33 +0000	[diff] [blame]	1588	if (to)
				1589	if (!APPLY_CONVERSION (cvt, p, limit - p, &tbuf))
				1590	goto fail;
				1591
				1592	if (loc_reader)
				1593	{
				1594	/* If generating source ranges, assume we have a 1:1
				1595	correspondence between bytes in the source encoding and bytes
				1596	in the execution encoding (e.g. if we have a UTF-8 to UTF-8
				1597	conversion), so that this run of bytes in the source file
				1598	corresponds to a run of bytes in the execution string.
				1599	This requirement is guaranteed by an early-reject in
				1600	cpp_interpret_string_ranges. */
				1601	gcc_assert (cvt.func == convert_no_conversion);
				1602	out->add_n_ranges (limit - p, *loc_reader);
				1603	}
Jakub Jelinek	2c6e3f5	2009-10-19 23:41:15 +0200	[diff] [blame]	1604
				1605	continue;
				1606	}
				1607
David Malcolm	88fa555	2016-08-05 18:08:33 +0000	[diff] [blame]	1608	/* If we don't now have a leading quote, something has gone wrong.
				1609	This can occur if cpp_interpret_string_ranges is handling a
				1610	stringified macro argument, but should not be possible otherwise. */
				1611	if (p != '"' && p != '\'')
				1612	{
				1613	gcc_assert (out != NULL);
				1614	cpp_error (pfile, CPP_DL_ERROR, "missing open quote");
				1615	if (to)
				1616	free (tbuf.text);
				1617	return false;
				1618	}
				1619
				1620	/* Skip leading quote. */
				1621	p++;
				1622	if (loc_reader)
				1623	loc_reader->get_next ();
				1624
Kazu Hirata	e0a21ab	2004-01-16 01:44:06 +0000	[diff] [blame]	1625	limit = from[i].text + from[i].len - 1; /* Skip trailing quote. */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1626
				1627	for (;;)
				1628	{
				1629	base = p;
				1630	while (p < limit && *p != '\\')
				1631	p++;
				1632	if (p > base)
				1633	{
				1634	/* We have a run of normal characters; these can be fed
				1635	directly to convert_cset. */
David Malcolm	88fa555	2016-08-05 18:08:33 +0000	[diff] [blame]	1636	if (to)
				1637	if (!APPLY_CONVERSION (cvt, base, p - base, &tbuf))
				1638	goto fail;
				1639	/* Similar to above: assumes we have a 1:1 correspondence
				1640	between bytes in the source encoding and bytes in the
				1641	execution encoding. */
				1642	if (loc_reader)
				1643	{
				1644	gcc_assert (cvt.func == convert_no_conversion);
				1645	out->add_n_ranges (p - base, *loc_reader);
				1646	}
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1647	}
David Malcolm	88fa555	2016-08-05 18:08:33 +0000	[diff] [blame]	1648	if (p >= limit)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1649	break;
				1650
David Malcolm	88fa555	2016-08-05 18:08:33 +0000	[diff] [blame]	1651	struct _cpp_strbuf *tbuf_ptr = to ? &tbuf : NULL;
				1652	p = convert_escape (pfile, p + 1, limit, tbuf_ptr, cvt,
				1653	loc_reader, out);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1654	}
				1655	}
David Malcolm	88fa555	2016-08-05 18:08:33 +0000	[diff] [blame]	1656
				1657	if (to)
				1658	{
				1659	/* NUL-terminate the 'to' buffer and translate it to a cpp_string
				1660	structure. */
				1661	emit_numeric_escape (pfile, 0, &tbuf, cvt);
				1662	tbuf.text = XRESIZEVEC (uchar, tbuf.text, tbuf.len);
				1663	to->text = tbuf.text;
				1664	to->len = tbuf.len;
				1665	}
David Malcolm	bbd6fcf	2016-09-23 14:14:52 +0000	[diff] [blame]	1666	/* Use the location of the trailing quote as the location of the
				1667	NUL-terminator. */
				1668	if (loc_reader)
				1669	{
				1670	source_range range = loc_reader->get_next ();
				1671	out->add_range (range);
				1672	}
David Malcolm	88fa555	2016-08-05 18:08:33 +0000	[diff] [blame]	1673
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1674	return true;
				1675
				1676	fail:
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1677	cpp_errno (pfile, CPP_DL_ERROR, "converting to execution character set");
David Malcolm	88fa555	2016-08-05 18:08:33 +0000	[diff] [blame]	1678	if (to)
				1679	free (tbuf.text);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1680	return false;
				1681	}
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1682
David Malcolm	88fa555	2016-08-05 18:08:33 +0000	[diff] [blame]	1683	/* FROM is an array of cpp_string structures of length COUNT. These
				1684	are to be converted from the source to the execution character set,
				1685	escape sequences translated, and finally all are to be
				1686	concatenated. WIDE indicates whether or not to produce a wide
				1687	string. The result is written into TO. Returns true for success,
				1688	false for failure. */
				1689	bool
				1690	cpp_interpret_string (cpp_reader pfile, const cpp_string from, size_t count,
				1691	cpp_string *to, enum cpp_ttype type)
				1692	{
				1693	return cpp_interpret_string_1 (pfile, from, count, to, type, NULL, NULL);
				1694	}
				1695
				1696	/* A "do nothing" error-handling callback for use by
				1697	cpp_interpret_string_ranges, so that it can temporarily suppress
				1698	error-handling. */
				1699
				1700	static bool
				1701	noop_error_cb (cpp_reader , int, int, rich_location ,
				1702	const char , va_list )
				1703	{
				1704	/* no-op. */
				1705	return true;
				1706	}
				1707
				1708	/* This function mimics the behavior of cpp_interpret_string, but
				1709	rather than generating a string in the execution character set,
				1710	*OUT is written to with the source code ranges of the characters
				1711	in such a string.
				1712	FROM and LOC_READERS should both be arrays of length COUNT.
				1713	Returns NULL for success, or an error message for failure. */
				1714
				1715	const char *
				1716	cpp_interpret_string_ranges (cpp_reader pfile, const cpp_string from,
				1717	cpp_string_location_reader *loc_readers,
				1718	size_t count,
				1719	cpp_substring_ranges *out,
				1720	enum cpp_ttype type)
				1721	{
				1722	/* There are a couple of cases in the range-handling in
				1723	cpp_interpret_string_1 that rely on there being a 1:1 correspondence
				1724	between bytes in the source encoding and bytes in the execution
				1725	encoding, so that each byte in the execution string can correspond
				1726	to the location of a byte in the source string.
				1727
				1728	This holds for the typical case of a UTF-8 to UTF-8 conversion.
				1729	Enforce this requirement by only attempting to track substring
				1730	locations if we have source encoding == execution encoding.
				1731
				1732	This is a stronger condition than we need, since we could e.g.
				1733	have ASCII to EBCDIC (with 1 byte per character before and after),
				1734	but it seems to be a reasonable restriction. */
				1735	struct cset_converter cvt = converter_for_type (pfile, type);
				1736	if (cvt.func != convert_no_conversion)
				1737	return "execution character set != source character set";
				1738
				1739	/* For on-demand strings we have already lexed the strings, so there
				1740	should be no errors. However, if we have bogus source location
				1741	data (or stringified macro arguments), the attempt to lex the
				1742	strings could fail with an error. Temporarily install an
				1743	error-handler to catch the error, so that it can lead to this call
				1744	failing, rather than being emitted as a user-visible diagnostic.
				1745	If an error does occur, we should see it via the return value of
				1746	cpp_interpret_string_1. */
				1747	bool (saved_error_handler) (cpp_reader , int, int, rich_location *,
				1748	const char , va_list )
				1749	ATTRIBUTE_FPTR_PRINTF(5,0);
				1750
				1751	saved_error_handler = pfile->cb.error;
				1752	pfile->cb.error = noop_error_cb;
				1753
				1754	bool result = cpp_interpret_string_1 (pfile, from, count, NULL, type,
				1755	loc_readers, out);
				1756
				1757	/* Restore the saved error-handler. */
				1758	pfile->cb.error = saved_error_handler;
				1759
				1760	if (!result)
				1761	return "cpp_interpret_string_1 failed";
				1762
				1763	/* Success. */
				1764	return NULL;
				1765	}
				1766
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1767	/* Subroutine of do_line and do_linemarker. Convert escape sequences
				1768	in a string, but do not perform character set conversion. */
				1769	bool
Eric Christopher	423e95e	2004-02-12 02:25:03 +0000	[diff] [blame]	1770	cpp_interpret_string_notranslate (cpp_reader pfile, const cpp_string from,
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1771	size_t count, cpp_string *to,
				1772	enum cpp_ttype type ATTRIBUTE_UNUSED)
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1773	{
				1774	struct cset_converter save_narrow_cset_desc = pfile->narrow_cset_desc;
				1775	bool retval;
				1776
				1777	pfile->narrow_cset_desc.func = convert_no_conversion;
				1778	pfile->narrow_cset_desc.cd = (iconv_t) -1;
H.J. Lu	0b7c73c	2008-06-12 17:03:41 +0000	[diff] [blame]	1779	pfile->narrow_cset_desc.width = CPP_OPTION (pfile, char_precision);
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1780
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1781	retval = cpp_interpret_string (pfile, from, count, to, CPP_STRING);
Zack Weinberg	6b88314	2003-07-10 23:16:31 +0000	[diff] [blame]	1782
				1783	pfile->narrow_cset_desc = save_narrow_cset_desc;
				1784	return retval;
				1785	}
				1786
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1787
				1788	/* Subroutine of cpp_interpret_charconst which performs the conversion
				1789	to a number, for narrow strings. STR is the string structure returned
				1790	by cpp_interpret_string. PCHARS_SEEN and UNSIGNEDP are as for
				1791	cpp_interpret_charconst. */
				1792	static cppchar_t
				1793	narrow_str_to_charconst (cpp_reader *pfile, cpp_string str,
				1794	unsigned int pchars_seen, int unsignedp)
				1795	{
				1796	size_t width = CPP_OPTION (pfile, char_precision);
				1797	size_t max_chars = CPP_OPTION (pfile, int_precision) / width;
				1798	size_t mask = width_to_mask (width);
				1799	size_t i;
				1800	cppchar_t result, c;
				1801	bool unsigned_p;
				1802
				1803	/* The value of a multi-character character constant, or a
				1804	single-character character constant whose representation in the
				1805	execution character set is more than one byte long, is
				1806	implementation defined. This implementation defines it to be the
				1807	number formed by interpreting the byte sequence in memory as a
				1808	big-endian binary number. If overflow occurs, the high bytes are
				1809	lost, and a warning is issued.
				1810
				1811	We don't want to process the NUL terminator handed back by
				1812	cpp_interpret_string. */
				1813	result = 0;
				1814	for (i = 0; i < str.len - 1; i++)
				1815	{
				1816	c = str.text[i] & mask;
				1817	if (width < BITS_PER_CPPCHAR_T)
				1818	result = (result << width) \| c;
				1819	else
				1820	result = c;
				1821	}
				1822
				1823	if (i > max_chars)
				1824	{
				1825	i = max_chars;
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1826	cpp_error (pfile, CPP_DL_WARNING,
				1827	"character constant too long for its type");
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1828	}
				1829	else if (i > 1 && CPP_OPTION (pfile, warn_multichar))
Simon Baldwin	87cf065	2010-04-07 17:18:10 +0000	[diff] [blame]	1830	cpp_warning (pfile, CPP_W_MULTICHAR, "multi-character character constant");
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1831
				1832	/* Multichar constants are of type int and therefore signed. */
				1833	if (i > 1)
				1834	unsigned_p = 0;
				1835	else
				1836	unsigned_p = CPP_OPTION (pfile, unsigned_char);
				1837
				1838	/* Truncate the constant to its natural width, and simultaneously
				1839	sign- or zero-extend to the full width of cppchar_t.
				1840	For single-character constants, the value is WIDTH bits wide.
				1841	For multi-character constants, the value is INT_PRECISION bits wide. */
				1842	if (i > 1)
				1843	width = CPP_OPTION (pfile, int_precision);
				1844	if (width < BITS_PER_CPPCHAR_T)
				1845	{
				1846	mask = ((cppchar_t) 1 << width) - 1;
				1847	if (unsigned_p \|\| !(result & (1 << (width - 1))))
				1848	result &= mask;
				1849	else
				1850	result \|= ~mask;
				1851	}
				1852	*pchars_seen = i;
				1853	*unsignedp = unsigned_p;
				1854	return result;
				1855	}
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	1856
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1857	/* Subroutine of cpp_interpret_charconst which performs the conversion
				1858	to a number, for wide strings. STR is the string structure returned
				1859	by cpp_interpret_string. PCHARS_SEEN and UNSIGNEDP are as for
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1860	cpp_interpret_charconst. TYPE is the token type. */
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1861	static cppchar_t
				1862	wide_str_to_charconst (cpp_reader *pfile, cpp_string str,
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1863	unsigned int pchars_seen, int unsignedp,
				1864	enum cpp_ttype type)
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1865	{
				1866	bool bigend = CPP_OPTION (pfile, bytes_big_endian);
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1867	size_t width = converter_for_type (pfile, type).width;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1868	size_t cwidth = CPP_OPTION (pfile, char_precision);
				1869	size_t mask = width_to_mask (width);
				1870	size_t cmask = width_to_mask (cwidth);
				1871	size_t nbwc = width / cwidth;
				1872	size_t off, i;
				1873	cppchar_t result = 0, c;
				1874
				1875	/* This is finicky because the string is in the target's byte order,
				1876	which may not be our byte order. Only the last character, ignoring
				1877	the NUL terminator, is relevant. */
				1878	off = str.len - (nbwc * 2);
				1879	result = 0;
				1880	for (i = 0; i < nbwc; i++)
				1881	{
				1882	c = bigend ? str.text[off + i] : str.text[off + nbwc - i - 1];
				1883	result = (result << cwidth) \| (c & cmask);
				1884	}
				1885
				1886	/* Wide character constants have type wchar_t, and a single
				1887	character exactly fills a wchar_t, so a multi-character wide
				1888	character constant is guaranteed to overflow. */
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1889	if (str.len > nbwc * 2)
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1890	cpp_error (pfile, CPP_DL_WARNING,
				1891	"character constant too long for its type");
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1892
				1893	/* Truncate the constant to its natural width, and simultaneously
				1894	sign- or zero-extend to the full width of cppchar_t. */
				1895	if (width < BITS_PER_CPPCHAR_T)
				1896	{
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1897	if (type == CPP_CHAR16 \|\| type == CPP_CHAR32
				1898	\|\| CPP_OPTION (pfile, unsigned_wchar)
				1899	\|\| !(result & (1 << (width - 1))))
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1900	result &= mask;
				1901	else
				1902	result \|= ~mask;
				1903	}
				1904
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1905	if (type == CPP_CHAR16 \|\| type == CPP_CHAR32
				1906	\|\| CPP_OPTION (pfile, unsigned_wchar))
				1907	*unsignedp = 1;
				1908	else
				1909	*unsignedp = 0;
				1910
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1911	*pchars_seen = 1;
				1912	return result;
				1913	}
				1914
				1915	/* Interpret a (possibly wide) character constant in TOKEN.
				1916	PCHARS_SEEN points to a variable that is filled in with the number
				1917	of characters seen, and UNSIGNEDP to a variable that indicates
				1918	whether the result has signed type. */
				1919	cppchar_t
				1920	cpp_interpret_charconst (cpp_reader pfile, const cpp_token token,
				1921	unsigned int pchars_seen, int unsignedp)
				1922	{
				1923	cpp_string str = { 0, 0 };
Edward Smith-Rowland	fe95b03	2015-06-30 12:58:48 +0000	[diff] [blame]	1924	bool wide = (token->type != CPP_CHAR && token->type != CPP_UTF8CHAR);
				1925	int u8 = 2 * int(token->type == CPP_UTF8CHAR);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1926	cppchar_t result;
				1927
Edward Smith-Rowland	fe95b03	2015-06-30 12:58:48 +0000	[diff] [blame]	1928	/* An empty constant will appear as L'', u'', U'', u8'', or '' */
				1929	if (token->val.str.len == (size_t) (2 + wide + u8))
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1930	{
John David Anglin	0527bc4	2003-11-01 22:56:54 +0000	[diff] [blame]	1931	cpp_error (pfile, CPP_DL_ERROR, "empty character constant");
Jakub Jelinek	b5c1c98	2016-02-05 20:39:48 +0100	[diff] [blame]	1932	*pchars_seen = 0;
				1933	*unsignedp = 0;
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1934	return 0;
				1935	}
Jakub Jelinek	b5c1c98	2016-02-05 20:39:48 +0100	[diff] [blame]	1936	else if (!cpp_interpret_string (pfile, &token->val.str, 1, &str,
				1937	token->type))
				1938	{
				1939	*pchars_seen = 0;
				1940	*unsignedp = 0;
				1941	return 0;
				1942	}
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1943
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1944	if (wide)
Kris Van Hees	b6baa67	2008-04-18 13:58:08 +0000	[diff] [blame]	1945	result = wide_str_to_charconst (pfile, str, pchars_seen, unsignedp,
				1946	token->type);
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1947	else
				1948	result = narrow_str_to_charconst (pfile, str, pchars_seen, unsignedp);
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1949
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1950	if (str.text != token->val.str.text)
				1951	free ((void *)str.text);
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1952
Zack Weinberg	e6cc3a2	2003-07-05 00:24:00 +0000	[diff] [blame]	1953	return result;
Neil Booth	1613e52	2003-04-20 07:29:23 +0000	[diff] [blame]	1954	}
Geoffrey Keating	47e2049	2005-03-12 10:44:06 +0000	[diff] [blame]	1955
				1956	/* Convert an identifier denoted by ID and LEN, which might contain
				1957	UCN escapes, to the source character set, either UTF-8 or
				1958	UTF-EBCDIC. Assumes that the identifier is actually a valid identifier. */
				1959	cpp_hashnode *
				1960	_cpp_interpret_identifier (cpp_reader pfile, const uchar id, size_t len)
				1961	{
				1962	/* It turns out that a UCN escape always turns into fewer characters
				1963	than the escape itself, so we can allocate a temporary in advance. */
Gabriel Dos Reis	c3f829c	2005-05-28 15:52:48 +0000	[diff] [blame]	1964	uchar * buf = (uchar *) alloca (len + 1);
Geoffrey Keating	47e2049	2005-03-12 10:44:06 +0000	[diff] [blame]	1965	uchar * bufp = buf;
				1966	size_t idp;
				1967
				1968	for (idp = 0; idp < len; idp++)
				1969	if (id[idp] != '\\')
				1970	*bufp++ = id[idp];
				1971	else
				1972	{
				1973	unsigned length = id[idp+1] == 'u' ? 4 : 8;
				1974	cppchar_t value = 0;
				1975	size_t bufleft = len - (bufp - buf);
				1976	int rval;
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	1977
Geoffrey Keating	47e2049	2005-03-12 10:44:06 +0000	[diff] [blame]	1978	idp += 2;
				1979	while (length && idp < len && ISXDIGIT (id[idp]))
				1980	{
				1981	value = (value << 4) + hex_value (id[idp]);
				1982	idp++;
				1983	length--;
				1984	}
				1985	idp--;
				1986
				1987	/* Special case for EBCDIC: if the identifier contains
				1988	a '$' specified using a UCN, translate it to EBCDIC. */
				1989	if (value == 0x24)
				1990	{
				1991	*bufp++ = '$';
				1992	continue;
				1993	}
				1994
				1995	rval = one_cppchar_to_utf8 (value, &bufp, &bufleft);
				1996	if (rval)
				1997	{
				1998	errno = rval;
				1999	cpp_errno (pfile, CPP_DL_ERROR,
				2000	"converting UCN to source character set");
				2001	break;
				2002	}
				2003	}
				2004
				2005	return CPP_HASHNODE (ht_lookup (pfile->hash_table,
				2006	buf, bufp - buf, HT_ALLOC));
				2007	}
				2008
Zack Weinberg	a29f62d	2004-09-18 00:56:19 +0000	[diff] [blame]	2009	/* Convert an input buffer (containing the complete contents of one
				2010	source file) from INPUT_CHARSET to the source character set. INPUT
				2011	points to the input buffer, SIZE is its allocated size, and LEN is
				2012	the length of the meaningful data within the buffer. The
Tom Tromey	688e7a5	2008-04-21 14:02:00 +0000	[diff] [blame]	2013	translated buffer is returned, *ST_SIZE is set to the length of
				2014	the meaningful data within the translated buffer, and *BUFFER_START
				2015	is set to the start of the returned buffer. *BUFFER_START may
				2016	differ from the return value in the case of a BOM or other ignored
				2017	marker information.
Zack Weinberg	a29f62d	2004-09-18 00:56:19 +0000	[diff] [blame]	2018
Tom Tromey	688e7a5	2008-04-21 14:02:00 +0000	[diff] [blame]	2019	INPUT is expected to have been allocated with xmalloc. This
				2020	function will either set *BUFFER_START to INPUT, or free it and set
				2021	*BUFFER_START to a pointer to another xmalloc-allocated block of
				2022	memory. */
Zack Weinberg	a29f62d	2004-09-18 00:56:19 +0000	[diff] [blame]	2023	uchar *
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	2024	_cpp_convert_input (cpp_reader pfile, const char input_charset,
Tom Tromey	688e7a5	2008-04-21 14:02:00 +0000	[diff] [blame]	2025	uchar *input, size_t size, size_t len,
				2026	const unsigned char *buffer_start, off_t st_size)
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	2027	{
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	2028	struct cset_converter input_cset;
				2029	struct _cpp_strbuf to;
Tom Tromey	688e7a5	2008-04-21 14:02:00 +0000	[diff] [blame]	2030	unsigned char *buffer;
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	2031
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	2032	input_cset = init_iconv_desc (pfile, SOURCE_CHARSET, input_charset);
				2033	if (input_cset.func == convert_no_conversion)
				2034	{
				2035	to.text = input;
				2036	to.asize = size;
				2037	to.len = len;
				2038	}
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	2039	else
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	2040	{
				2041	to.asize = MAX (65536, len);
Gabriel Dos Reis	c3f829c	2005-05-28 15:52:48 +0000	[diff] [blame]	2042	to.text = XNEWVEC (uchar, to.asize);
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	2043	to.len = 0;
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	2044
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	2045	if (!APPLY_CONVERSION (input_cset, input, len, &to))
				2046	cpp_error (pfile, CPP_DL_ERROR,
				2047	"failure to convert %s to %s",
				2048	CPP_OPTION (pfile, input_charset), SOURCE_CHARSET);
				2049
				2050	free (input);
				2051	}
				2052
				2053	/* Clean up the mess. */
				2054	if (input_cset.func == convert_using_iconv)
				2055	iconv_close (input_cset.cd);
				2056
				2057	/* Resize buffer if we allocated substantially too much, or if we
Jakub Jelinek	f41e5bd	2012-12-03 18:19:47 +0100	[diff] [blame]	2058	haven't enough space for the \n-terminator or following
				2059	15 bytes of padding (used to quiet warnings from valgrind or
				2060	Address Sanitizer, when the optimized lexer accesses aligned
				2061	16-byte memory chunks, including the bytes after the malloced,
				2062	area, and stops lexing on '\n'). */
				2063	if (to.len + 4096 < to.asize \|\| to.len + 16 > to.asize)
				2064	to.text = XRESIZEVEC (uchar, to.text, to.len + 16);
				2065
				2066	memset (to.text + to.len, '\0', 16);
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	2067
Devang Patel	04c90ee	2005-02-19 11:48:02 -0800	[diff] [blame]	2068	/* If the file is using old-school Mac line endings (\r only),
				2069	terminate with another \r, not an \n, so that we do not mistake
				2070	the \r\n sequence for a single DOS line ending and erroneously
				2071	issue the "No newline at end of file" diagnostic. */
Tom Tromey	30b0edc	2006-12-28 18:45:48 +0000	[diff] [blame]	2072	if (to.len && to.text[to.len - 1] == '\r')
Devang Patel	04c90ee	2005-02-19 11:48:02 -0800	[diff] [blame]	2073	to.text[to.len] = '\r';
				2074	else
				2075	to.text[to.len] = '\n';
				2076
Tom Tromey	688e7a5	2008-04-21 14:02:00 +0000	[diff] [blame]	2077	buffer = to.text;
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	2078	*st_size = to.len;
Tom Tromey	688e7a5	2008-04-21 14:02:00 +0000	[diff] [blame]	2079	#if HOST_CHARSET == HOST_CHARSET_ASCII
				2080	/* The HOST_CHARSET test just above ensures that the source charset
				2081	is UTF-8. So, ignore a UTF-8 BOM if we see one. Note that
				2082	glib'c UTF-8 iconv() provider (as of glibc 2.7) does not ignore a
				2083	BOM -- however, even if it did, we would still need this code due
				2084	to the 'convert_no_conversion' case. */
				2085	if (to.len >= 3 && to.text[0] == 0xef && to.text[1] == 0xbb
				2086	&& to.text[2] == 0xbf)
				2087	{
				2088	*st_size -= 3;
				2089	buffer += 3;
				2090	}
				2091	#endif
				2092
				2093	*buffer_start = to.text;
				2094	return buffer;
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	2095	}
				2096
Zack Weinberg	a29f62d	2004-09-18 00:56:19 +0000	[diff] [blame]	2097	/* Decide on the default encoding to assume for input files. */
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	2098	const char *
				2099	_cpp_default_encoding (void)
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	2100	{
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	2101	const char *current_encoding = NULL;
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	2102
Paolo Bonzini	4f4e53dd	2004-05-24 10:50:45 +0000	[diff] [blame]	2103	/* We disable this because the default codeset is 7-bit ASCII on
				2104	most platforms, and this causes conversion failures on every
				2105	file in GCC that happens to have one of the upper 128 characters
				2106	in it -- most likely, as part of the name of a contributor.
				2107	We should definitely recognize in-band markers of file encoding,
				2108	like:
				2109	- the appropriate Unicode byte-order mark (FE FF) to recognize
				2110	UTF16 and UCS4 (in both big-endian and little-endian flavors)
				2111	and UTF8
Zack Weinberg	c6e8380	2004-06-05 20:58:06 +0000	[diff] [blame]	2112	- a "#i", "#d", "/ *", "//", " #p" or "#p" (for #pragma) to
Paolo Bonzini	4f4e53dd	2004-05-24 10:50:45 +0000	[diff] [blame]	2113	distinguish ASCII and EBCDIC.
				2114	- now we can parse something like "#pragma GCC encoding <xyz>
				2115	on the first line, or even Emacs/VIM's mode line tags (there's
				2116	a problem here in that VIM uses the last line, and Emacs has
Zack Weinberg	a29f62d	2004-09-18 00:56:19 +0000	[diff] [blame]	2117	its more elaborate "local variables" convention).
Paolo Bonzini	4f4e53dd	2004-05-24 10:50:45 +0000	[diff] [blame]	2118	- investigate whether Java has another common convention, which
				2119	would be friendly to support.
				2120	(Zack Weinberg and Paolo Bonzini, May 20th 2004) */
				2121	#if defined (HAVE_LOCALE_H) && defined (HAVE_LANGINFO_CODESET) && 0
Eric Christopher	16dd5cf	2004-02-02 20:20:58 +0000	[diff] [blame]	2122	setlocale (LC_CTYPE, "");
				2123	current_encoding = nl_langinfo (CODESET);
				2124	#endif
				2125	if (current_encoding == NULL \|\| *current_encoding == '\0')
				2126	current_encoding = SOURCE_CHARSET;
				2127
				2128	return current_encoding;
Eric Christopher	cf551fb	2004-01-16 22:37:49 +0000	[diff] [blame]	2129	}
David Malcolm	88fa555	2016-08-05 18:08:33 +0000	[diff] [blame]	2130
				2131	/* Implementation of class cpp_string_location_reader. */
				2132
				2133	/* Constructor for cpp_string_location_reader. */
				2134
				2135	cpp_string_location_reader::
				2136	cpp_string_location_reader (source_location src_loc,
				2137	line_maps *line_table)
				2138	: m_line_table (line_table)
				2139	{
				2140	src_loc = get_range_from_loc (line_table, src_loc).m_start;
				2141
				2142	/* SRC_LOC might be a macro location. It only makes sense to do
				2143	column-by-column calculations on ordinary maps, so get the
				2144	corresponding location in an ordinary map. */
				2145	m_loc
				2146	= linemap_resolve_location (line_table, src_loc,
				2147	LRK_SPELLING_LOCATION, NULL);
				2148
				2149	const line_map_ordinary *map
				2150	= linemap_check_ordinary (linemap_lookup (line_table, m_loc));
				2151	m_offset_per_column = (1 << map->m_range_bits);
				2152	}
				2153
				2154	/* Get the range of the next source byte. */
				2155
				2156	source_range
				2157	cpp_string_location_reader::get_next ()
				2158	{
				2159	source_range result;
				2160	result.m_start = m_loc;
				2161	result.m_finish = m_loc;
				2162	if (m_loc <= LINE_MAP_MAX_LOCATION_WITH_COLS)
				2163	m_loc += m_offset_per_column;
				2164	return result;
				2165	}