Index: gcc/ChangeLog
2005-03-14 Geoffrey Keating <geoffk@apple.com>
* doc/cppopts.texi (-fexec-charset): Add concept index entry.
(-fwide-exec-charset): Likewise.
(-finput-charset): Likewise.
* doc/invoke.texi (Warning Options): Document -Wnormalized=.
* c-opts.c (c_common_handle_option): Handle -Wnormalized=.
* c.opt (Wnormalized): New.
Index: libcpp/ChangeLog
2005-03-14 Geoffrey Keating <geoffk@apple.com>
* init.c (cpp_create_reader): Default warn_normalize to normalized_C.
* charset.c: Update for new format of ucnid.h.
(ucn_valid_in_identifier): Update for new format of ucnid.h.
Add NST parameter, and update it; update callers.
(cpp_valid_ucn): Add NST parameter, update callers. Replace abort
with cpp_error.
(convert_ucn): Pass normalize_state to cpp_valid_ucn.
* internal.h (struct normalize_state): New.
(INITIAL_NORMALIZE_STATE): New.
(NORMALIZE_STATE_RESULT): New.
(NORMALIZE_STATE_UPDATE_IDNUM): New.
(_cpp_valid_ucn): New.
* lex.c (warn_about_normalization): New.
(forms_identifier_p): Add normalize_state parameter, update callers.
(lex_identifier): Add normalize_state parameter, update callers. Keep
the state current.
(lex_number): Likewise.
(_cpp_lex_direct): Pass normalize_state to subroutines. Check
it with warn_about_normalization.
* makeucnid.c: New.
* ucnid.h: Replace.
* ucnid.pl: Remove.
* ucnid.tab: Make appropriate for input to makeucnid.c. Remove
comments about obsolete version of C++.
* include/cpplib.h (enum cpp_normalize_level): New.
(struct cpp_options): Add warn_normalize field.
Index: gcc/testsuite/ChangeLog
2005-03-14 Geoffrey Keating <geoffk@apple.com>
* gcc.dg/cpp/normalize-1.c: New.
* gcc.dg/cpp/normalize-2.c: New.
* gcc.dg/cpp/normalize-3.c: New.
* gcc.dg/cpp/normalize-4.c: New.
* gcc.dg/cpp/ucnid-4.c: New.
* gcc.dg/cpp/ucnid-5.c: New.
* g++.dg/cpp/normalize-1.C: New.
* g++.dg/cpp/ucnid-1.C: New.
From-SVN: r96459
diff --git a/libcpp/charset.c b/libcpp/charset.c
index cd25f10..f028b37 100644
--- a/libcpp/charset.c
+++ b/libcpp/charset.c
@@ -22,7 +22,6 @@
#include "system.h"
#include "cpplib.h"
#include "internal.h"
-#include "ucnid.h"
/* Character set handling for C-family languages.
@@ -786,43 +785,128 @@
return ((size_t) 1 << width) - 1;
}
+/* A large table of unicode character information. */
+enum {
+ /* Valid in a C99 identifier? */
+ C99 = 1,
+ /* Valid in a C99 identifier, but not as the first character? */
+ DIG = 2,
+ /* Valid in a C++ identifier? */
+ CXX = 4,
+ /* NFC representation is not valid in an identifier? */
+ CID = 8,
+ /* Might be valid NFC form? */
+ NFC = 16,
+ /* Might be valid NFKC form? */
+ NKC = 32,
+ /* Certain preceding characters might make it not valid NFC/NKFC form? */
+ CTX = 64
+};
+
+static const struct {
+ /* Bitmap of flags above. */
+ unsigned char flags;
+ /* Combining class of the character. */
+ unsigned char combine;
+ /* Last character in the range described by this entry. */
+ unsigned short end;
+} ucnranges[] = {
+#include "ucnid.h"
+};
+
/* Returns 1 if C is valid in an identifier, 2 if C is valid except at
the start of an identifier, and 0 if C is not valid in an
identifier. We assume C has already gone through the checks of
- _cpp_valid_ucn. The algorithm is a simple binary search on the
- table defined in cppucnid.h. */
+ _cpp_valid_ucn. Also update NST for C if returning nonzero. The
+ algorithm is a simple binary search on the table defined in
+ ucnid.h. */
static int
-ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c)
+ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,
+ struct normalize_state *nst)
{
int mn, mx, md;
- mn = -1;
- mx = ARRAY_SIZE (ucnranges);
- while (mx - mn > 1)
+ if (c > 0xFFFF)
+ return 0;
+
+ mn = 0;
+ mx = ARRAY_SIZE (ucnranges) - 1;
+ while (mx != mn)
{
md = (mn + mx) / 2;
- if (c < ucnranges[md].lo)
+ if (c <= ucnranges[md].end)
mx = md;
- else if (c > ucnranges[md].hi)
- mn = md;
else
- goto found;
+ mn = md + 1;
}
- return 0;
- found:
/* When -pedantic, we require the character to have been listed by
the standard for the current language. Otherwise, we accept the
union of the acceptable sets for C++98 and C99. */
+ if (! (ucnranges[mn].flags & (C99 | CXX)))
+ return 0;
+
if (CPP_PEDANTIC (pfile)
- && ((CPP_OPTION (pfile, c99) && !(ucnranges[md].flags & C99))
+ && ((CPP_OPTION (pfile, c99) && !(ucnranges[mn].flags & C99))
|| (CPP_OPTION (pfile, cplusplus)
- && !(ucnranges[md].flags & CXX))))
+ && !(ucnranges[mn].flags & CXX))))
return 0;
+ /* Update NST. */
+ if (ucnranges[mn].combine != 0 && ucnranges[mn].combine < nst->prev_class)
+ nst->level = normalized_none;
+ else if (ucnranges[mn].flags & CTX)
+ {
+ bool safe;
+ cppchar_t p = nst->previous;
+
+ /* Easy cases from Bengali, Oriya, Tamil, Jannada, and Malayalam. */
+ if (c == 0x09BE)
+ safe = p != 0x09C7; /* Use 09CB instead of 09C7 09BE. */
+ else if (c == 0x0B3E)
+ safe = p != 0x0B47; /* Use 0B4B instead of 0B47 0B3E. */
+ else if (c == 0x0BBE)
+ safe = p != 0x0BC6 && p != 0x0BC7; /* Use 0BCA/0BCB instead. */
+ else if (c == 0x0CC2)
+ safe = p != 0x0CC6; /* Use 0CCA instead of 0CC6 0CC2. */
+ else if (c == 0x0D3E)
+ safe = p != 0x0D46 && p != 0x0D47; /* Use 0D4A/0D4B instead. */
+ /* For Hangul, characters in the range AC00-D7A3 are NFC/NFKC,
+ and are combined algorithmically from a sequence of the form
+ 1100-1112 1161-1175 11A8-11C2
+ (if the third is not present, it is treated as 11A7, which is not
+ really a valid character).
+ Unfortunately, C99 allows (only) the NFC form, but C++ allows
+ only the combining characters. */
+ else if (c >= 0x1161 && c <= 0x1175)
+ safe = p < 0x1100 || p > 0x1112;
+ else if (c >= 0x11A8 && c <= 0x11C2)
+ safe = (p < 0xAC00 || p > 0xD7A3 || (p - 0xAC00) % 28 != 0);
+ else
+ {
+ /* Uh-oh, someone updated ucnid.h without updating this code. */
+ cpp_error (pfile, CPP_DL_ICE, "Character %x might not be NFKC", c);
+ safe = true;
+ }
+ if (!safe && c < 0x1161)
+ nst->level = normalized_none;
+ else if (!safe)
+ nst->level = MAX (nst->level, normalized_identifier_C);
+ }
+ else if (ucnranges[mn].flags & NKC)
+ ;
+ else if (ucnranges[mn].flags & NFC)
+ nst->level = MAX (nst->level, normalized_C);
+ else if (ucnranges[mn].flags & CID)
+ nst->level = MAX (nst->level, normalized_identifier_C);
+ else
+ nst->level = normalized_none;
+ nst->previous = c;
+ nst->prev_class = ucnranges[mn].combine;
+
/* In C99, UCN digits may not begin identifiers. */
- if (CPP_OPTION (pfile, c99) && (ucnranges[md].flags & DIG))
+ if (CPP_OPTION (pfile, c99) && (ucnranges[mn].flags & DIG))
return 2;
return 1;
@@ -853,7 +937,8 @@
cppchar_t
_cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr,
- const uchar *limit, int identifier_pos)
+ const uchar *limit, int identifier_pos,
+ struct normalize_state *nst)
{
cppchar_t result, c;
unsigned int length;
@@ -873,7 +958,10 @@
else if (str[-1] == 'U')
length = 8;
else
- abort();
+ {
+ cpp_error (pfile, CPP_DL_ICE, "In _cpp_valid_ucn but not a UCN");
+ length = 4;
+ }
result = 0;
do
@@ -915,10 +1003,11 @@
CPP_OPTION (pfile, warn_dollars) = 0;
cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
}
+ NORMALIZE_STATE_UPDATE_IDNUM (nst);
}
else if (identifier_pos)
{
- int validity = ucn_valid_in_identifier (pfile, result);
+ int validity = ucn_valid_in_identifier (pfile, result, nst);
if (validity == 0)
cpp_error (pfile, CPP_DL_ERROR,
@@ -950,9 +1039,10 @@
int rval;
struct cset_converter cvt
= wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
+ struct normalize_state nst = INITIAL_NORMALIZE_STATE;
from++; /* Skip u/U. */
- ucn = _cpp_valid_ucn (pfile, &from, limit, 0);
+ ucn = _cpp_valid_ucn (pfile, &from, limit, 0, &nst);
rval = one_cppchar_to_utf8 (ucn, &bufp, &bytesleft);
if (rval)