aboutsummaryrefslogtreecommitdiff
path: root/util/unicode.c
diff options
context:
space:
mode:
authorMarkus Armbruster <armbru@redhat.com>2018-08-23 18:39:49 +0200
committerMarkus Armbruster <armbru@redhat.com>2018-08-24 20:26:37 +0200
commite59f39d40397645477b959255aedfa17a7c9c779 (patch)
tree9d1bbd816632aa79e9015ddd47eb7a827b1787f0 /util/unicode.c
parenta89d3104a29c400dfed4b675d6385a17223f9e0f (diff)
json: Reject invalid UTF-8 sequences
We reject bytes that can't occur in valid UTF-8 (\xC0..\xC1, \xF5..\xFF) in the lexer. That's insufficient; there's plenty of invalid UTF-8 not containing these bytes, as demonstrated by check-qjson: * Malformed sequences - Unexpected continuation bytes - Missing continuation bytes after start bytes other than \xC0..\xC1, \xF5..\xFD. * Overlong sequences with start bytes other than \xC0..\xC1, \xF5..\xFD. * Invalid code points Fixing this in the lexer would be bothersome. Fixing it in the parser is straightforward, so do that. Signed-off-by: Markus Armbruster <armbru@redhat.com> Reviewed-by: Eric Blake <eblake@redhat.com> Message-Id: <20180823164025.12553-23-armbru@redhat.com>
Diffstat (limited to 'util/unicode.c')
-rw-r--r--util/unicode.c69
1 files changed, 62 insertions, 7 deletions
diff --git a/util/unicode.c b/util/unicode.c
index a812a35171..8580bc598b 100644
--- a/util/unicode.c
+++ b/util/unicode.c
@@ -13,6 +13,21 @@
#include "qemu/osdep.h"
#include "qemu/unicode.h"
+/*
+ * Tell whether @codepoint is acceptable: it must lie within
+ * 0..0x10FFFF and be neither a surrogate nor a noncharacter.
+ * Negative values are rejected as out of range.
+ */
+static bool is_valid_codepoint(int codepoint)
+{
+    bool out_of_range = codepoint < 0 || codepoint > 0x10FFFF;
+    bool surrogate = codepoint >= 0xD800 && codepoint <= 0xDFFF;
+    /* U+FDD0..U+FDEF, plus the last two code points of every plane */
+    bool noncharacter = (codepoint >= 0xFDD0 && codepoint <= 0xFDEF)
+        || (codepoint & 0xFFFE) == 0xFFFE;
+
+    return !(out_of_range || surrogate || noncharacter);
+}
+
/**
* mod_utf8_codepoint:
* @s: string encoded in modified UTF-8
@@ -83,13 +98,8 @@ int mod_utf8_codepoint(const char *s, size_t n, char **end)
cp <<= 6;
cp |= byte & 0x3F;
}
- if (cp > 0x10FFFF) {
- cp = -1; /* beyond Unicode range */
- } else if ((cp >= 0xFDD0 && cp <= 0xFDEF)
- || (cp & 0xFFFE) == 0xFFFE) {
- cp = -1; /* noncharacter */
- } else if (cp >= 0xD800 && cp <= 0xDFFF) {
- cp = -1; /* surrogate code point */
+ if (!is_valid_codepoint(cp)) {
+ cp = -1;
} else if (cp < min_cp[len - 2] && !(cp == 0 && len == 2)) {
cp = -1; /* overlong, not \xC0\x80 */
}
@@ -99,3 +109,48 @@ out:
*end = (char *)p;
return cp;
}
+
+/**
+ * mod_utf8_encode:
+ * @buf: buffer receiving the encoded, NUL-terminated sequence
+ * @bufsz: capacity of @buf in bytes; must be 5 or more
+ * @codepoint: Unicode code point to convert
+ *
+ * Encode @codepoint as modified UTF-8 into @buf and append a
+ * terminating zero byte.  Code point 0 is encoded as the two-byte
+ * sequence "\xC0\x80" rather than as a single zero byte.
+ *
+ * Returns: number of bytes in the sequence (1 to 4, not counting the
+ * terminator), or -1 if @codepoint is not a valid code point; @buf is
+ * not written in that case.
+ */
+ssize_t mod_utf8_encode(char buf[], size_t bufsz, int codepoint)
+{
+    int len, lead, i;
+
+    assert(bufsz >= 5);
+
+    if (!is_valid_codepoint(codepoint)) {
+        return -1;
+    }
+
+    /* Pick the sequence length and the marker bits of the first byte */
+    if (codepoint >= 1 && codepoint < 0x80) {
+        len = 1;
+        lead = 0x00;
+    } else if (codepoint < 0x800) {
+        len = 2;                /* also covers 0, giving \xC0\x80 */
+        lead = 0xC0;
+    } else if (codepoint < 0x10000) {
+        len = 3;
+        lead = 0xE0;
+    } else {
+        len = 4;
+        lead = 0xF0;
+    }
+
+    /* Fill continuation bytes back to front, six payload bits each */
+    buf[len] = 0;
+    for (i = len - 1; i > 0; i--) {
+        buf[i] = 0x80 | (codepoint & 0x3F);
+        codepoint >>= 6;
+    }
+    /* The bits still left in @codepoint fit beside the lead marker */
+    buf[0] = lead | codepoint;
+    return len;
+}