From 0f07a5d5f1f484c9c334d52193617e89442da7c9 Mon Sep 17 00:00:00 2001 From: Markus Armbruster Date: Fri, 31 Aug 2018 09:58:39 +0200 Subject: json: Nicer recovery from lexical errors When the lexer chokes on an input character, it consumes the character, emits a JSON error token, and enters its start state. This can lead to suboptimal error recovery. For instance, input 0123 , produces the tokens JSON_ERROR 01 JSON_INTEGER 23 JSON_COMMA , Make the lexer skip characters after a lexical error until a structural character ('[', ']', '{', '}', ':', ','), an ASCII control character, or '\xFE', or '\xFF'. Note that we must not skip ASCII control characters, '\xFE', '\xFF', because those are documented to force the JSON parser into known-good state, by docs/interop/qmp-spec.txt. The lexer now produces JSON_ERROR 01 JSON_COMMA , Update qmp-test for the nicer error recovery: QMP now reports just one error for input %p instead of two. Also drop the newline after %p; it was needed to tease out the second error. Signed-off-by: Markus Armbruster Reviewed-by: Eric Blake Message-Id: <20180831075841.13363-5-armbru@redhat.com> [Conflict with commit ebb4d82d888 resolved] --- qobject/json-lexer.c | 43 +++++++++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 14 deletions(-) (limited to 'qobject') diff --git a/qobject/json-lexer.c b/qobject/json-lexer.c index 28582e17d9..39c7ce7adc 100644 --- a/qobject/json-lexer.c +++ b/qobject/json-lexer.c @@ -101,6 +101,7 @@ enum json_lexer_state { IN_ERROR = 0, /* must really be 0, see json_lexer[] */ + IN_RECOVERY, IN_DQ_STRING_ESCAPE, IN_DQ_STRING, IN_SQ_STRING_ESCAPE, @@ -130,6 +131,28 @@ QEMU_BUILD_BUG_ON(IN_START_INTERP != IN_START + 1); static const uint8_t json_lexer[][256] = { /* Relies on default initialization to IN_ERROR! */ + /* error recovery */ + [IN_RECOVERY] = { + /* + * Skip characters until a structural character, an ASCII + * control character other than '\t', or impossible UTF-8 + * bytes '\xFE', '\xFF'. Structural characters and line + * endings are promising resynchronization points. Clients + * may use the others to force the JSON parser into known-good + * state; see docs/interop/qmp-spec.txt. + */ + [0 ... 0x1F] = IN_START | LOOKAHEAD, + [0x20 ... 0xFD] = IN_RECOVERY, + [0xFE ... 0xFF] = IN_START | LOOKAHEAD, + ['\t'] = IN_RECOVERY, + ['['] = IN_START | LOOKAHEAD, + [']'] = IN_START | LOOKAHEAD, + ['{'] = IN_START | LOOKAHEAD, + ['}'] = IN_START | LOOKAHEAD, + [':'] = IN_START | LOOKAHEAD, + [','] = IN_START | LOOKAHEAD, + }, + /* double quote string */ [IN_DQ_STRING_ESCAPE] = { [0x20 ... 0xFD] = IN_DQ_STRING, @@ -301,26 +324,18 @@ static void json_lexer_feed_char(JSONLexer *lexer, char ch, bool flush) /* fall through */ case JSON_SKIP: g_string_truncate(lexer->token, 0); + /* fall through */ + case IN_START: new_state = lexer->start_state; break; case IN_ERROR: - /* XXX: To avoid having previous bad input leaving the parser in an - * unresponsive state where we consume unpredictable amounts of - * subsequent "good" input, percolate this error state up to the - * parser by emitting a JSON_ERROR token, then reset lexer state. - * - * Also note that this handling is required for reliable channel - * negotiation between QMP and the guest agent, since chr(0xFF) - * is placed at the beginning of certain events to ensure proper - * delivery when the channel is in an unknown state. chr(0xFF) is - * never a valid ASCII/UTF-8 sequence, so this should reliably - * induce an error/flush state. - */ json_message_process_token(lexer, lexer->token, JSON_ERROR, lexer->x, lexer->y); + new_state = IN_RECOVERY; + /* fall through */ + case IN_RECOVERY: g_string_truncate(lexer->token, 0); - lexer->state = lexer->start_state; - return; + break; default: break; } -- cgit v1.2.3