diff options
Diffstat (limited to 'include/lib/modernjson/detail/input/lexer.hpp')
-rw-r--r-- | include/lib/modernjson/detail/input/lexer.hpp | 237 |
1 files changed, 205 insertions, 32 deletions
diff --git a/include/lib/modernjson/detail/input/lexer.hpp b/include/lib/modernjson/detail/input/lexer.hpp index 9606211..b61e289 100644 --- a/include/lib/modernjson/detail/input/lexer.hpp +++ b/include/lib/modernjson/detail/input/lexer.hpp @@ -10,6 +10,7 @@ #include <lib/modernjson/detail/macro_scope.hpp> #include <lib/modernjson/detail/input/input_adapters.hpp> +#include <lib/modernjson/detail/input/position_t.hpp> namespace nlohmann { @@ -104,7 +105,10 @@ class lexer // delete because of pointer members lexer(const lexer&) = delete; + lexer(lexer&&) = delete; lexer& operator=(lexer&) = delete; + lexer& operator=(lexer&&) = delete; + ~lexer() = default; private: ///////////////////// @@ -393,39 +397,194 @@ class lexer // invalid control characters case 0x00: + { + error_message = "invalid string: control character U+0000 (NUL) must be escaped to \\u0000"; + return token_type::parse_error; + } + case 0x01: + { + error_message = "invalid string: control character U+0001 (SOH) must be escaped to \\u0001"; + return token_type::parse_error; + } + case 0x02: + { + error_message = "invalid string: control character U+0002 (STX) must be escaped to \\u0002"; + return token_type::parse_error; + } + case 0x03: + { + error_message = "invalid string: control character U+0003 (ETX) must be escaped to \\u0003"; + return token_type::parse_error; + } + case 0x04: + { + error_message = "invalid string: control character U+0004 (EOT) must be escaped to \\u0004"; + return token_type::parse_error; + } + case 0x05: + { + error_message = "invalid string: control character U+0005 (ENQ) must be escaped to \\u0005"; + return token_type::parse_error; + } + case 0x06: + { + error_message = "invalid string: control character U+0006 (ACK) must be escaped to \\u0006"; + return token_type::parse_error; + } + case 0x07: + { + error_message = "invalid string: control character U+0007 (BEL) must be escaped to \\u0007"; + return token_type::parse_error; + } + case 0x08: + { + error_message = "invalid string: control character U+0008 (BS) must be escaped to \\u0008 or \\b"; + return token_type::parse_error; + } + case 0x09: + { + error_message = "invalid string: control character U+0009 (HT) must be escaped to \\u0009 or \\t"; + return token_type::parse_error; + } + case 0x0A: + { + error_message = "invalid string: control character U+000A (LF) must be escaped to \\u000A or \\n"; + return token_type::parse_error; + } + case 0x0B: + { + error_message = "invalid string: control character U+000B (VT) must be escaped to \\u000B"; + return token_type::parse_error; + } + case 0x0C: + { + error_message = "invalid string: control character U+000C (FF) must be escaped to \\u000C or \\f"; + return token_type::parse_error; + } + case 0x0D: + { + error_message = "invalid string: control character U+000D (CR) must be escaped to \\u000D or \\r"; + return token_type::parse_error; + } + case 0x0E: + { + error_message = "invalid string: control character U+000E (SO) must be escaped to \\u000E"; + return token_type::parse_error; + } + case 0x0F: + { + error_message = "invalid string: control character U+000F (SI) must be escaped to \\u000F"; + return token_type::parse_error; + } + case 0x10: + { + error_message = "invalid string: control character U+0010 (DLE) must be escaped to \\u0010"; + return token_type::parse_error; + } + case 0x11: + { + error_message = "invalid string: control character U+0011 (DC1) must be escaped to \\u0011"; + return token_type::parse_error; + } + case 0x12: + { + error_message = "invalid string: control character U+0012 (DC2) must be escaped to \\u0012"; + return token_type::parse_error; + } + case 0x13: + { + error_message = "invalid string: control character U+0013 (DC3) must be escaped to \\u0013"; + return token_type::parse_error; + } + case 0x14: + { + error_message = "invalid string: control character U+0014 (DC4) must be escaped to \\u0014"; + return token_type::parse_error; + } + case 0x15: + { + error_message = "invalid string: control character U+0015 (NAK) must be escaped to \\u0015"; + return token_type::parse_error; + } + case 0x16: + { + error_message = "invalid string: control character U+0016 (SYN) must be escaped to \\u0016"; + return token_type::parse_error; + } + case 0x17: + { + error_message = "invalid string: control character U+0017 (ETB) must be escaped to \\u0017"; + return token_type::parse_error; + } + case 0x18: + { + error_message = "invalid string: control character U+0018 (CAN) must be escaped to \\u0018"; + return token_type::parse_error; + } + case 0x19: + { + error_message = "invalid string: control character U+0019 (EM) must be escaped to \\u0019"; + return token_type::parse_error; + } + case 0x1A: + { + error_message = "invalid string: control character U+001A (SUB) must be escaped to \\u001A"; + return token_type::parse_error; + } + case 0x1B: + { + error_message = "invalid string: control character U+001B (ESC) must be escaped to \\u001B"; + return token_type::parse_error; + } + case 0x1C: + { + error_message = "invalid string: control character U+001C (FS) must be escaped to \\u001C"; + return token_type::parse_error; + } + case 0x1D: + { + error_message = "invalid string: control character U+001D (GS) must be escaped to \\u001D"; + return token_type::parse_error; + } + case 0x1E: + { + error_message = "invalid string: control character U+001E (RS) must be escaped to \\u001E"; + return token_type::parse_error; + } + case 0x1F: { - error_message = "invalid string: control character must be escaped"; + error_message = "invalid string: control character U+001F (US) must be escaped to \\u001F"; return token_type::parse_error; } @@ -709,7 +868,7 @@ class lexer locale's decimal point is used instead of `.` to work with the locale-dependent converters. */ - token_type scan_number() + token_type scan_number() // lgtm [cpp/use-of-goto] { // reset token_buffer to store the number's bytes reset(); @@ -1082,7 +1241,9 @@ scan_number_done: */ std::char_traits<char>::int_type get() { - ++chars_read; + ++position.chars_read_total; + ++position.chars_read_current_line; + if (next_unget) { // just reset the next_unget variable and work with current @@ -1097,6 +1258,13 @@ scan_number_done: { token_string.push_back(std::char_traits<char>::to_char_type(current)); } + + if (current == '\n') + { + ++position.lines_read; + ++position.chars_read_current_line = 0; + } + return current; } @@ -1104,14 +1272,29 @@ scan_number_done: @brief unget current character (read it again on next get) We implement unget by setting variable next_unget to true. The input is not - changed - we just simulate ungetting by modifying chars_read and - token_string. The next call to get() will behave as if the unget character - is read again. + changed - we just simulate ungetting by modifying chars_read_total, + chars_read_current_line, and token_string. The next call to get() will + behave as if the unget character is read again. */ void unget() { next_unget = true; - --chars_read; + + --position.chars_read_total; + + // in case we "unget" a newline, we have to also decrement the lines_read + if (position.chars_read_current_line == 0) + { + if (position.lines_read > 0) + { + --position.lines_read; + } + } + else + { + --position.chars_read_current_line; + } + if (JSON_LIKELY(current != std::char_traits<char>::eof())) { assert(token_string.size() != 0); @@ -1159,9 +1342,9 @@ scan_number_done: ///////////////////// /// return position of last read token - constexpr std::size_t get_position() const noexcept + constexpr position_t get_position() const noexcept { - return chars_read; + return position; } /// return the last read token (for errors only). Will never contain EOF @@ -1177,7 +1360,7 @@ scan_number_done: { // escape control characters char cs[9]; - snprintf(cs, 9, "<U+%.4X>", static_cast<unsigned char>(c)); + (std::snprintf)(cs, 9, "<U+%.4X>", static_cast<unsigned char>(c)); result += cs; } else @@ -1208,30 +1391,20 @@ scan_number_done: { if (get() == 0xEF) { - if (get() == 0xBB and get() == 0xBF) - { - // we completely parsed the BOM - return true; - } - else - { - // after reading 0xEF, an unexpected character followed - return false; - } - } - else - { - // the first character is not the beginning of the BOM; unget it to - // process is later - unget(); - return true; + // check if we completely parse the BOM + return get() == 0xBB and get() == 0xBF; } + + // the first character is not the beginning of the BOM; unget it to + // process is later + unget(); + return true; } token_type scan() { // initially, skip the BOM - if (chars_read == 0 and not skip_bom()) + if (position.chars_read_total == 0 and not skip_bom()) { error_message = "invalid BOM; must be 0xEF 0xBB 0xBF if given"; return token_type::parse_error; @@ -1309,8 +1482,8 @@ scan_number_done: /// whether the next get() call should just return current bool next_unget = false; - /// the number of characters read - std::size_t chars_read = 0; + /// the start position of the current token + position_t position; /// raw input token string (for error messages) std::vector<char> token_string {}; @@ -1329,5 +1502,5 @@ scan_number_done: /// the decimal point const char decimal_point_char = '.'; }; -} -} +} // namespace detail +} // namespace nlohmann |