diff options
author | Daniel Friesel <derf@finalrewind.org> | 2018-11-26 09:06:31 +0100 |
---|---|---|
committer | Daniel Friesel <derf@finalrewind.org> | 2018-11-26 09:06:31 +0100 |
commit | 1542f34f0e0fc53324f6fdc5905f4b77b252a789 (patch) | |
tree | d2126bc53f8759c36809ff25b9ae3a19fd7aa362 /include/lib/modernjson/detail/input | |
parent | e7711c06640f098323cab80934c198090e9120a3 (diff) |
update nlohmann modernjson to v3.4 (with bson support)
Diffstat (limited to 'include/lib/modernjson/detail/input')
-rw-r--r-- | include/lib/modernjson/detail/input/binary_reader.hpp | 1025 | ||||
-rw-r--r-- | include/lib/modernjson/detail/input/input_adapters.hpp | 169 | ||||
-rw-r--r-- | include/lib/modernjson/detail/input/json_sax.hpp | 105 | ||||
-rw-r--r-- | include/lib/modernjson/detail/input/lexer.hpp | 237 | ||||
-rw-r--r-- | include/lib/modernjson/detail/input/parser.hpp | 60 | ||||
-rw-r--r-- | include/lib/modernjson/detail/input/position_t.hpp | 27 |
6 files changed, 1073 insertions, 550 deletions
diff --git a/include/lib/modernjson/detail/input/binary_reader.hpp b/include/lib/modernjson/detail/input/binary_reader.hpp index 103cbf7..637569a 100644 --- a/include/lib/modernjson/detail/input/binary_reader.hpp +++ b/include/lib/modernjson/detail/input/binary_reader.hpp @@ -68,6 +68,10 @@ class binary_reader switch (format) { + case input_format_t::bson: + result = parse_bson_internal(); + break; + case input_format_t::cbor: result = parse_cbor_internal(); break; @@ -100,7 +104,8 @@ class binary_reader if (JSON_UNLIKELY(current != std::char_traits<char>::eof())) { - return sax->parse_error(chars_read, get_token_string(), parse_error::create(110, chars_read, "expected end of input")); + return sax->parse_error(chars_read, get_token_string(), + parse_error::create(110, chars_read, exception_message(format, "expected end of input; last byte: 0x" + get_token_string(), "value"))); } } @@ -120,6 +125,225 @@ class binary_reader } private: + ////////// + // BSON // + ////////// + + /*! + @brief Reads in a BSON-object and passes it to the SAX-parser. + @return whether a valid BSON-value was passed to the SAX parser + */ + bool parse_bson_internal() + { + std::int32_t document_size; + get_number<std::int32_t, true>(input_format_t::bson, document_size); + + if (JSON_UNLIKELY(not sax->start_object(std::size_t(-1)))) + { + return false; + } + + if (JSON_UNLIKELY(not parse_bson_element_list(/*is_array*/false))) + { + return false; + } + + return sax->end_object(); + } + + /*! + @brief Parses a C-style string from the BSON input. + @param[in, out] result A reference to the string variable where the read + string is to be stored. + @return `true` if the \x00-byte indicating the end of the string was + encountered before the EOF; false` indicates an unexpected EOF. + */ + bool get_bson_cstr(string_t& result) + { + auto out = std::back_inserter(result); + while (true) + { + get(); + if (JSON_UNLIKELY(not unexpect_eof(input_format_t::bson, "cstring"))) + { + return false; + } + if (current == 0x00) + { + return true; + } + *out++ = static_cast<char>(current); + } + + return true; + } + + /*! + @brief Parses a zero-terminated string of length @a len from the BSON + input. + @param[in] len The length (including the zero-byte at the end) of the + string to be read. + @param[in, out] result A reference to the string variable where the read + string is to be stored. + @tparam NumberType The type of the length @a len + @pre len >= 1 + @return `true` if the string was successfully parsed + */ + template<typename NumberType> + bool get_bson_string(const NumberType len, string_t& result) + { + if (JSON_UNLIKELY(len < 1)) + { + auto last_token = get_token_string(); + return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read, exception_message(input_format_t::bson, "string length must be at least 1, is " + std::to_string(len), "string"))); + } + + return get_string(input_format_t::bson, len - static_cast<NumberType>(1), result) and get() != std::char_traits<char>::eof(); + } + + /*! + @brief Read a BSON document element of the given @a element_type. + @param[in] element_type The BSON element type, c.f. http://bsonspec.org/spec.html + @param[in] element_type_parse_position The position in the input stream, + where the `element_type` was read. + @warning Not all BSON element types are supported yet. An unsupported + @a element_type will give rise to a parse_error.114: + Unsupported BSON record type 0x... + @return whether a valid BSON-object/array was passed to the SAX parser + */ + bool parse_bson_element_internal(const int element_type, + const std::size_t element_type_parse_position) + { + switch (element_type) + { + case 0x01: // double + { + double number; + return get_number<double, true>(input_format_t::bson, number) and sax->number_float(static_cast<number_float_t>(number), ""); + } + + case 0x02: // string + { + std::int32_t len; + string_t value; + return get_number<std::int32_t, true>(input_format_t::bson, len) and get_bson_string(len, value) and sax->string(value); + } + + case 0x03: // object + { + return parse_bson_internal(); + } + + case 0x04: // array + { + return parse_bson_array(); + } + + case 0x08: // boolean + { + return sax->boolean(get() != 0); + } + + case 0x0A: // null + { + return sax->null(); + } + + case 0x10: // int32 + { + std::int32_t value; + return get_number<std::int32_t, true>(input_format_t::bson, value) and sax->number_integer(value); + } + + case 0x12: // int64 + { + std::int64_t value; + return get_number<std::int64_t, true>(input_format_t::bson, value) and sax->number_integer(value); + } + + default: // anything else not supported (yet) + { + char cr[3]; + (std::snprintf)(cr, sizeof(cr), "%.2hhX", static_cast<unsigned char>(element_type)); + return sax->parse_error(element_type_parse_position, std::string(cr), parse_error::create(114, element_type_parse_position, "Unsupported BSON record type 0x" + std::string(cr))); + } + } + } + + /*! + @brief Read a BSON element list (as specified in the BSON-spec) + + The same binary layout is used for objects and arrays, hence it must be + indicated with the argument @a is_array which one is expected + (true --> array, false --> object). + + @param[in] is_array Determines if the element list being read is to be + treated as an object (@a is_array == false), or as an + array (@a is_array == true). + @return whether a valid BSON-object/array was passed to the SAX parser + */ + bool parse_bson_element_list(const bool is_array) + { + string_t key; + while (int element_type = get()) + { + if (JSON_UNLIKELY(not unexpect_eof(input_format_t::bson, "element list"))) + { + return false; + } + + const std::size_t element_type_parse_position = chars_read; + if (JSON_UNLIKELY(not get_bson_cstr(key))) + { + return false; + } + + if (not is_array) + { + if (not sax->key(key)) + { + return false; + } + } + + if (JSON_UNLIKELY(not parse_bson_element_internal(element_type, element_type_parse_position))) + { + return false; + } + + // get_bson_cstr only appends + key.clear(); + } + + return true; + } + + /*! + @brief Reads an array from the BSON input and passes it to the SAX-parser. + @return whether a valid BSON-array was passed to the SAX parser + */ + bool parse_bson_array() + { + std::int32_t document_size; + get_number<std::int32_t, true>(input_format_t::bson, document_size); + + if (JSON_UNLIKELY(not sax->start_array(std::size_t(-1)))) + { + return false; + } + + if (JSON_UNLIKELY(not parse_bson_element_list(/*is_array*/true))) + { + return false; + } + + return sax->end_array(); + } + + ////////// + // CBOR // + ////////// + /*! @param[in] get_char whether a new character should be retrieved from the input (true, default) or whether the last read @@ -133,7 +357,7 @@ class binary_reader { // EOF case std::char_traits<char>::eof(): - return unexpect_eof(); + return unexpect_eof(input_format_t::cbor, "value"); // Integer 0x00..0x17 (0..23) case 0x00: @@ -165,25 +389,25 @@ class binary_reader case 0x18: // Unsigned integer (one-byte uint8_t follows) { uint8_t number; - return get_number(number) and sax->number_unsigned(number); + return get_number(input_format_t::cbor, number) and sax->number_unsigned(number); } case 0x19: // Unsigned integer (two-byte uint16_t follows) { uint16_t number; - return get_number(number) and sax->number_unsigned(number); + return get_number(input_format_t::cbor, number) and sax->number_unsigned(number); } case 0x1A: // Unsigned integer (four-byte uint32_t follows) { uint32_t number; - return get_number(number) and sax->number_unsigned(number); + return get_number(input_format_t::cbor, number) and sax->number_unsigned(number); } case 0x1B: // Unsigned integer (eight-byte uint64_t follows) { uint64_t number; - return get_number(number) and sax->number_unsigned(number); + return get_number(input_format_t::cbor, number) and sax->number_unsigned(number); } // Negative integer -1-0x00..-1-0x17 (-1..-24) @@ -216,25 +440,25 @@ class binary_reader case 0x38: // Negative integer (one-byte uint8_t follows) { uint8_t number; - return get_number(number) and sax->number_integer(static_cast<number_integer_t>(-1) - number); + return get_number(input_format_t::cbor, number) and sax->number_integer(static_cast<number_integer_t>(-1) - number); } case 0x39: // Negative integer -1-n (two-byte uint16_t follows) { uint16_t number; - return get_number(number) and sax->number_integer(static_cast<number_integer_t>(-1) - number); + return get_number(input_format_t::cbor, number) and sax->number_integer(static_cast<number_integer_t>(-1) - number); } case 0x3A: // Negative integer -1-n (four-byte uint32_t follows) { uint32_t number; - return get_number(number) and sax->number_integer(static_cast<number_integer_t>(-1) - number); + return get_number(input_format_t::cbor, number) and sax->number_integer(static_cast<number_integer_t>(-1) - number); } case 0x3B: // Negative integer -1-n (eight-byte uint64_t follows) { uint64_t number; - return get_number(number) and sax->number_integer(static_cast<number_integer_t>(-1) + return get_number(input_format_t::cbor, number) and sax->number_integer(static_cast<number_integer_t>(-1) - static_cast<number_integer_t>(number)); } @@ -303,25 +527,25 @@ class binary_reader case 0x98: // array (one-byte uint8_t for n follows) { uint8_t len; - return get_number(len) and get_cbor_array(static_cast<std::size_t>(len)); + return get_number(input_format_t::cbor, len) and get_cbor_array(static_cast<std::size_t>(len)); } case 0x99: // array (two-byte uint16_t for n follow) { uint16_t len; - return get_number(len) and get_cbor_array(static_cast<std::size_t>(len)); + return get_number(input_format_t::cbor, len) and get_cbor_array(static_cast<std::size_t>(len)); } case 0x9A: // array (four-byte uint32_t for n follow) { uint32_t len; - return get_number(len) and get_cbor_array(static_cast<std::size_t>(len)); + return get_number(input_format_t::cbor, len) and get_cbor_array(static_cast<std::size_t>(len)); } case 0x9B: // array (eight-byte uint64_t for n follow) { uint64_t len; - return get_number(len) and get_cbor_array(static_cast<std::size_t>(len)); + return get_number(input_format_t::cbor, len) and get_cbor_array(static_cast<std::size_t>(len)); } case 0x9F: // array (indefinite length) @@ -357,25 +581,25 @@ class binary_reader case 0xB8: // map (one-byte uint8_t for n follows) { uint8_t len; - return get_number(len) and get_cbor_object(static_cast<std::size_t>(len)); + return get_number(input_format_t::cbor, len) and get_cbor_object(static_cast<std::size_t>(len)); } case 0xB9: // map (two-byte uint16_t for n follow) { uint16_t len; - return get_number(len) and get_cbor_object(static_cast<std::size_t>(len)); + return get_number(input_format_t::cbor, len) and get_cbor_object(static_cast<std::size_t>(len)); } case 0xBA: // map (four-byte uint32_t for n follow) { uint32_t len; - return get_number(len) and get_cbor_object(static_cast<std::size_t>(len)); + return get_number(input_format_t::cbor, len) and get_cbor_object(static_cast<std::size_t>(len)); } case 0xBB: // map (eight-byte uint64_t for n follow) { uint64_t len; - return get_number(len) and get_cbor_object(static_cast<std::size_t>(len)); + return get_number(input_format_t::cbor, len) and get_cbor_object(static_cast<std::size_t>(len)); } case 0xBF: // map (indefinite length) @@ -392,17 +616,20 @@ class binary_reader case 0xF9: // Half-Precision Float (two-byte IEEE 754) { - const int byte1 = get(); - if (JSON_UNLIKELY(not unexpect_eof())) + const int byte1_raw = get(); + if (JSON_UNLIKELY(not unexpect_eof(input_format_t::cbor, "number"))) { return false; } - const int byte2 = get(); - if (JSON_UNLIKELY(not unexpect_eof())) + const int byte2_raw = get(); + if (JSON_UNLIKELY(not unexpect_eof(input_format_t::cbor, "number"))) { return false; } + const auto byte1 = static_cast<unsigned char>(byte1_raw); + const auto byte2 = static_cast<unsigned char>(byte2_raw); + // code from RFC 7049, Appendix D, Figure 3: // As half-precision floating-point numbers were only added // to IEEE 754 in 2008, today's programming platforms often @@ -438,24 +665,209 @@ class binary_reader case 0xFA: // Single-Precision Float (four-byte IEEE 754) { float number; - return get_number(number) and sax->number_float(static_cast<number_float_t>(number), ""); + return get_number(input_format_t::cbor, number) and sax->number_float(static_cast<number_float_t>(number), ""); } case 0xFB: // Double-Precision Float (eight-byte IEEE 754) { double number; - return get_number(number) and sax->number_float(static_cast<number_float_t>(number), ""); + return get_number(input_format_t::cbor, number) and sax->number_float(static_cast<number_float_t>(number), ""); } default: // anything else (0xFF is handled inside the other types) { auto last_token = get_token_string(); - return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read, "error reading CBOR; last byte: 0x" + last_token)); + return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read, exception_message(input_format_t::cbor, "invalid byte: 0x" + last_token, "value"))); + } + } + } + + /*! + @brief reads a CBOR string + + This function first reads starting bytes to determine the expected + string length and then copies this number of bytes into a string. + Additionally, CBOR's strings with indefinite lengths are supported. + + @param[out] result created string + + @return whether string creation completed + */ + bool get_cbor_string(string_t& result) + { + if (JSON_UNLIKELY(not unexpect_eof(input_format_t::cbor, "string"))) + { + return false; + } + + switch (current) + { + // UTF-8 string (0x00..0x17 bytes follow) + case 0x60: + case 0x61: + case 0x62: + case 0x63: + case 0x64: + case 0x65: + case 0x66: + case 0x67: + case 0x68: + case 0x69: + case 0x6A: + case 0x6B: + case 0x6C: + case 0x6D: + case 0x6E: + case 0x6F: + case 0x70: + case 0x71: + case 0x72: + case 0x73: + case 0x74: + case 0x75: + case 0x76: + case 0x77: + { + return get_string(input_format_t::cbor, current & 0x1F, result); + } + + case 0x78: // UTF-8 string (one-byte uint8_t for n follows) + { + uint8_t len; + return get_number(input_format_t::cbor, len) and get_string(input_format_t::cbor, len, result); + } + + case 0x79: // UTF-8 string (two-byte uint16_t for n follow) + { + uint16_t len; + return get_number(input_format_t::cbor, len) and get_string(input_format_t::cbor, len, result); + } + + case 0x7A: // UTF-8 string (four-byte uint32_t for n follow) + { + uint32_t len; + return get_number(input_format_t::cbor, len) and get_string(input_format_t::cbor, len, result); + } + + case 0x7B: // UTF-8 string (eight-byte uint64_t for n follow) + { + uint64_t len; + return get_number(input_format_t::cbor, len) and get_string(input_format_t::cbor, len, result); + } + + case 0x7F: // UTF-8 string (indefinite length) + { + while (get() != 0xFF) + { + string_t chunk; + if (not get_cbor_string(chunk)) + { + return false; + } + result.append(chunk); + } + return true; + } + + default: + { + auto last_token = get_token_string(); + return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read, exception_message(input_format_t::cbor, "expected length specification (0x60-0x7B) or indefinite string type (0x7F); last byte: 0x" + last_token, "string"))); } } } /*! + @param[in] len the length of the array or std::size_t(-1) for an + array of indefinite size + @return whether array creation completed + */ + bool get_cbor_array(const std::size_t len) + { + if (JSON_UNLIKELY(not sax->start_array(len))) + { + return false; + } + + if (len != std::size_t(-1)) + { + for (std::size_t i = 0; i < len; ++i) + { + if (JSON_UNLIKELY(not parse_cbor_internal())) + { + return false; + } + } + } + else + { + while (get() != 0xFF) + { + if (JSON_UNLIKELY(not parse_cbor_internal(false))) + { + return false; + } + } + } + + return sax->end_array(); + } + + /*! + @param[in] len the length of the object or std::size_t(-1) for an + object of indefinite size + @return whether object creation completed + */ + bool get_cbor_object(const std::size_t len) + { + if (not JSON_UNLIKELY(sax->start_object(len))) + { + return false; + } + + string_t key; + if (len != std::size_t(-1)) + { + for (std::size_t i = 0; i < len; ++i) + { + get(); + if (JSON_UNLIKELY(not get_cbor_string(key) or not sax->key(key))) + { + return false; + } + + if (JSON_UNLIKELY(not parse_cbor_internal())) + { + return false; + } + key.clear(); + } + } + else + { + while (get() != 0xFF) + { + if (JSON_UNLIKELY(not get_cbor_string(key) or not sax->key(key))) + { + return false; + } + + if (JSON_UNLIKELY(not parse_cbor_internal())) + { + return false; + } + key.clear(); + } + } + + return sax->end_object(); + } + + ///////////// + // MsgPack // + ///////////// + + /*! @return whether a valid MessagePack value was passed to the SAX parser */ bool parse_msgpack_internal() @@ -464,7 +876,7 @@ class binary_reader { // EOF case std::char_traits<char>::eof(): - return unexpect_eof(); + return unexpect_eof(input_format_t::msgpack, "value"); // positive fixint case 0x00: @@ -685,61 +1097,61 @@ class binary_reader case 0xCA: // float 32 { float number; - return get_number(number) and sax->number_float(static_cast<number_float_t>(number), ""); + return get_number(input_format_t::msgpack, number) and sax->number_float(static_cast<number_float_t>(number), ""); } case 0xCB: // float 64 { double number; - return get_number(number) and sax->number_float(static_cast<number_float_t>(number), ""); + return get_number(input_format_t::msgpack, number) and sax->number_float(static_cast<number_float_t>(number), ""); } case 0xCC: // uint 8 { uint8_t number; - return get_number(number) and sax->number_unsigned(number); + return get_number(input_format_t::msgpack, number) and sax->number_unsigned(number); } case 0xCD: // uint 16 { uint16_t number; - return get_number(number) and sax->number_unsigned(number); + return get_number(input_format_t::msgpack, number) and sax->number_unsigned(number); } case 0xCE: // uint 32 { uint32_t number; - return get_number(number) and sax->number_unsigned(number); + return get_number(input_format_t::msgpack, number) and sax->number_unsigned(number); } case 0xCF: // uint 64 { uint64_t number; - return get_number(number) and sax->number_unsigned(number); + return get_number(input_format_t::msgpack, number) and sax->number_unsigned(number); } case 0xD0: // int 8 { int8_t number; - return get_number(number) and sax->number_integer(number); + return get_number(input_format_t::msgpack, number) and sax->number_integer(number); } case 0xD1: // int 16 { int16_t number; - return get_number(number) and sax->number_integer(number); + return get_number(input_format_t::msgpack, number) and sax->number_integer(number); } case 0xD2: // int 32 { int32_t number; - return get_number(number) and sax->number_integer(number); + return get_number(input_format_t::msgpack, number) and sax->number_integer(number); } case 0xD3: // int 64 { int64_t number; - return get_number(number) and sax->number_integer(number); + return get_number(input_format_t::msgpack, number) and sax->number_integer(number); } case 0xD9: // str 8 @@ -753,25 +1165,25 @@ class binary_reader case 0xDC: // array 16 { uint16_t len; - return get_number(len) and get_msgpack_array(static_cast<std::size_t>(len)); + return get_number(input_format_t::msgpack, len) and get_msgpack_array(static_cast<std::size_t>(len)); } case 0xDD: // array 32 { uint32_t len; - return get_number(len) and get_msgpack_array(static_cast<std::size_t>(len)); + return get_number(input_format_t::msgpack, len) and get_msgpack_array(static_cast<std::size_t>(len)); } case 0xDE: // map 16 { uint16_t len; - return get_number(len) and get_msgpack_object(static_cast<std::size_t>(len)); + return get_number(input_format_t::msgpack, len) and get_msgpack_object(static_cast<std::size_t>(len)); } case 0xDF: // map 32 { uint32_t len; - return get_number(len) and get_msgpack_object(static_cast<std::size_t>(len)); + return get_number(input_format_t::msgpack, len) and get_msgpack_object(static_cast<std::size_t>(len)); } // negative fixint @@ -812,302 +1224,12 @@ class binary_reader default: // anything else { auto last_token = get_token_string(); - return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read, "error reading MessagePack; last byte: 0x" + last_token)); + return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read, exception_message(input_format_t::msgpack, "invalid byte: 0x" + last_token, "value"))); } } } /*! - @param[in] get_char whether a new character should be retrieved from the - input (true, default) or whether the last read - character should be considered instead - - @return whether a valid UBJSON value was passed to the SAX parser - */ - bool parse_ubjson_internal(const bool get_char = true) - { - return get_ubjson_value(get_char ? get_ignore_noop() : current); - } - - /*! - @brief get next character from the input - - This function provides the interface to the used input adapter. It does - not throw in case the input reached EOF, but returns a -'ve valued - `std::char_traits<char>::eof()` in that case. - - @return character read from the input - */ - int get() - { - ++chars_read; - return (current = ia->get_character()); - } - - /*! - @return character read from the input after ignoring all 'N' entries - */ - int get_ignore_noop() - { - do - { - get(); - } - while (current == 'N'); - - return current; - } - - /* - @brief read a number from the input - - @tparam NumberType the type of the number - @param[out] result number of type @a NumberType - - @return whether conversion completed - - @note This function needs to respect the system's endianess, because - bytes in CBOR, MessagePack, and UBJSON are stored in network order - (big endian) and therefore need reordering on little endian systems. - */ - template<typename NumberType> - bool get_number(NumberType& result) - { - // step 1: read input into array with system's byte order - std::array<uint8_t, sizeof(NumberType)> vec; - for (std::size_t i = 0; i < sizeof(NumberType); ++i) - { - get(); - if (JSON_UNLIKELY(not unexpect_eof())) - { - return false; - } - - // reverse byte order prior to conversion if necessary - if (is_little_endian) - { - vec[sizeof(NumberType) - i - 1] = static_cast<uint8_t>(current); - } - else - { - vec[i] = static_cast<uint8_t>(current); // LCOV_EXCL_LINE - } - } - - // step 2: convert array into number of type T and return - std::memcpy(&result, vec.data(), sizeof(NumberType)); - return true; - } - - /*! - @brief create a string by reading characters from the input - - @tparam NumberType the type of the number - @param[in] len number of characters to read - @param[out] string created by reading @a len bytes - - @return whether string creation completed - - @note We can not reserve @a len bytes for the result, because @a len - may be too large. Usually, @ref unexpect_eof() detects the end of - the input before we run out of string memory. - */ - template<typename NumberType> - bool get_string(const NumberType len, string_t& result) - { - bool success = true; - std::generate_n(std::back_inserter(result), len, [this, &success]() - { - get(); - if (JSON_UNLIKELY(not unexpect_eof())) - { - success = false; - } - return static_cast<char>(current); - }); - return success; - } - - /*! - @brief reads a CBOR string - - This function first reads starting bytes to determine the expected - string length and then copies this number of bytes into a string. - Additionally, CBOR's strings with indefinite lengths are supported. - - @param[out] result created string - - @return whether string creation completed - */ - bool get_cbor_string(string_t& result) - { - if (JSON_UNLIKELY(not unexpect_eof())) - { - return false; - } - - switch (current) - { - // UTF-8 string (0x00..0x17 bytes follow) - case 0x60: - case 0x61: - case 0x62: - case 0x63: - case 0x64: - case 0x65: - case 0x66: - case 0x67: - case 0x68: - case 0x69: - case 0x6A: - case 0x6B: - case 0x6C: - case 0x6D: - case 0x6E: - case 0x6F: - case 0x70: - case 0x71: - case 0x72: - case 0x73: - case 0x74: - case 0x75: - case 0x76: - case 0x77: - { - return get_string(current & 0x1F, result); - } - - case 0x78: // UTF-8 string (one-byte uint8_t for n follows) - { - uint8_t len; - return get_number(len) and get_string(len, result); - } - - case 0x79: // UTF-8 string (two-byte uint16_t for n follow) - { - uint16_t len; - return get_number(len) and get_string(len, result); - } - - case 0x7A: // UTF-8 string (four-byte uint32_t for n follow) - { - uint32_t len; - return get_number(len) and get_string(len, result); - } - - case 0x7B: // UTF-8 string (eight-byte uint64_t for n follow) - { - uint64_t len; - return get_number(len) and get_string(len, result); - } - - case 0x7F: // UTF-8 string (indefinite length) - { - while (get() != 0xFF) - { - string_t chunk; - if (not get_cbor_string(chunk)) - { - return false; - } - result.append(chunk); - } - return true; - } - - default: - { - auto last_token = get_token_string(); - return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read, "expected a CBOR string; last byte: 0x" + last_token)); - } - } - } - - /*! - @param[in] len the length of the array or std::size_t(-1) for an - array of indefinite size - @return whether array creation completed - */ - bool get_cbor_array(const std::size_t len) - { - if (JSON_UNLIKELY(not sax->start_array(len))) - { - return false; - } - - if (len != std::size_t(-1)) - for (std::size_t i = 0; i < len; ++i) - { - if (JSON_UNLIKELY(not parse_cbor_internal())) - { - return false; - } - } - else - { - while (get() != 0xFF) - { - if (JSON_UNLIKELY(not parse_cbor_internal(false))) - { - return false; - } - } - } - - return sax->end_array(); - } - - /*! - @param[in] len the length of the object or std::size_t(-1) for an - object of indefinite size - @return whether object creation completed - */ - bool get_cbor_object(const std::size_t len) - { - if (not JSON_UNLIKELY(sax->start_object(len))) - { - return false; - } - - string_t key; - if (len != std::size_t(-1)) - { - for (std::size_t i = 0; i < len; ++i) - { - get(); - if (JSON_UNLIKELY(not get_cbor_string(key) or not sax->key(key))) - { - return false; - } - - if (JSON_UNLIKELY(not parse_cbor_internal())) - { - return false; - } - key.clear(); - } - } - else - { - while (get() != 0xFF) - { - if (JSON_UNLIKELY(not get_cbor_string(key) or not sax->key(key))) - { - return false; - } - - if (JSON_UNLIKELY(not parse_cbor_internal())) - { - return false; - } - key.clear(); - } - } - - return sax->end_object(); - } - - /*! @brief reads a MessagePack string This function first reads starting bytes to determine the expected @@ -1119,7 +1241,7 @@ class binary_reader */ bool get_msgpack_string(string_t& result) { - if (JSON_UNLIKELY(not unexpect_eof())) + if (JSON_UNLIKELY(not unexpect_eof(input_format_t::msgpack, "string"))) { return false; } @@ -1160,31 +1282,31 @@ class binary_reader case 0xBE: case 0xBF: { - return get_string(current & 0x1F, result); + return get_string(input_format_t::msgpack, current & 0x1F, result); } case 0xD9: // str 8 { uint8_t len; - return get_number(len) and get_string(len, result); + return get_number(input_format_t::msgpack, len) and get_string(input_format_t::msgpack, len, result); } case 0xDA: // str 16 { uint16_t len; - return get_number(len) and get_string(len, result); + return get_number(input_format_t::msgpack, len) and get_string(input_format_t::msgpack, len, result); } case 0xDB: // str 32 { uint32_t len; - return get_number(len) and get_string(len, result); + return get_number(input_format_t::msgpack, len) and get_string(input_format_t::msgpack, len, result); } default: { auto last_token = get_token_string(); - return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read, "expected a MessagePack string; last byte: 0x" + last_token)); + return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read, exception_message(input_format_t::msgpack, "expected length specification (0xA0-0xBF, 0xD9-0xDB); last byte: 0x" + last_token, "string"))); } } } @@ -1241,6 +1363,22 @@ class binary_reader return sax->end_object(); } + //////////// + // UBJSON // + //////////// + + /*! + @param[in] get_char whether a new character should be retrieved from the + input (true, default) or whether the last read + character should be considered instead + + @return whether a valid UBJSON value was passed to the SAX parser + */ + bool parse_ubjson_internal(const bool get_char = true) + { + return get_ubjson_value(get_char ? get_ignore_noop() : current); + } + /*! @brief reads a UBJSON string @@ -1262,7 +1400,7 @@ class binary_reader get(); // TODO: may we ignore N here? } - if (JSON_UNLIKELY(not unexpect_eof())) + if (JSON_UNLIKELY(not unexpect_eof(input_format_t::ubjson, "value"))) { return false; } @@ -1272,36 +1410,36 @@ class binary_reader case 'U': { uint8_t len; - return get_number(len) and get_string(len, result); + return get_number(input_format_t::ubjson, len) and get_string(input_format_t::ubjson, len, result); } case 'i': { int8_t len; - return get_number(len) and get_string(len, result); + return get_number(input_format_t::ubjson, len) and get_string(input_format_t::ubjson, len, result); } case 'I': { int16_t len; - return get_number(len) and get_string(len, result); + return get_number(input_format_t::ubjson, len) and get_string(input_format_t::ubjson, len, result); } case 'l': { int32_t len; - return get_number(len) and get_string(len, result); + return get_number(input_format_t::ubjson, len) and get_string(input_format_t::ubjson, len, result); } case 'L': { int64_t len; - return get_number(len) and get_string(len, result); + return get_number(input_format_t::ubjson, len) and get_string(input_format_t::ubjson, len, result); } default: auto last_token = get_token_string(); - return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read, "expected a UBJSON string; last byte: 0x" + last_token)); + return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read, exception_message(input_format_t::ubjson, "expected length type specification (U, i, I, l, L); last byte: 0x" + last_token, "string"))); } } @@ -1316,7 +1454,7 @@ class binary_reader case 'U': { uint8_t number; - if (JSON_UNLIKELY(not get_number(number))) + if (JSON_UNLIKELY(not get_number(input_format_t::ubjson, number))) { return false; } @@ -1327,7 +1465,7 @@ class binary_reader case 'i': { int8_t number; - if (JSON_UNLIKELY(not get_number(number))) + if (JSON_UNLIKELY(not get_number(input_format_t::ubjson, number))) { return false; } @@ -1338,7 +1476,7 @@ class binary_reader case 'I': { int16_t number; - if (JSON_UNLIKELY(not get_number(number))) + if (JSON_UNLIKELY(not get_number(input_format_t::ubjson, number))) { return false; } @@ -1349,7 +1487,7 @@ class binary_reader case 'l': { int32_t number; - if (JSON_UNLIKELY(not get_number(number))) + if (JSON_UNLIKELY(not get_number(input_format_t::ubjson, number))) { return false; } @@ -1360,7 +1498,7 @@ class binary_reader case 'L': { int64_t number; - if (JSON_UNLIKELY(not get_number(number))) + if (JSON_UNLIKELY(not get_number(input_format_t::ubjson, number))) { return false; } @@ -1371,7 +1509,7 @@ class binary_reader default: { auto last_token = get_token_string(); - return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read, "byte after '#' must denote a number type; last byte: 0x" + last_token)); + return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read, exception_message(input_format_t::ubjson, "expected length type specification (U, i, I, l, L) after '#'; last byte: 0x" + last_token, "size"))); } } } @@ -1396,7 +1534,7 @@ class binary_reader if (current == '$') { result.second = get(); // must not ignore 'N', because 'N' maybe the type - if (JSON_UNLIKELY(not unexpect_eof())) + if (JSON_UNLIKELY(not unexpect_eof(input_format_t::ubjson, "type"))) { return false; } @@ -1404,12 +1542,12 @@ class binary_reader get_ignore_noop(); if (JSON_UNLIKELY(current != '#')) { - if (JSON_UNLIKELY(not unexpect_eof())) + if (JSON_UNLIKELY(not unexpect_eof(input_format_t::ubjson, "value"))) { return false; } auto last_token = get_token_string(); - return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read, "expected '#' after UBJSON type information; last byte: 0x" + last_token)); + return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read, exception_message(input_format_t::ubjson, "expected '#' after type information; last byte: 0x" + last_token, "size"))); } return get_ubjson_size_value(result.first); @@ -1430,7 +1568,7 @@ class binary_reader switch (prefix) { case std::char_traits<char>::eof(): // EOF - return unexpect_eof(); + return unexpect_eof(input_format_t::ubjson, "value"); case 'T': // true return sax->boolean(true); @@ -1443,56 +1581,56 @@ class binary_reader case 'U': { uint8_t number; - return get_number(number) and sax->number_unsigned(number); + return get_number(input_format_t::ubjson, number) and sax->number_unsigned(number); } case 'i': { int8_t number; - return get_number(number) and sax->number_integer(number); + return get_number(input_format_t::ubjson, number) and sax->number_integer(number); } case 'I': { int16_t number; - return get_number(number) and sax->number_integer(number); + return get_number(input_format_t::ubjson, number) and sax->number_integer(number); } case 'l': { int32_t number; - return get_number(number) and sax->number_integer(number); + return get_number(input_format_t::ubjson, number) and sax->number_integer(number); } case 'L': { int64_t number; - return get_number(number) and sax->number_integer(number); + return get_number(input_format_t::ubjson, number) and sax->number_integer(number); } case 'd': { float number; - return get_number(number) and sax->number_float(static_cast<number_float_t>(number), ""); + return get_number(input_format_t::ubjson, number) and sax->number_float(static_cast<number_float_t>(number), ""); } case 'D': { double number; - return get_number(number) and sax->number_float(static_cast<number_float_t>(number), ""); + return get_number(input_format_t::ubjson, number) and sax->number_float(static_cast<number_float_t>(number), ""); } case 'C': // char { get(); - if (JSON_UNLIKELY(not unexpect_eof())) + if (JSON_UNLIKELY(not unexpect_eof(input_format_t::ubjson, "char"))) { return false; } if (JSON_UNLIKELY(current > 127)) { auto last_token = get_token_string(); - return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read, "byte after 'C' must be in range 0x00..0x7F; last byte: 0x" + last_token)); + return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read, exception_message(input_format_t::ubjson, "byte after 'C' must be in range 0x00..0x7F; last byte: 0x" + last_token, "char"))); } string_t s(1, static_cast<char>(current)); return sax->string(s); @@ -1513,7 +1651,7 @@ class binary_reader default: // anything else { auto last_token = get_token_string(); - return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read, "error reading UBJSON; last byte: 0x" + last_token)); + return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read, exception_message(input_format_t::ubjson, "invalid byte: 0x" + last_token, "value"))); } } } @@ -1655,14 +1793,124 @@ class binary_reader return sax->end_object(); } + /////////////////////// + // Utility functions // + /////////////////////// + + /*! + @brief get next character from the input + + This function provides the interface to the used input adapter. It does + not throw in case the input reached EOF, but returns a -'ve valued + `std::char_traits<char>::eof()` in that case. + + @return character read from the input + */ + int get() + { + ++chars_read; + return (current = ia->get_character()); + } + + /*! + @return character read from the input after ignoring all 'N' entries + */ + int get_ignore_noop() + { + do + { + get(); + } + while (current == 'N'); + + return current; + } + + /* + @brief read a number from the input + + @tparam NumberType the type of the number + @param[in] format the current format (for diagnostics) + @param[out] result number of type @a NumberType + + @return whether conversion completed + + @note This function needs to respect the system's endianess, because + bytes in CBOR, MessagePack, and UBJSON are stored in network order + (big endian) and therefore need reordering on little endian systems. + */ + template<typename NumberType, bool InputIsLittleEndian = false> + bool get_number(const input_format_t format, NumberType& result) + { + // step 1: read input into array with system's byte order + std::array<uint8_t, sizeof(NumberType)> vec; + for (std::size_t i = 0; i < sizeof(NumberType); ++i) + { + get(); + if (JSON_UNLIKELY(not unexpect_eof(format, "number"))) + { + return false; + } + + // reverse byte order prior to conversion if necessary + if (is_little_endian && !InputIsLittleEndian) + { + vec[sizeof(NumberType) - i - 1] = static_cast<uint8_t>(current); + } + else + { + vec[i] = static_cast<uint8_t>(current); // LCOV_EXCL_LINE + } + } + + // step 2: convert array into number of type T and return + std::memcpy(&result, vec.data(), sizeof(NumberType)); + return true; + } + + /*! + @brief create a string by reading characters from the input + + @tparam NumberType the type of the number + @param[in] format the current format (for diagnostics) + @param[in] len number of characters to read + @param[out] result string created by reading @a len bytes + + @return whether string creation completed + + @note We can not reserve @a len bytes for the result, because @a len + may be too large. Usually, @ref unexpect_eof() detects the end of + the input before we run out of string memory. + */ + template<typename NumberType> + bool get_string(const input_format_t format, + const NumberType len, + string_t& result) + { + bool success = true; + std::generate_n(std::back_inserter(result), len, [this, &success, &format]() + { + get(); + if (JSON_UNLIKELY(not unexpect_eof(format, "string"))) + { + success = false; + } + return static_cast<char>(current); + }); + return success; + } + /*! + @param[in] format the current format (for diagnostics) + @param[in] context further context information (for diagnostics) @return whether the last read character is not EOF */ - bool unexpect_eof() const + bool unexpect_eof(const input_format_t format, const char* context) const { if (JSON_UNLIKELY(current == std::char_traits<char>::eof())) { - return sax->parse_error(chars_read, "<end of file>", parse_error::create(110, chars_read, "unexpected end of input")); + return sax->parse_error(chars_read, "<end of file>", + parse_error::create(110, chars_read, exception_message(format, "unexpected end of input", context))); } return true; } @@ -1673,10 +1921,49 @@ class binary_reader std::string get_token_string() const { char cr[3]; - snprintf(cr, 3, "%.2hhX", static_cast<unsigned char>(current)); + (std::snprintf)(cr, 3, "%.2hhX", static_cast<unsigned char>(current)); return std::string{cr}; } + /*! + @param[in] format the current format + @param[in] detail a detailed error message + @param[in] context further contect information + @return a message string to use in the parse_error exceptions + */ + std::string exception_message(const input_format_t format, + const std::string& detail, + const std::string& context) const + { + std::string error_msg = "syntax error while parsing "; + + switch (format) + { + case input_format_t::cbor: + error_msg += "CBOR"; + break; + + case input_format_t::msgpack: + error_msg += "MessagePack"; + break; + + case input_format_t::ubjson: + error_msg += "UBJSON"; + break; + + case input_format_t::bson: + error_msg += "BSON"; + break; + + // LCOV_EXCL_START + default: + assert(false); + // LCOV_EXCL_STOP + } + + return error_msg + " " + context + ": " + detail; + } + private: /// input adapter input_adapter_t ia = nullptr; @@ -1693,5 +1980,5 @@ class binary_reader /// the SAX parser json_sax_t* sax = nullptr; }; -} -} +} // namespace detail +} // namespace nlohmann diff --git a/include/lib/modernjson/detail/input/input_adapters.hpp b/include/lib/modernjson/detail/input/input_adapters.hpp index 5abaee3..dfb8caf 100644 --- a/include/lib/modernjson/detail/input/input_adapters.hpp +++ b/include/lib/modernjson/detail/input/input_adapters.hpp @@ -18,7 +18,7 @@ namespace nlohmann namespace detail { /// the supported input formats -enum class input_format_t { json, cbor, msgpack, ubjson }; +enum class input_format_t { json, cbor, msgpack, ubjson, bson }; //////////////////// // input adapters // @@ -60,8 +60,8 @@ class input_stream_adapter : public input_adapter_protocol ~input_stream_adapter() override { // clear stream flags; we use underlying streambuf I/O, do not - // maintain ifstream flags - is.clear(); + // maintain ifstream flags, except eof + is.clear(is.rdstate() & std::ios::eofbit); } explicit input_stream_adapter(std::istream& i) @@ -71,13 +71,21 @@ class input_stream_adapter : public input_adapter_protocol // delete because of pointer members input_stream_adapter(const input_stream_adapter&) = delete; input_stream_adapter& operator=(input_stream_adapter&) = delete; + input_stream_adapter(input_stream_adapter&&) = delete; + input_stream_adapter& operator=(input_stream_adapter&&) = delete; // std::istream/std::streambuf use std::char_traits<char>::to_int_type, to // ensure that std::char_traits<char>::eof() and the character 0xFF do not // end up as the same value, eg. 0xFFFFFFFF. std::char_traits<char>::int_type get_character() override { - return sb.sbumpc(); + auto res = sb.sbumpc(); + // set eof manually, as we don't use the istream interface. + if (res == EOF) + { + is.clear(is.rdstate() | std::ios::eofbit); + } + return res; } private: @@ -90,13 +98,16 @@ class input_stream_adapter : public input_adapter_protocol class input_buffer_adapter : public input_adapter_protocol { public: - input_buffer_adapter(const char* b, const std::size_t l) + input_buffer_adapter(const char* b, const std::size_t l) noexcept : cursor(b), limit(b + l) {} // delete because of pointer members input_buffer_adapter(const input_buffer_adapter&) = delete; input_buffer_adapter& operator=(input_buffer_adapter&) = delete; + input_buffer_adapter(input_buffer_adapter&&) = delete; + input_buffer_adapter& operator=(input_buffer_adapter&&) = delete; + ~input_buffer_adapter() override = default; std::char_traits<char>::int_type get_character() noexcept override { @@ -115,38 +126,66 @@ class input_buffer_adapter : public input_adapter_protocol const char* const limit; }; -template<typename WideStringType> -class wide_string_input_adapter : public input_adapter_protocol +template<typename WideStringType, size_t T> +struct wide_string_input_helper { - public: - explicit wide_string_input_adapter(const WideStringType& w) : str(w) {} - - std::char_traits<char>::int_type get_character() noexcept override + // UTF-32 + static void fill_buffer(const WideStringType& str, size_t& current_wchar, std::array<std::char_traits<char>::int_type, 4>& utf8_bytes, size_t& utf8_bytes_index, size_t& utf8_bytes_filled) { - // check if buffer needs to be filled - if (utf8_bytes_index == utf8_bytes_filled) + utf8_bytes_index = 0; + + if (current_wchar == str.size()) + { + utf8_bytes[0] = std::char_traits<char>::eof(); + utf8_bytes_filled = 1; + } + else { - if (sizeof(typename WideStringType::value_type) == 2) + // get the current character + const auto wc = static_cast<int>(str[current_wchar++]); + + // UTF-32 to UTF-8 encoding + if (wc < 0x80) + { + utf8_bytes[0] = wc; + utf8_bytes_filled = 1; + } + else if (wc <= 0x7FF) + { + utf8_bytes[0] = 0xC0 | ((wc >> 6) & 0x1F); + utf8_bytes[1] = 0x80 | (wc & 0x3F); + utf8_bytes_filled = 2; + } + else if (wc <= 0xFFFF) { - fill_buffer_utf16(); + utf8_bytes[0] = 0xE0 | ((wc >> 12) & 0x0F); + utf8_bytes[1] = 0x80 | ((wc >> 6) & 0x3F); + utf8_bytes[2] = 0x80 | (wc & 0x3F); + utf8_bytes_filled = 3; + } + else if (wc <= 0x10FFFF) + { + utf8_bytes[0] = 0xF0 | ((wc >> 18) & 0x07); + utf8_bytes[1] = 0x80 | ((wc >> 12) & 0x3F); + utf8_bytes[2] = 0x80 | ((wc >> 6) & 0x3F); + utf8_bytes[3] = 0x80 | (wc & 0x3F); + utf8_bytes_filled = 4; } else { - fill_buffer_utf32(); + // unknown character + utf8_bytes[0] = wc; + utf8_bytes_filled = 1; } - - assert(utf8_bytes_filled > 0); - assert(utf8_bytes_index == 0); } - - // use buffer - assert(utf8_bytes_filled > 0); - assert(utf8_bytes_index < utf8_bytes_filled); - return utf8_bytes[utf8_bytes_index++]; } +}; - private: - void fill_buffer_utf16() +template<typename WideStringType> +struct wide_string_input_helper<WideStringType, 2> +{ + // UTF-16 + static void fill_buffer(const WideStringType& str, size_t& current_wchar, std::array<std::char_traits<char>::int_type, 4>& utf8_bytes, size_t& utf8_bytes_index, size_t& utf8_bytes_filled) { utf8_bytes_index = 0; @@ -158,7 +197,7 @@ class wide_string_input_adapter : public input_adapter_protocol else { // get the current character - const int wc = static_cast<int>(str[current_wchar++]); + const auto wc = static_cast<int>(str[current_wchar++]); // UTF-16 to UTF-8 encoding if (wc < 0x80) @@ -183,7 +222,7 @@ class wide_string_input_adapter : public input_adapter_protocol { if (current_wchar < str.size()) { - const int wc2 = static_cast<int>(str[current_wchar++]); + const auto wc2 = static_cast<int>(str[current_wchar++]); const int charcode = 0x10000 + (((wc & 0x3FF) << 10) | (wc2 & 0x3FF)); utf8_bytes[0] = 0xf0 | (charcode >> 18); utf8_bytes[1] = 0x80 | ((charcode >> 12) & 0x3F); @@ -201,58 +240,40 @@ class wide_string_input_adapter : public input_adapter_protocol } } } +}; - void fill_buffer_utf32() - { - utf8_bytes_index = 0; +template<typename WideStringType> +class wide_string_input_adapter : public input_adapter_protocol +{ + public: + explicit wide_string_input_adapter(const WideStringType& w) noexcept + : str(w) + {} - if (current_wchar == str.size()) - { - utf8_bytes[0] = std::char_traits<char>::eof(); - utf8_bytes_filled = 1; - } - else + std::char_traits<char>::int_type get_character() noexcept override + { + // check if buffer needs to be filled + if (utf8_bytes_index == utf8_bytes_filled) { - // get the current character - const int wc = static_cast<int>(str[current_wchar++]); + fill_buffer<sizeof(typename WideStringType::value_type)>(); - // UTF-32 to UTF-8 encoding - if (wc < 0x80) - { - utf8_bytes[0] = wc; - utf8_bytes_filled = 1; - } - else if (wc <= 0x7FF) - { - utf8_bytes[0] = 0xC0 | ((wc >> 6) & 0x1F); - utf8_bytes[1] = 0x80 | (wc & 0x3F); - utf8_bytes_filled = 2; - } - else if (wc <= 0xFFFF) - { - utf8_bytes[0] = 0xE0 | ((wc >> 12) & 0x0F); - utf8_bytes[1] = 0x80 | ((wc >> 6) & 0x3F); - utf8_bytes[2] = 0x80 | (wc & 0x3F); - utf8_bytes_filled = 3; - } - else if (wc <= 0x10FFFF) - { - utf8_bytes[0] = 0xF0 | ((wc >> 18 ) & 0x07); - utf8_bytes[1] = 0x80 | ((wc >> 12) & 0x3F); - utf8_bytes[2] = 0x80 | ((wc >> 6) & 0x3F); - utf8_bytes[3] = 0x80 | (wc & 0x3F); - utf8_bytes_filled = 4; - } - else - { - // unknown character - utf8_bytes[0] = wc; - utf8_bytes_filled = 1; - } + assert(utf8_bytes_filled > 0); + assert(utf8_bytes_index == 0); } + + // use buffer + assert(utf8_bytes_filled > 0); + assert(utf8_bytes_index < utf8_bytes_filled); + return utf8_bytes[utf8_bytes_index++]; } private: + template<size_t T> + void fill_buffer() + { + wide_string_input_helper<WideStringType, T>::fill_buffer(str, current_wchar, utf8_bytes, utf8_bytes_index, utf8_bytes_filled); + } + /// the wstring to process const WideStringType& str; @@ -373,5 +394,5 @@ class input_adapter /// the actual adapter input_adapter_t ia = nullptr; }; -} -} +} // namespace detail +} // namespace nlohmann diff --git a/include/lib/modernjson/detail/input/json_sax.hpp b/include/lib/modernjson/detail/input/json_sax.hpp index 968fcd0..e1d48a2 100644 --- a/include/lib/modernjson/detail/input/json_sax.hpp +++ b/include/lib/modernjson/detail/input/json_sax.hpp @@ -113,7 +113,7 @@ struct json_sax @brief a parse error occurred @param[in] position the position in the input where the error occurs @param[in] last_token the last read token - @param[in] error_msg a detailed error message + @param[in] ex an exception object describing the error @return whether parsing should proceed (must return false) */ virtual bool parse_error(std::size_t position, @@ -181,7 +181,7 @@ class json_sax_dom_parser return true; } - bool number_float(number_float_t val, const string_t&) + bool number_float(number_float_t val, const string_t& /*unused*/) { handle_value(val); return true; @@ -238,7 +238,7 @@ class json_sax_dom_parser return true; } - bool parse_error(std::size_t, const std::string&, + bool parse_error(std::size_t /*unused*/, const std::string& /*unused*/, const detail::exception& ex) { errored = true; @@ -286,20 +286,19 @@ class json_sax_dom_parser root = BasicJsonType(std::forward<Value>(v)); return &root; } + + assert(ref_stack.back()->is_array() or ref_stack.back()->is_object()); + + if (ref_stack.back()->is_array()) + { + ref_stack.back()->m_value.array->emplace_back(std::forward<Value>(v)); + return &(ref_stack.back()->m_value.array->back()); + } else { - assert(ref_stack.back()->is_array() or ref_stack.back()->is_object()); - if (ref_stack.back()->is_array()) - { - ref_stack.back()->m_value.array->emplace_back(std::forward<Value>(v)); - return &(ref_stack.back()->m_value.array->back()); - } - else - { - assert(object_element); - *object_element = BasicJsonType(std::forward<Value>(v)); - return object_element; - } + assert(object_element); + *object_element = BasicJsonType(std::forward<Value>(v)); + return object_element; } } @@ -358,7 +357,7 @@ class json_sax_dom_callback_parser return true; } - bool number_float(number_float_t val, const string_t&) + bool number_float(number_float_t val, const string_t& /*unused*/) { handle_value(val); return true; @@ -496,7 +495,7 @@ class json_sax_dom_callback_parser return true; } - bool parse_error(std::size_t, const std::string&, + bool parse_error(std::size_t /*unused*/, const std::string& /*unused*/, const detail::exception& ex) { errored = true; @@ -574,37 +573,37 @@ class json_sax_dom_callback_parser root = std::move(value); return {true, &root}; } + + // skip this value if we already decided to skip the parent + // (https://github.com/nlohmann/json/issues/971#issuecomment-413678360) + if (not ref_stack.back()) + { + return {false, nullptr}; + } + + // we now only expect arrays and objects + assert(ref_stack.back()->is_array() or ref_stack.back()->is_object()); + + if (ref_stack.back()->is_array()) + { + ref_stack.back()->m_value.array->push_back(std::move(value)); + return {true, &(ref_stack.back()->m_value.array->back())}; + } else { - // skip this value if we already decided to skip the parent - // (https://github.com/nlohmann/json/issues/971#issuecomment-413678360) - if (not ref_stack.back()) - { - return {false, nullptr}; - } + // check if we should store an element for the current key + assert(not key_keep_stack.empty()); + const bool store_element = key_keep_stack.back(); + key_keep_stack.pop_back(); - assert(ref_stack.back()->is_array() or ref_stack.back()->is_object()); - if (ref_stack.back()->is_array()) + if (not store_element) { - ref_stack.back()->m_value.array->push_back(std::move(value)); - return {true, &(ref_stack.back()->m_value.array->back())}; + return {false, nullptr}; } - else - { - // check if we should store an element for the current key - assert(not key_keep_stack.empty()); - const bool store_element = key_keep_stack.back(); - key_keep_stack.pop_back(); - - if (not store_element) - { - return {false, nullptr}; - } - assert(object_element); - *object_element = std::move(value); - return {true, object_element}; - } + assert(object_element); + *object_element = std::move(value); + return {true, object_element}; } } @@ -642,37 +641,37 @@ class json_sax_acceptor return true; } - bool boolean(bool) + bool boolean(bool /*unused*/) { return true; } - bool number_integer(number_integer_t) + bool number_integer(number_integer_t /*unused*/) { return true; } - bool number_unsigned(number_unsigned_t) + bool number_unsigned(number_unsigned_t /*unused*/) { return true; } - bool number_float(number_float_t, const string_t&) + bool number_float(number_float_t /*unused*/, const string_t& /*unused*/) { return true; } - bool string(string_t&) + bool string(string_t& /*unused*/) { return true; } - bool start_object(std::size_t = std::size_t(-1)) + bool start_object(std::size_t /*unused*/ = std::size_t(-1)) { return true; } - bool key(string_t&) + bool key(string_t& /*unused*/) { return true; } @@ -682,7 +681,7 @@ class json_sax_acceptor return true; } - bool start_array(std::size_t = std::size_t(-1)) + bool start_array(std::size_t /*unused*/ = std::size_t(-1)) { return true; } @@ -692,11 +691,11 @@ class json_sax_acceptor return true; } - bool parse_error(std::size_t, const std::string&, const detail::exception&) + bool parse_error(std::size_t /*unused*/, const std::string& /*unused*/, const detail::exception& /*unused*/) { return false; } }; -} +} // namespace detail -} +} // namespace nlohmann diff --git a/include/lib/modernjson/detail/input/lexer.hpp b/include/lib/modernjson/detail/input/lexer.hpp index 9606211..b61e289 100644 --- a/include/lib/modernjson/detail/input/lexer.hpp +++ b/include/lib/modernjson/detail/input/lexer.hpp @@ -10,6 +10,7 @@ #include <lib/modernjson/detail/macro_scope.hpp> #include <lib/modernjson/detail/input/input_adapters.hpp> +#include <lib/modernjson/detail/input/position_t.hpp> namespace nlohmann { @@ -104,7 +105,10 @@ class lexer // delete because of pointer members lexer(const lexer&) = delete; + lexer(lexer&&) = delete; lexer& operator=(lexer&) = delete; + lexer& operator=(lexer&&) = delete; + ~lexer() = default; private: ///////////////////// @@ -393,39 +397,194 @@ class lexer // invalid control characters case 0x00: + { + error_message = "invalid string: control character U+0000 (NUL) must be escaped to \\u0000"; + return token_type::parse_error; + } + case 0x01: + { + error_message = "invalid string: control character U+0001 (SOH) must be escaped to \\u0001"; + return token_type::parse_error; + } + case 0x02: + { + error_message = "invalid string: control character U+0002 (STX) must be escaped to \\u0002"; + return token_type::parse_error; + } + case 0x03: + { + error_message = "invalid string: control character U+0003 (ETX) must be escaped to \\u0003"; + return token_type::parse_error; + } + case 0x04: + { + error_message = "invalid string: control character U+0004 (EOT) must be escaped to \\u0004"; + return token_type::parse_error; + } + case 0x05: + { + error_message = "invalid string: control character U+0005 (ENQ) must be escaped to \\u0005"; + return token_type::parse_error; + } + case 0x06: + { + error_message = "invalid string: control character U+0006 (ACK) must be escaped to \\u0006"; + return token_type::parse_error; + } + case 0x07: + { + error_message = "invalid string: control character U+0007 (BEL) must be escaped to \\u0007"; + return token_type::parse_error; + } + case 0x08: + { + error_message = "invalid string: control character U+0008 (BS) must be escaped to \\u0008 or \\b"; + return token_type::parse_error; + } + case 0x09: + { + error_message = "invalid string: control character U+0009 (HT) must be escaped to \\u0009 or \\t"; + return token_type::parse_error; + } + case 0x0A: + { + error_message = "invalid string: control character U+000A (LF) must be escaped to \\u000A or \\n"; + return token_type::parse_error; + } + case 0x0B: + { + error_message = "invalid string: control character U+000B (VT) must be escaped to \\u000B"; + return token_type::parse_error; + } + case 0x0C: + { + error_message = "invalid string: control character U+000C (FF) must be escaped to \\u000C or \\f"; + return token_type::parse_error; + } + case 0x0D: + { + error_message = "invalid string: control character U+000D (CR) must be escaped to \\u000D or \\r"; + return token_type::parse_error; + } + case 0x0E: + { + error_message = "invalid string: control character U+000E (SO) must be escaped to \\u000E"; + return token_type::parse_error; + } + case 0x0F: + { + error_message = "invalid string: control character U+000F (SI) must be escaped to \\u000F"; + return token_type::parse_error; + } + case 0x10: + { + error_message = "invalid string: control character U+0010 (DLE) must be escaped to \\u0010"; + return token_type::parse_error; + } + case 0x11: + { + error_message = "invalid string: control character U+0011 (DC1) must be escaped to \\u0011"; + return token_type::parse_error; + } + case 0x12: + { + error_message = "invalid string: control character U+0012 (DC2) must be escaped to \\u0012"; + return token_type::parse_error; + } + case 0x13: + { + error_message = "invalid string: control character U+0013 (DC3) must be escaped to \\u0013"; + return token_type::parse_error; + } + case 0x14: + { + error_message = "invalid string: control character U+0014 (DC4) must be escaped to \\u0014"; + return token_type::parse_error; + } + case 0x15: + { + error_message = "invalid string: control character U+0015 (NAK) must be escaped to \\u0015"; + return token_type::parse_error; + } + case 0x16: + { + error_message = "invalid string: control character U+0016 (SYN) must be escaped to \\u0016"; + return token_type::parse_error; + } + case 0x17: + { + error_message = "invalid string: control character U+0017 (ETB) must be escaped to \\u0017"; + return token_type::parse_error; + } + case 0x18: + { + error_message = "invalid string: control character U+0018 (CAN) must be escaped to \\u0018"; + return token_type::parse_error; + } + case 0x19: + { + error_message = "invalid string: control character U+0019 (EM) must be escaped to \\u0019"; + return token_type::parse_error; + } + case 0x1A: + { + error_message = "invalid string: control character U+001A (SUB) must be escaped to \\u001A"; + return token_type::parse_error; + } + case 0x1B: + { + error_message = "invalid string: control character U+001B (ESC) must be escaped to \\u001B"; + return token_type::parse_error; + } + case 0x1C: + { + error_message = "invalid string: control character U+001C (FS) must be escaped to \\u001C"; + return token_type::parse_error; + } + case 0x1D: + { + error_message = "invalid string: control character U+001D (GS) must be escaped to \\u001D"; + return token_type::parse_error; + } + case 0x1E: + { + error_message = "invalid string: control character U+001E (RS) must be escaped to \\u001E"; + return token_type::parse_error; + } + case 0x1F: { - error_message = "invalid string: control character must be escaped"; + error_message = "invalid string: control character U+001F (US) must be escaped to \\u001F"; return token_type::parse_error; } @@ -709,7 +868,7 @@ class lexer locale's decimal point is used instead of `.` to work with the locale-dependent converters. */ - token_type scan_number() + token_type scan_number() // lgtm [cpp/use-of-goto] { // reset token_buffer to store the number's bytes reset(); @@ -1082,7 +1241,9 @@ scan_number_done: */ std::char_traits<char>::int_type get() { - ++chars_read; + ++position.chars_read_total; + ++position.chars_read_current_line; + if (next_unget) { // just reset the next_unget variable and work with current @@ -1097,6 +1258,13 @@ scan_number_done: { token_string.push_back(std::char_traits<char>::to_char_type(current)); } + + if (current == '\n') + { + ++position.lines_read; + ++position.chars_read_current_line = 0; + } + return current; } @@ -1104,14 +1272,29 @@ scan_number_done: @brief unget current character (read it again on next get) We implement unget by setting variable next_unget to true. The input is not - changed - we just simulate ungetting by modifying chars_read and - token_string. The next call to get() will behave as if the unget character - is read again. + changed - we just simulate ungetting by modifying chars_read_total, + chars_read_current_line, and token_string. The next call to get() will + behave as if the unget character is read again. */ void unget() { next_unget = true; - --chars_read; + + --position.chars_read_total; + + // in case we "unget" a newline, we have to also decrement the lines_read + if (position.chars_read_current_line == 0) + { + if (position.lines_read > 0) + { + --position.lines_read; + } + } + else + { + --position.chars_read_current_line; + } + if (JSON_LIKELY(current != std::char_traits<char>::eof())) { assert(token_string.size() != 0); @@ -1159,9 +1342,9 @@ scan_number_done: ///////////////////// /// return position of last read token - constexpr std::size_t get_position() const noexcept + constexpr position_t get_position() const noexcept { - return chars_read; + return position; } /// return the last read token (for errors only). Will never contain EOF @@ -1177,7 +1360,7 @@ scan_number_done: { // escape control characters char cs[9]; - snprintf(cs, 9, "<U+%.4X>", static_cast<unsigned char>(c)); + (std::snprintf)(cs, 9, "<U+%.4X>", static_cast<unsigned char>(c)); result += cs; } else @@ -1208,30 +1391,20 @@ scan_number_done: { if (get() == 0xEF) { - if (get() == 0xBB and get() == 0xBF) - { - // we completely parsed the BOM - return true; - } - else - { - // after reading 0xEF, an unexpected character followed - return false; - } - } - else - { - // the first character is not the beginning of the BOM; unget it to - // process is later - unget(); - return true; + // check if we completely parse the BOM + return get() == 0xBB and get() == 0xBF; } + + // the first character is not the beginning of the BOM; unget it to + // process is later + unget(); + return true; } token_type scan() { // initially, skip the BOM - if (chars_read == 0 and not skip_bom()) + if (position.chars_read_total == 0 and not skip_bom()) { error_message = "invalid BOM; must be 0xEF 0xBB 0xBF if given"; return token_type::parse_error; @@ -1309,8 +1482,8 @@ scan_number_done: /// whether the next get() call should just return current bool next_unget = false; - /// the number of characters read - std::size_t chars_read = 0; + /// the start position of the current token + position_t position; /// raw input token string (for error messages) std::vector<char> token_string {}; @@ -1329,5 +1502,5 @@ scan_number_done: /// the decimal point const char decimal_point_char = '.'; }; -} -} +} // namespace detail +} // namespace nlohmann diff --git a/include/lib/modernjson/detail/input/parser.hpp b/include/lib/modernjson/detail/input/parser.hpp index a622077..d205bbb 100644 --- a/include/lib/modernjson/detail/input/parser.hpp +++ b/include/lib/modernjson/detail/input/parser.hpp @@ -91,7 +91,8 @@ class parser { sdp.parse_error(m_lexer.get_position(), m_lexer.get_token_string(), - parse_error::create(101, m_lexer.get_position(), exception_message(token_type::end_of_input))); + parse_error::create(101, m_lexer.get_position(), + exception_message(token_type::end_of_input, "value"))); } // in case of an error, return discarded value @@ -119,7 +120,8 @@ class parser { sdp.parse_error(m_lexer.get_position(), m_lexer.get_token_string(), - parse_error::create(101, m_lexer.get_position(), exception_message(token_type::end_of_input))); + parse_error::create(101, m_lexer.get_position(), + exception_message(token_type::end_of_input, "value"))); } // in case of an error, return discarded value @@ -154,7 +156,8 @@ class parser { return sax->parse_error(m_lexer.get_position(), m_lexer.get_token_string(), - parse_error::create(101, m_lexer.get_position(), exception_message(token_type::end_of_input))); + parse_error::create(101, m_lexer.get_position(), + exception_message(token_type::end_of_input, "value"))); } return result; @@ -164,7 +167,7 @@ class parser template <typename SAX> bool sax_parse_internal(SAX* sax) { - // stack to remember the hieararchy of structured values we are parsing + // stack to remember the hierarchy of structured values we are parsing // true = array; false = object std::vector<bool> states; // value to avoid a goto (see comment where set to true) @@ -199,14 +202,12 @@ class parser { return sax->parse_error(m_lexer.get_position(), m_lexer.get_token_string(), - parse_error::create(101, m_lexer.get_position(), exception_message(token_type::value_string))); + parse_error::create(101, m_lexer.get_position(), + exception_message(token_type::value_string, "object key"))); } - else + if (JSON_UNLIKELY(not sax->key(m_lexer.get_string()))) { - if (JSON_UNLIKELY(not sax->key(m_lexer.get_string()))) - { - return false; - } + return false; } // parse separator (:) @@ -214,7 +215,8 @@ class parser { return sax->parse_error(m_lexer.get_position(), m_lexer.get_token_string(), - parse_error::create(101, m_lexer.get_position(), exception_message(token_type::name_separator))); + parse_error::create(101, m_lexer.get_position(), + exception_message(token_type::name_separator, "object separator"))); } // remember we are now inside an object @@ -328,14 +330,16 @@ class parser // using "uninitialized" to avoid "expected" message return sax->parse_error(m_lexer.get_position(), m_lexer.get_token_string(), - parse_error::create(101, m_lexer.get_position(), exception_message(token_type::uninitialized))); + parse_error::create(101, m_lexer.get_position(), + exception_message(token_type::uninitialized, "value"))); } default: // the last token was unexpected { return sax->parse_error(m_lexer.get_position(), m_lexer.get_token_string(), - parse_error::create(101, m_lexer.get_position(), exception_message(token_type::literal_or_value))); + parse_error::create(101, m_lexer.get_position(), + exception_message(token_type::literal_or_value, "value"))); } } } @@ -347,7 +351,7 @@ class parser // we reached this line after we successfully parsed a value if (states.empty()) { - // empty stack: we reached the end of the hieararchy: done + // empty stack: we reached the end of the hierarchy: done return true; } else @@ -383,7 +387,8 @@ class parser { return sax->parse_error(m_lexer.get_position(), m_lexer.get_token_string(), - parse_error::create(101, m_lexer.get_position(), exception_message(token_type::end_array))); + parse_error::create(101, m_lexer.get_position(), + exception_message(token_type::end_array, "array"))); } } else // object @@ -396,7 +401,8 @@ class parser { return sax->parse_error(m_lexer.get_position(), m_lexer.get_token_string(), - parse_error::create(101, m_lexer.get_position(), exception_message(token_type::value_string))); + parse_error::create(101, m_lexer.get_position(), + exception_message(token_type::value_string, "object key"))); } else { @@ -411,7 +417,8 @@ class parser { return sax->parse_error(m_lexer.get_position(), m_lexer.get_token_string(), - parse_error::create(101, m_lexer.get_position(), exception_message(token_type::name_separator))); + parse_error::create(101, m_lexer.get_position(), + exception_message(token_type::name_separator, "object separator"))); } // parse values @@ -440,7 +447,8 @@ class parser { return sax->parse_error(m_lexer.get_position(), m_lexer.get_token_string(), - parse_error::create(101, m_lexer.get_position(), exception_message(token_type::end_object))); + parse_error::create(101, m_lexer.get_position(), + exception_message(token_type::end_object, "object"))); } } } @@ -453,9 +461,17 @@ class parser return (last_token = m_lexer.scan()); } - std::string exception_message(const token_type expected) + std::string exception_message(const token_type expected, const std::string& context) { - std::string error_msg = "syntax error - "; + std::string error_msg = "syntax error "; + + if (not context.empty()) + { + error_msg += "while parsing " + context + " "; + } + + error_msg += "- "; + if (last_token == token_type::parse_error) { error_msg += std::string(m_lexer.get_error_message()) + "; last read: '" + @@ -484,5 +500,5 @@ class parser /// whether to throw exceptions in case of errors const bool allow_exceptions = true; }; -} -} +} // namespace detail +} // namespace nlohmann diff --git a/include/lib/modernjson/detail/input/position_t.hpp b/include/lib/modernjson/detail/input/position_t.hpp new file mode 100644 index 0000000..37f4ab1 --- /dev/null +++ b/include/lib/modernjson/detail/input/position_t.hpp @@ -0,0 +1,27 @@ +#pragma once + +#include <cstddef> // size_t + +namespace nlohmann +{ +namespace detail +{ +/// struct to capture the start position of the current token +struct position_t +{ + /// the total number of characters read + std::size_t chars_read_total = 0; + /// the number of characters read in the current line + std::size_t chars_read_current_line = 0; + /// the number of lines read + std::size_t lines_read = 0; + + /// conversion to size_t to preserve SAX interface + constexpr operator size_t() const + { + return chars_read_total; + } +}; + +} +} |