From 0671e92ced71187690c81db645279d39ecf92e16 Mon Sep 17 00:00:00 2001 From: Niels Lohmann Date: Tue, 16 Oct 2018 20:38:50 +0200 Subject: [PATCH] :construction: proposal for different error handlers #1198 Proof of concept; currently only as parameter to the internal dump_escaped function; that is, not yet exposed to the dump function. --- include/nlohmann/detail/output/serializer.hpp | 67 +++++++++++++++++-- single_include/nlohmann/json.hpp | 67 +++++++++++++++++-- 2 files changed, 120 insertions(+), 14 deletions(-) diff --git a/include/nlohmann/detail/output/serializer.hpp b/include/nlohmann/detail/output/serializer.hpp index bb74a86e6..7adf0c2f7 100644 --- a/include/nlohmann/detail/output/serializer.hpp +++ b/include/nlohmann/detail/output/serializer.hpp @@ -39,6 +39,14 @@ class serializer static constexpr uint8_t UTF8_REJECT = 1; public: + /// how to treat decoding errors + enum class error_handler_t + { + strict, ///< throw a type_error exception in case of invalid UTF-8 + replace, ///< replace invalid UTF-8 sequences with U+FFFD + ignore ///< ignore invalid UTF-8 sequences + }; + /*! @param[in] s output stream to serialize to @param[in] ichar indentation character to use @@ -278,10 +286,12 @@ class serializer @param[in] s the string to escape @param[in] ensure_ascii whether to escape non-ASCII characters with \uXXXX sequences + @param[in] error_handler how to react on decoding errors @complexity Linear in the length of string @a s. 
*/ - void dump_escaped(const string_t& s, const bool ensure_ascii) + void dump_escaped(const string_t& s, const bool ensure_ascii, + const error_handler_t error_handler = error_handler_t::strict) { uint32_t codepoint; uint8_t state = UTF8_ACCEPT; @@ -389,9 +399,33 @@ class serializer case UTF8_REJECT: // decode found invalid UTF-8 byte { - std::string sn(3, '\0'); - snprintf(&sn[0], sn.size(), "%.2X", byte); - JSON_THROW(type_error::create(316, "invalid UTF-8 byte at index " + std::to_string(i) + ": 0x" + sn)); + switch (error_handler) + { + case error_handler_t::strict: + { + std::string sn(3, '\0'); + snprintf(&sn[0], sn.size(), "%.2X", byte); + JSON_THROW(type_error::create(316, "invalid UTF-8 byte at index " + std::to_string(i) + ": 0x" + sn)); + } + + case error_handler_t::ignore: + { + state = UTF8_ACCEPT; + continue; + } + + case error_handler_t::replace: + { + string_buffer[bytes++] = '\\'; + string_buffer[bytes++] = 'u'; + string_buffer[bytes++] = 'f'; + string_buffer[bytes++] = 'f'; + string_buffer[bytes++] = 'f'; + string_buffer[bytes++] = 'd'; + state = UTF8_ACCEPT; + continue; + } + } } default: // decode found yet incomplete multi-byte code point @@ -417,9 +451,28 @@ class serializer else { // we finish reading, but do not accept: string was incomplete - std::string sn(3, '\0'); - snprintf(&sn[0], sn.size(), "%.2X", static_cast<uint8_t>(s.back())); - JSON_THROW(type_error::create(316, "incomplete UTF-8 string; last byte: 0x" + sn)); + switch (error_handler) + { + case error_handler_t::strict: + { + std::string sn(3, '\0'); + snprintf(&sn[0], sn.size(), "%.2X", static_cast<uint8_t>(s.back())); + JSON_THROW(type_error::create(316, "incomplete UTF-8 string; last byte: 0x" + sn)); + } + + case error_handler_t::ignore: + { + break; + } + + case error_handler_t::replace: + { + // write buffer, but replace last byte + o->write_characters(string_buffer.data(), bytes - 1); + o->write_characters("\\ufffd", 6); + break; + } + } } } diff --git a/single_include/nlohmann/json.hpp 
b/single_include/nlohmann/json.hpp index dc206d301..c4681d7b8 100644 --- a/single_include/nlohmann/json.hpp +++ b/single_include/nlohmann/json.hpp @@ -9991,6 +9991,14 @@ class serializer static constexpr uint8_t UTF8_REJECT = 1; public: + /// how to treat decoding errors + enum class error_handler_t + { + strict, ///< throw a type_error exception in case of invalid UTF-8 + replace, ///< replace invalid UTF-8 sequences with U+FFFD + ignore ///< ignore invalid UTF-8 sequences + }; + /*! @param[in] s output stream to serialize to @param[in] ichar indentation character to use @@ -10230,10 +10238,12 @@ class serializer @param[in] s the string to escape @param[in] ensure_ascii whether to escape non-ASCII characters with \uXXXX sequences + @param[in] error_handler how to react on decoding errors @complexity Linear in the length of string @a s. */ - void dump_escaped(const string_t& s, const bool ensure_ascii) + void dump_escaped(const string_t& s, const bool ensure_ascii, + const error_handler_t error_handler = error_handler_t::strict) { uint32_t codepoint; uint8_t state = UTF8_ACCEPT; @@ -10341,9 +10351,33 @@ class serializer case UTF8_REJECT: // decode found invalid UTF-8 byte { - std::string sn(3, '\0'); - snprintf(&sn[0], sn.size(), "%.2X", byte); - JSON_THROW(type_error::create(316, "invalid UTF-8 byte at index " + std::to_string(i) + ": 0x" + sn)); + switch (error_handler) + { + case error_handler_t::strict: + { + std::string sn(3, '\0'); + snprintf(&sn[0], sn.size(), "%.2X", byte); + JSON_THROW(type_error::create(316, "invalid UTF-8 byte at index " + std::to_string(i) + ": 0x" + sn)); + } + + case error_handler_t::ignore: + { + state = UTF8_ACCEPT; + continue; + } + + case error_handler_t::replace: + { + string_buffer[bytes++] = '\\'; + string_buffer[bytes++] = 'u'; + string_buffer[bytes++] = 'f'; + string_buffer[bytes++] = 'f'; + string_buffer[bytes++] = 'f'; + string_buffer[bytes++] = 'd'; + state = UTF8_ACCEPT; + continue; + } + } } default: // decode found yet 
incomplete multi-byte code point @@ -10369,9 +10403,28 @@ class serializer else { // we finish reading, but do not accept: string was incomplete - std::string sn(3, '\0'); - snprintf(&sn[0], sn.size(), "%.2X", static_cast<uint8_t>(s.back())); - JSON_THROW(type_error::create(316, "incomplete UTF-8 string; last byte: 0x" + sn)); + switch (error_handler) + { + case error_handler_t::strict: + { + std::string sn(3, '\0'); + snprintf(&sn[0], sn.size(), "%.2X", static_cast<uint8_t>(s.back())); + JSON_THROW(type_error::create(316, "incomplete UTF-8 string; last byte: 0x" + sn)); + } + + case error_handler_t::ignore: + { + break; + } + + case error_handler_t::replace: + { + // write buffer, but replace last byte + o->write_characters(string_buffer.data(), bytes - 1); + o->write_characters("\\ufffd", 6); + break; + } + } } }