Quick and dirty implementation for basic multilingual plane in the unicode escape mechanism

2024-12-26 18:41:03 +00:00 · 2015-01-10 10:36:30 +01:00 · 2015-01-10 10:36:30 +01:00 · 222aacc213
commit 222aacc213
parent 13efc7a02a
3 changed files with 81 additions and 0 deletions
--- a/src/json.cc
+++ b/src/json.cc
@ -2073,6 +2073,9 @@ std::string json::parser::parseString()
                    result += '\n';
                } else if (currentChar == 'r') {
                    result += '\r';
                } else if (currentChar == 'u') {
                    pos_++;
                    result += parseUnicodeEscape();
                } else {
                    error("expected one of \\,/,b,f,n,r,t behind backslash.");
                }
@ -2118,6 +2121,76 @@ std::string json::parser::parseString()
    error("expected '\"'");
 }
 std::string json::parser::unicodeToUTF8(unsigned int codepoint) {
    // it's just a ASCII compatible codepoint,
    // so we just interpret the point as a character
    if (codepoint <= 0x7f) {
        return std::string(1, static_cast<char>(codepoint));
    }
    else if (codepoint <= 0x7ff)
    {
        std::string result(2, static_cast<char>(0xc0 | ((codepoint >> 6) & 0x1f)));
        result[1] = static_cast<char>(0x80 | (codepoint & 0x3f));
        return result;
    }
    else if (codepoint <= 0xffff)
    {
        std::string result(3, static_cast<char>(0xe0 | ((codepoint >> 12) & 0x0f)));
        result[1] = static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f));
        result[2] = static_cast<char>(0x80 | (codepoint & 0x3f));
        return result;
    }
    else if (codepoint <= 0x1fffff)
    {
        std::string result(4, static_cast<char>(0xf0 | ((codepoint >> 18) & 0x07)));
        result[1] = static_cast<char>(0x80 | ((codepoint >> 12) & 0x3f));
        result[2] = static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f));
        result[3] = static_cast<char>(0x80 | (codepoint & 0x3f));
        return result;
    } else {
        std::string errorMessage = "Invalid codepoint: ";
        errorMessage += codepoint;
        error(errorMessage);
    }
 }
 /*!
 Parses the four hexadecimal digits of a JSON unicode escape sequence (\uXXXX).
@return the UTF-8 encoding of the escaped code point
@pre  The \p "\u" prefix was consumed by @ref parseString; pos_ is the position
      of the first hexadecimal digit.
@post pos_ is the position of the last hexadecimal digit of the escape
      sequence.
 */
 std::string json::parser::parseUnicodeEscape() {
    // all four hex digits have to lie inside the buffer
    if (pos_ + 3 >= buffer_.size()) {
        error("Got end of input while parsing unicode escape sequence \\uXXXX");
    }

    const auto firstDigit = pos_;
    std::string digits(4, ' ');

    while (pos_ < firstDigit + 4) {
        const char c = buffer_[pos_];
        const bool isDecimal  = (c >= '0' && c <= '9');
        const bool isLowerHex = (c >= 'a' && c <= 'f');
        const bool isUpperHex = (c >= 'A' && c <= 'F');

        // anything outside [0-9a-fA-F] aborts the parse (error does not return)
        if (!(isDecimal || isLowerHex || isUpperHex)) {
            error("Found non-hexadecimal character in unicode escape sequence!");
        }

        digits[pos_ - firstDigit] = c;
        pos_++;
    }

    // leave pos_ on the last hex digit instead of one past it
    pos_--;

    // cast is safe as 4 hex characters can't represent more than 16 bits
    const unsigned long value = std::stoul(digits, nullptr, 16);
    return unicodeToUTF8(static_cast<unsigned int>(value));
 }
 /*!
 This function is called in case a \p "t" is read in the main parse function
@ref parse. In the standard, the \p "true" token is the only candidate, so the
--- a/src/json.h
+++ b/src/json.h
@ -418,6 +418,10 @@ class json
        inline void error(const std::string&) __attribute__((noreturn));
        /// parse a quoted string
        inline std::string parseString();
        /// transforms a unicode codepoint to its UTF-8 representation
        inline std::string unicodeToUTF8(unsigned int codepoint);
        /// parses a unicode escape sequence
        inline std::string parseUnicodeEscape();
        /// parse a Boolean "true"
        inline void parseTrue();
        /// parse a Boolean "false"
--- a/test/json_unit.cc
+++ b/test/json_unit.cc
@ -1652,6 +1652,10 @@ TEST_CASE("Parser")
        CHECK(json::parse("\"a\\nz\"") == json("a\nz"));
        CHECK(json::parse("\"\\n\"") == json("\n"));
        // escape unicode characters
        CHECK(json::parse("\"\\u002F\"") == json("/"));
        CHECK(json::parse("\"\\u00E4\"") == json(u8"\u00E4"));
        // escaping senseless stuff
        CHECK_THROWS_AS(json::parse("\"\\z\""), std::invalid_argument);
        CHECK_THROWS_AS(json::parse("\"\\ \""), std::invalid_argument);