From 5a54e46709122427fae296abc04cdab8dacbfc6d Mon Sep 17 00:00:00 2001 From: Raphael Isemann Date: Sat, 10 Jan 2015 16:49:10 +0100 Subject: [PATCH] Fully implemented the JSON spec --- src/json.cc | 246 +++++++++++++++++++++++++++++++++++----------- src/json.h | 6 +- test/json_unit.cc | 42 +++++++- 3 files changed, 232 insertions(+), 62 deletions(-) diff --git a/src/json.cc b/src/json.cc index 982251677..b664ef157 100644 --- a/src/json.cc +++ b/src/json.cc @@ -2049,40 +2049,61 @@ std::string json::parser::parseString() // the result of the parse process std::string result; - // iterate with pos_ over the whole string - for (; pos_ < buffer_.size(); pos_++) { + // iterate with pos_ over the whole input until we found the end and return + // or we exit via error() + for (; pos_ < buffer_.size(); pos_++) + { char currentChar = buffer_[pos_]; - // uneven amount of backslashes means the user wants to escape something - if (!evenAmountOfBackslashes) { + if (!evenAmountOfBackslashes) + { + // uneven amount of backslashes means the user wants to escape something + // so we know there is a case such as '\X' or '\\\X' but we don't + // know yet what X is. 
+ // at this point in the code, the currentChar has the value of X // slash, backslash and quote are copied as is if ( currentChar == '/' || currentChar == '\\' - || currentChar == '"') { + || currentChar == '"') + { result += currentChar; - } else { - // All other characters are replaced by their respective special character - if (currentChar == 't') { - result += '\t'; - } else if (currentChar == 'b') { - result += '\b'; - } else if (currentChar == 'f') { - result += '\f'; - } else if (currentChar == 'n') { - result += '\n'; - } else if (currentChar == 'r') { - result += '\r'; - } else if (currentChar == 'u') { - pos_++; - result += parseUnicodeEscape(); - } else { - error("expected one of \\,/,b,f,n,r,t behind backslash."); - } - // TODO implement \uXXXX } - } else { - if (currentChar == '"') { + else + { + // All other characters are replaced by their respective special character + if (currentChar == 't') + result += '\t'; + else if (currentChar == 'b') + result += '\b'; + else if (currentChar == 'f') + result += '\f'; + else if (currentChar == 'n') + result += '\n'; + else if (currentChar == 'r') + result += '\r'; + else if (currentChar == 'u') + { + // \uXXXX[\uXXXX] is used for escaping unicode, which + // has its own subroutine. + result += parseUnicodeEscape(); + // the parsing process has brought us one step behind the + // unicode escape sequence: + // \uXXXX + // ^ + // so we need to go one character back or the parser + // would skip the character we are currently pointing at + // (as the for-loop will increment pos_ after this iteration). 
+ pos_--; + } + else // user did something like \z and we should report a error + error("expected one of \\,/,b,f,n,r,t,u behind backslash."); + } + } + else + { + if (currentChar == '"') + { // currentChar is a quote, so we found the end of the string @@ -2093,7 +2114,9 @@ std::string json::parser::parseString() // bring the result of the parsing process back to the caller return result; - } else if (currentChar != '\\') { + } + else if (currentChar != '\\') + { // all non-backslash characters are added to the end of the result string. // the only backslashes we want in the result are the ones that are escaped (which happens above). result += currentChar; @@ -2121,34 +2144,74 @@ std::string json::parser::parseString() error("expected '\"'"); } -std::string json::parser::unicodeToUTF8(unsigned int codepoint) { - // it's just a ASCII compatible codepoint, - // so we just interpret the point as a character - if (codepoint <= 0x7f) { + +/*! +Turns a code point into it's UTF-8 representation. +You should only pass numbers < 0x10ffff into this function +(everything else is a invalid code point). + +@return the UTF-8 representation of the given codepoint + +@pre This method isn't accessing the members of the parser + +@post This method isn't accessing the members of the parser +*/ +std::string json::parser::codepointToUTF8(unsigned int codepoint) +{ + // this method contains a lot of bit manipulations to + // build the bytes for UTF-8. + + // the '(... >> S) & 0xHH'-patterns are used to retrieve + // certain bits from the code points. + + // all static casts in this method have boundary checks + + // we initialize all strings with their final length + // (e.g. 1 to 4 bytes) to save the reallocations. 
+ + + if (codepoint <= 0x7f) + { + // it's just an ASCII compatible codepoint, + // so we just interpret the point as a character + // and return ASCII + return std::string(1, static_cast(codepoint)); } + // if true, we need two bytes to encode this as UTF-8 else if (codepoint <= 0x7ff) { - std::string result(2, static_cast(0xc0 | ((codepoint >> 6) & 0x1f))); - result[1] = static_cast(0x80 | (codepoint & 0x3f)); + // the 0xC0 sets the two most significant bits + // to make this a two-byte UTF-8 character. + std::string result(2, static_cast(0xC0 | ((codepoint >> 6) & 0x1F))); + result[1] = static_cast(0x80 | (codepoint & 0x3F)); return result; } + // if true, now we need three bytes to encode this as UTF-8 else if (codepoint <= 0xffff) { - std::string result(3, static_cast(0xe0 | ((codepoint >> 12) & 0x0f))); - result[1] = static_cast(0x80 | ((codepoint >> 6) & 0x3f)); - result[2] = static_cast(0x80 | (codepoint & 0x3f)); + // the 0xE0 sets the three most significant bits + // to make this a three-byte UTF-8 character. + std::string result(3, static_cast(0xE0 | ((codepoint >> 12) & 0x0F))); + result[1] = static_cast(0x80 | ((codepoint >> 6) & 0x3F)); + result[2] = static_cast(0x80 | (codepoint & 0x3F)); return result; } - else if (codepoint <= 0x1fffff) + // if true, we need maximal four bytes to encode this as UTF-8 + else if (codepoint <= 0x10ffff) { - std::string result(4, static_cast(0xf0 | ((codepoint >> 18) & 0x07))); - result[1] = static_cast(0x80 | ((codepoint >> 12) & 0x3f)); - result[2] = static_cast(0x80 | ((codepoint >> 6) & 0x3f)); - result[3] = static_cast(0x80 | (codepoint & 0x3f)); + // the 0xF0 sets the four most significant bits + // to make this a four-byte UTF-8 character. 
+ std::string result(4, static_cast(0xF0 | ((codepoint >> 18) & 0x07))); + result[1] = static_cast(0x80 | ((codepoint >> 12) & 0x3F)); + result[2] = static_cast(0x80 | ((codepoint >> 6) & 0x3F)); + result[3] = static_cast(0x80 | (codepoint & 0x3F)); return result; - } else { + } + else + { + // Can't be tested without direct access to this private method. std::string errorMessage = "Invalid codepoint: "; errorMessage += codepoint; error(errorMessage); @@ -2156,39 +2219,110 @@ std::string json::parser::unicodeToUTF8(unsigned int codepoint) { } /*! -Parses the JSON style unicode escape sequence (\uXXXX). +Parses 4 hexadecimal characters as a number. -@return the utf-8 character the escape sequence escaped +@return the value of the number the hexadecimal characters represent. -@pre An opening quote \p " was read in the main parse function @ref parse. - pos_ is the position after the opening quote. +@pre pos_ is pointing to the first of the 4 hexadecimal characters. -@post The character after the closing quote \p " is the current character @ref - current_. Whitespace is skipped. +@post pos_ is pointing to the character after the 4 hexadecimal characters. 
*/ -std::string json::parser::parseUnicodeEscape() { +unsigned int json::parser::parse4HexCodepoint() +{ const auto startPos = pos_; - if (pos_ + 3 >= buffer_.size()) { + + // check if the remaining buffer is long enough to even hold 4 characters + if (pos_ + 3 >= buffer_.size()) + { error("Got end of input while parsing unicode escape sequence \\uXXXX"); } + + // make a string that can hold the four hexadecimal characters std::string hexCode(4, ' '); - for(; pos_ < startPos + 4; pos_++) { + + for(; pos_ < startPos + 4; pos_++) + { + // no boundary check here as we already checked above char currentChar = buffer_[pos_]; + + // check if we have a hexadecimal character if ( (currentChar >= '0' && currentChar <= '9') || (currentChar >= 'a' && currentChar <= 'f') - || (currentChar >= 'A' && currentChar <= 'F')) { + || (currentChar >= 'A' && currentChar <= 'F')) + { // all is well, we have valid hexadecimal chars // so we copy that char into our string hexCode[pos_ - startPos] = currentChar; - } else { + } + else + { error("Found non-hexadecimal character in unicode escape sequence!"); } } - pos_--; - // case is safe as 4 hex characters can't present more than 16 bits - return unicodeToUTF8(static_cast(std::stoul(hexCode, nullptr, 16))); + // the cast is safe as 4 hex characters can't represent more than 16 bits + // the input to stoul was checked to contain only hexadecimal characters (see above) + return static_cast(std::stoul(hexCode, nullptr, 16)); } +/*! +Parses the unicode escape codes as defined in the ECMA-404. +The escape sequence has two forms: +1. \uXXXX +2. \uXXXX\uYYYY +where X and Y are a hexadecimal character (a-fA-F0-9). + +Form 1 just contains the unicode code point in the hexadecimal number XXXX. +Form 2 is encoding a UTF-16 surrogate pair. The high surrogate is XXXX, the low surrogate is YYYY. + +@return the UTF-8 character this unicode escape sequence escaped. + +@pre pos_ is pointing at the 'u' behind the first backslash. 
+ +@post pos_ is pointing at the character behind the last X (or Y in form 2). +*/ +std::string json::parser::parseUnicodeEscape() +{ + // jump to the first hex value + pos_++; + // parse the hex first hex values + unsigned int firstCodepoint = parse4HexCodepoint(); + + + if (firstCodepoint >= 0xD800 && firstCodepoint <= 0xDBFF) + { + // we found invalid code points, which means we either have a malformed input + // or we found a high surrogate. + // we can only find out by seeing if the next character also wants to encode + // a unicode character (so, we have the \uXXXX\uXXXX case here). + + // jump behind the next \u + pos_ += 2; + // try to parse the next hex values. + // the method does boundary checking for us, so no need to do that here + unsigned secondCodepoint = parse4HexCodepoint(); + // ok, we have a low surrogate, check if it is a valid one + if (secondCodepoint >= 0xDC00 && secondCodepoint <= 0xDFFF) + { + // calculate the final code point from the pair according to the spec + unsigned int finalCodePoint = + // high surrogate occupies the most significant 22 bits + (firstCodepoint << 10) + // low surrogate occupies the least significant 15 bits + + secondCodepoint + // there is still the 0xD800, 0xDC00 and 0x10000 noise in the result + // so we have to substract with (0xD800 << 10) + DC00 - 0x10000 = 0x35FDC00 + - 0x35FDC00; + + // we transform the calculated point into UTF-8 + return codepointToUTF8(finalCodePoint); + } + else + error("missing low surrogate"); + + } + // We have Form 1, so we just interpret the XXXX as a code point + return codepointToUTF8(firstCodepoint); +} /*! 
diff --git a/src/json.h b/src/json.h index 1b5e8fcf2..90c5ded88 100644 --- a/src/json.h +++ b/src/json.h @@ -419,8 +419,10 @@ class json /// parse a quoted string inline std::string parseString(); /// transforms a unicode codepoint to it's UTF-8 presentation - inline std::string unicodeToUTF8(unsigned int codepoint); - /// parses a unicode escape sequence + inline std::string codepointToUTF8(unsigned int codepoint); + /// parses 4 hex characters that represent a unicode codepoint + inline unsigned int parse4HexCodepoint(); + /// parses \uXXXX[\uXXXX] unicode escape characters inline std::string parseUnicodeEscape(); /// parse a Boolean "true" inline void parseTrue(); diff --git a/test/json_unit.cc b/test/json_unit.cc index fb89a2a1b..ab679fbfa 100644 --- a/test/json_unit.cc +++ b/test/json_unit.cc @@ -1652,10 +1652,6 @@ TEST_CASE("Parser") CHECK(json::parse("\"a\\nz\"") == json("a\nz")); CHECK(json::parse("\"\\n\"") == json("\n")); - // escape unicode characters - CHECK(json::parse("\"\\u002F\"") == json("/")); - CHECK(json::parse("\"\\u00E4\"") == json(u8"\u00E4")); - // escaping senseless stuff CHECK_THROWS_AS(json::parse("\"\\z\""), std::invalid_argument); CHECK_THROWS_AS(json::parse("\"\\ \""), std::invalid_argument); @@ -1665,6 +1661,44 @@ TEST_CASE("Parser") CHECK_THROWS_AS(json::parse("\""), std::invalid_argument); } + SECTION("unicode_escaping") + { + // two tests for uppercase and lowercase hex + + // normal forward slash in ASCII range + CHECK(json::parse("\"\\u002F\"") == json("/")); + CHECK(json::parse("\"\\u002f\"") == json("/")); + // german a umlaut + CHECK(json::parse("\"\\u00E4\"") == json(u8"\u00E4")); + CHECK(json::parse("\"\\u00e4\"") == json(u8"\u00E4")); + // weird d + CHECK(json::parse("\"\\u0111\"") == json(u8"\u0111")); + // unicode arrow left + CHECK(json::parse("\"\\u2190\"") == json(u8"\u2190")); + // pleasing osiris by testing hieroglyph support + CHECK(json::parse("\"\\uD80C\\uDC60\"") == json(u8"\U00013060")); + 
CHECK(json::parse("\"\\ud80C\\udc60\"") == json(u8"\U00013060")); + + + // no hex numbers behind the \u + CHECK_THROWS_AS(json::parse("\"\\uD80v\""), std::invalid_argument); + CHECK_THROWS_AS(json::parse("\"\\uD80 A\""), std::invalid_argument); + CHECK_THROWS_AS(json::parse("\"\\uD8v\""), std::invalid_argument); + CHECK_THROWS_AS(json::parse("\"\\uDv\""), std::invalid_argument); + CHECK_THROWS_AS(json::parse("\"\\uv\""), std::invalid_argument); + CHECK_THROWS_AS(json::parse("\"\\u\""), std::invalid_argument); + CHECK_THROWS_AS(json::parse("\"\\u\\u\""), std::invalid_argument); + CHECK_THROWS_AS(json::parse("\"a\\uD80vAz\""), std::invalid_argument); + // missing part of a surrogate pair + CHECK_THROWS_AS(json::parse("\"bla \\uD80C bla\""), std::invalid_argument); + CHECK_THROWS_AS(json::parse("\"\\uD80C bla bla\""), std::invalid_argument); + CHECK_THROWS_AS(json::parse("\"bla bla \\uD80C bla bla\""), std::invalid_argument); + // senseless surrogate pair + CHECK_THROWS_AS(json::parse("\"\\uD80C\\uD80C\""), std::invalid_argument); + CHECK_THROWS_AS(json::parse("\"\\uD80C\\u0000\""), std::invalid_argument); + CHECK_THROWS_AS(json::parse("\"\\uD80C\\uFFFF\""), std::invalid_argument); + } + SECTION("boolean") { // accept the exact values