diff --git a/.gitignore b/.gitignore index 5b2bc0fab..8157f1a98 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ json_unit json_benchmarks +json_benchmarks_simple fuzz-testing *.dSYM diff --git a/benchmarks/Makefile b/benchmarks/Makefile index 0e4068c4e..ef2de8a33 100644 --- a/benchmarks/Makefile +++ b/benchmarks/Makefile @@ -1,11 +1,21 @@ -all: json_benchmarks - ./json_benchmarks -json_benchmarks: src/benchmarks.cpp ../src/json.hpp number_jsons +# +# Build/run json.hpp benchmarks, eg. CXX=g++-7 make +# +# The existing json_benchmarks did not allow optimization under some compilers +# +all: json_benchmarks json_benchmarks_simple number_jsons + bash -c 'time ./json_benchmarks' + bash -c 'time ./json_benchmarks_simple' + +json_benchmarks: src/benchmarks.cpp ../src/json.hpp $(CXX) -std=c++11 -pthread $(CXXFLAGS) -DNDEBUG -O3 -flto -I thirdparty/benchpress -I thirdparty/cxxopts -I../src src/benchmarks.cpp $(LDFLAGS) -o $@ +json_benchmarks_simple: src/benchmarks_simple.cpp ../src/json.hpp + $(CXX) -std=c++11 $(CXXFLAGS) -DNDEBUG -O3 -flto -I../src $(<) $(LDFLAGS) -o $@ + number_jsons: (test -e files/numbers/floats.json -a -e files/numbers/signed_ints.json -a -e files/numbers/unsigned_ints.json) || (cd files/numbers ; python generate.py) clean: - rm -f json_benchmarks files/numbers/*.json + rm -f json_benchmarks json_benchmarks_simple files/numbers/*.json diff --git a/benchmarks/src/benchmarks.cpp b/benchmarks/src/benchmarks.cpp index 55a4e4788..a76c37833 100644 --- a/benchmarks/src/benchmarks.cpp +++ b/benchmarks/src/benchmarks.cpp @@ -34,6 +34,19 @@ static void bench(benchpress::context& ctx, { // using string streams for benchmarking to factor-out cold-cache disk // access. +#if defined( FROMFILE ) + std::ifstream istr; + { + istr.open( in_path, std::ifstream::in ); + + // read the stream once + json j; + istr >> j; + // clear flags and rewind + istr.clear(); + istr.seekg(0); + } +#else std::stringstream istr; { // read file into string stream @@ -43,11 +56,12 @@ static void bench(benchpress::context& ctx, // read the stream once json j; - j << istr; + istr >> j; // clear flags and rewind istr.clear(); istr.seekg(0); } +#endif switch (mode) { @@ -62,7 +76,7 @@ static void bench(benchpress::context& ctx, istr.clear(); istr.seekg(0); json j; - j << istr; + istr >> j; } break; @@ -74,7 +88,7 @@ static void bench(benchpress::context& ctx, { // create JSON value from input json j; - j << istr; + istr >> j; std::stringstream ostr; ctx.reset_timer(); diff --git a/benchmarks/src/benchmarks_simple.cpp b/benchmarks/src/benchmarks_simple.cpp new file mode 100644 index 000000000..4fad680a1 --- /dev/null +++ b/benchmarks/src/benchmarks_simple.cpp @@ -0,0 +1,158 @@ +// +// benchmarks_simple.cpp -- a less complex version of benchmarks.cpp, that better reflects actual performance +// +// For some reason, the complexity of benchmarks.cpp doesn't allow +// the compiler to optimize code using json.hpp effectively. The +// exact same tests, with the use of benchpress and cxxopts produces +// much faster code, at least under g++. +// +#include +#include +#include +#include +#include + +#include + +using json = nlohmann::json; + +enum class EMode { input, output, indent }; + +static double bench(const EMode mode, size_t iters, const std::string& in_path ) +{ + // using string streams for benchmarking to factor-out cold-cache disk + // access. Define FROMFILE to use file I/O instead. +#if defined( FROMFILE ) + std::ifstream istr; + { + istr.open( in_path, std::ifstream::in ); + + // read the stream once + json j; + istr >> j; + // clear flags and rewind + istr.clear(); + istr.seekg(0); + } +#else + std::stringstream istr; + { + // read file into string stream + std::ifstream input_file(in_path); + istr << input_file.rdbuf(); + input_file.close(); + + // read the stream once + json j; + istr >> j; + // clear flags and rewind + istr.clear(); + istr.seekg(0); + } +#endif + double tps = 0; + switch (mode) + { + // benchmarking input + case EMode::input: + { + auto start = std::chrono::system_clock::now(); + for (size_t i = 0; i < iters; ++i) + { + // clear flags and rewind + istr.clear(); + istr.seekg(0); + json j; + istr >> j; + } + auto ended = std::chrono::system_clock::now(); + tps = 1.0 / std::chrono::duration( ended - start ).count(); + break; + } + + // benchmarking output + case EMode::output: + case EMode::indent: + { + // create JSON value from input + json j; + istr >> j; + std::stringstream ostr; + + auto start = std::chrono::system_clock::now(); + for (size_t i = 0; i < iters; ++i) + { + if (mode == EMode::indent) + { + ostr << j; + } + else + { + ostr << std::setw(4) << j; + } + + // reset data + ostr.str(std::string()); + } + auto ended = std::chrono::system_clock::now(); + tps = 1.0 / std::chrono::duration( ended - start ).count(); + + break; + } + } + return tps; +} + +template +struct average { + T _sum { 0 }; + size_t _count { 0 }; + T operator+=( const T &val_ ) { _sum += val_; +_count++; return val_; } + operator T() { return _sum / _count; } +}; + +// Execute each test approximately enough times to get near 1 +// transaction per second, and compute the average; a single aggregate +// number that gives a performance metric representing both parsing +// and output. + +int main( int, char ** ) +{ + std::list> tests { + { "parse jeopardy.json", EMode::input, 2, "files/jeopardy/jeopardy.json" }, + { "parse canada.json", EMode::input, 30, "files/nativejson-benchmark/canada.json" }, + { "parse citm_catalog.json", EMode::input, 120, "files/nativejson-benchmark/citm_catalog.json" }, + { "parse twitter.json", EMode::input, 225, "files/nativejson-benchmark/twitter.json" }, + { "parse floats.json", EMode::input, 5, "files/numbers/floats.json" }, + { "parse signed_ints.json", EMode::input, 6, "files/numbers/signed_ints.json" }, + { "parse unsigned_ints.json", EMode::input, 6, "files/numbers/unsigned_ints.json" }, + { "dump jeopardy.json", EMode::output, 5, "files/jeopardy/jeopardy.json" }, + { "dump jeopardy.json w/ind.", EMode::indent, 5, "files/jeopardy/jeopardy.json" }, + { "dump floats.json", EMode::output, 2, "files/numbers/floats.json" }, + { "dump signed_ints.json", EMode::output, 20, "files/numbers/signed_ints.json" }, + }; + + average avg; + for ( auto t : tests ) { + std::string name, path; + EMode mode; + size_t iters; + std::tie(name, mode, iters, path) = t; + auto tps = bench( mode, iters, path ); + avg += tps; + std::cout + << std::left + << std::setw( 30 ) << name + << std::right + << " x " << std::setw( 3 ) << iters + << std::left + << " == " << std::setw( 10 ) << tps + << std::right + << " TPS, " << std::setw( 8 ) << std::round( tps * 1e6 / iters ) + << " ms/op" + << std::endl; + } + std::cout << std::setw( 40 ) << "" << std::string( 10, '-' ) << std::endl; + std::cout << std::setw( 40 ) << "" << std::setw( 10 ) << std::left << avg << " TPS Average" << std::endl; + return 0; +} diff --git a/src/json.hpp b/src/json.hpp index 15219be85..94d1c1e5d 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -1394,123 +1394,97 @@ constexpr T static_const::value; // input adapters // //////////////////// -/// abstract input adapter interface +/*! +@brief abstract input adapter interface + +Produces a stream of std::char_traits::int_type characters from a +std::istream, a buffer, or some other input type. Accepts the return of exactly +one non-EOF character for future input. The int_type characters returned +consist of all valid char values as positive values (typically unsigned char), +plus an EOF value outside that range, specified by the value of the function +std::char_traits::eof(). This value is typically -1, but could be any +arbitrary value which is not a valid char value. + +@return Typically [0,255] plus std::char_traits::eof(). +*/ struct input_adapter_protocol { - virtual int get_character() = 0; - virtual std::string read(std::size_t offset, std::size_t length) = 0; + virtual std::char_traits::int_type get_character() = 0; + virtual void unget_character() = 0; // restore the last non-eof() character to input virtual ~input_adapter_protocol() = default; }; /// a type to simplify interfaces using input_adapter_t = std::shared_ptr; -/// input adapter for cached stream input -template -class cached_input_stream_adapter : public input_adapter_protocol +/// input adapter for a (caching) istream. Ignores a UFT Byte Order Mark at +/// beginning of input. Does not support changing the underlying std::streambuf +/// in mid-input. Maintains underlying std::istream and std::streambuf to +/// support subsequent use of standard std::istream operations to process any +/// input characters following those used in parsing the JSON input. Clears the +/// std::istream flags; any input errors (eg. EOF) will be detected by the first +/// subsequent call for input from the std::istream. +class input_stream_adapter : public input_adapter_protocol { public: - explicit cached_input_stream_adapter(std::istream& i) - : is(i), start_position(is.tellg()) + ~input_stream_adapter() override { - fill_buffer(); - - // skip byte order mark - if (fill_size >= 3 and buffer[0] == '\xEF' and buffer[1] == '\xBB' and buffer[2] == '\xBF') - { - buffer_pos += 3; - processed_chars += 3; - } - } - - ~cached_input_stream_adapter() override - { - // clear stream flags - is.clear(); - // We initially read a lot of characters into the buffer, and we may - // not have processed all of them. Therefore, we need to "rewind" the - // stream after the last processed char. - is.seekg(start_position); - is.ignore(static_cast(processed_chars)); - // clear stream flags + // clear stream flags; we use underlying streambuf I/O, do not maintain ifstream flags is.clear(); } - - int get_character() override + explicit input_stream_adapter(std::istream& i) + : is(i) + , sb(*i.rdbuf()) { - // check if refilling is necessary and possible - if (buffer_pos == fill_size and not eof) + // Ignore Byte Order Mark at start of input + std::char_traits::int_type c; + if (( c = get_character() ) == 0xEF ) { - fill_buffer(); - - // check and remember that filling did not yield new input - if (fill_size == 0) + if (( c = get_character() ) == 0xBB ) { - eof = true; - return std::char_traits::eof(); + if (( c = get_character() ) == 0xBF ) + { + return; // Ignore BOM + } + else if ( c != std::char_traits::eof() ) + { + is.unget(); + } + is.putback( '\xBB' ); } - - // the buffer is ready - buffer_pos = 0; + else if ( c != std::char_traits::eof() ) + { + is.unget(); + } + is.putback( '\xEF' ); + } + else if ( c != std::char_traits::eof() ) + { + is.unget(); // Not BOM. Process as usual. } - - ++processed_chars; - assert(buffer_pos < buffer.size()); - return buffer[buffer_pos++] & 0xFF; } - std::string read(std::size_t offset, std::size_t length) override + // delete because of pointer members + input_stream_adapter(const input_stream_adapter&) = delete; + input_stream_adapter& operator=(input_stream_adapter&) = delete; + + // std::istream/std::streambuf use std::char_traits::to_int_type, to + // ensure that std::char_traits::eof() and the character 0xff do not + // end up as the same value, eg. 0xffffffff. + std::char_traits::int_type get_character() override { - // create buffer - std::string result(length, '\0'); - - // save stream position - const auto current_pos = is.tellg(); - // save stream flags - const auto flags = is.rdstate(); - - // clear stream flags - is.clear(); - // set stream position - is.seekg(static_cast(offset)); - // read bytes - is.read(&result[0], static_cast(length)); - - // reset stream position - is.seekg(current_pos); - // reset stream flags - is.setstate(flags); - - return result; + return sb.sbumpc(); } + void unget_character() override + { + sb.sungetc(); // Avoided for performance: is.unget(); + } private: - void fill_buffer() - { - // fill - is.read(buffer.data(), static_cast(buffer.size())); - // store number of bytes in the buffer - fill_size = static_cast(is.gcount()); - } /// the associated input stream std::istream& is; - - /// chars returned via get_character() - std::size_t processed_chars = 0; - /// chars processed in the current buffer - std::size_t buffer_pos = 0; - - /// whether stream reached eof - bool eof = false; - /// how many chars have been copied to the buffer by last (re)fill - std::size_t fill_size = 0; - - /// position of the stream when we started - const std::streampos start_position; - - /// internal buffer - std::array buffer{{}}; + std::streambuf &sb; }; /// input adapter for buffer input @@ -1531,21 +1505,22 @@ class input_buffer_adapter : public input_adapter_protocol input_buffer_adapter(const input_buffer_adapter&) = delete; input_buffer_adapter& operator=(input_buffer_adapter&) = delete; - int get_character() noexcept override + std::char_traits::int_type get_character() noexcept override { if (JSON_LIKELY(cursor < limit)) { - return *(cursor++) & 0xFF; + return std::char_traits::to_int_type(*(cursor++)); } return std::char_traits::eof(); } - std::string read(std::size_t offset, std::size_t length) override + void unget_character() noexcept override { - // avoid reading too many characters - const auto max_length = static_cast(limit - start); - return std::string(start + offset, (std::min)(length, max_length - offset)); + if (JSON_LIKELY(cursor > start)) + { + --cursor; + } } private: @@ -1564,11 +1539,11 @@ class input_adapter /// input adapter for input stream input_adapter(std::istream& i) - : ia(std::make_shared>(i)) {} + : ia(std::make_shared(i)) {} /// input adapter for input stream input_adapter(std::istream&& i) - : ia(std::make_shared>(i)) {} + : ia(std::make_shared(i)) {} /// input adapter for buffer template::to_char_type(current)); } - /// get a character from the input - int get() + /* + @brief get next character from the input + + This function provides the interface to the used input adapter. It does + not throw in case the input reached EOF, but returns a + `std::char_traits::eof()` in that case. Stores the scanned characters + for use in error messages. + + @return character read from the input + */ + std::char_traits::int_type get() { ++chars_read; - return next_unget ? (next_unget = false, current) - : (current = ia->get_character()); + current = ia->get_character(); + if (JSON_LIKELY( current != std::char_traits::eof())) + { + token_string.push_back(std::char_traits::to_char_type(current)); + } + return current; } + /// unget current character (return it again on next get) + void unget() + { + --chars_read; + if (JSON_LIKELY(current != std::char_traits::eof())) + { + ia->unget_character(); + assert(token_string.size() != 0); + token_string.pop_back(); + } + } + /// add a character to yytext void add(int c) { - // resize yytext if necessary; this condition is deemed unlikely, - // because we start with a 1024-byte buffer - if (JSON_UNLIKELY((yylen + 1 > yytext.capacity()))) - { - yytext.resize(2 * yytext.capacity(), '\0'); - } - assert(yylen < yytext.size()); - yytext[yylen++] = static_cast(c); + yytext.push_back(std::char_traits::to_char_type(c)); } public: @@ -2753,12 +2739,10 @@ scan_number_done: return value_float; } - /// return string value - const std::string get_string() + /// return current string value (implicitly resets the token; useful only once) + std::string move_string() { - // yytext cannot be returned as char*, because it may contain a null - // byte (parsed as "\u0000") - return std::string(yytext.data(), yylen); + return std::move( yytext ); } ///////////////////// @@ -2771,22 +2755,16 @@ scan_number_done: return chars_read; } - /// return the last read token (for errors only) + /// return the last read token (for errors only). Will never contain EOF + /// (an arbitrary value that is not a valid char value, often -1), because + /// 255 may legitimately occur. May contain NUL, which should be escaped. std::string get_token_string() const { - // get the raw byte sequence of the last token - std::string s = ia->read(start_pos, chars_read - start_pos); - // escape control characters std::string result; - for (auto c : s) + for (auto c : token_string) { - if (c == '\0' or c == std::char_traits::eof()) - { - // ignore EOF - continue; - } - else if ('\x00' <= c and c <= '\x1f') + if ('\x00' <= c and c <= '\x1f') { // escape control characters std::stringstream ss; @@ -2883,20 +2861,16 @@ scan_number_done: detail::input_adapter_t ia = nullptr; /// the current character - int current = std::char_traits::eof(); - - /// whether get() should return the last character again - bool next_unget = false; + std::char_traits::int_type current = std::char_traits::eof(); /// the number of characters read std::size_t chars_read = 0; - /// the start position of the current token - std::size_t start_pos = 0; + + /// raw input token string (for error messages) + std::vector token_string { }; /// buffer for variable-length tokens (numbers, strings) - std::vector yytext = std::vector(1024, '\0'); - /// current index in yytext - std::size_t yylen = 0; + std::string yytext { }; /// a description of occurred lexer errors const char* error_message = ""; @@ -3034,11 +3008,19 @@ class parser { case token_type::begin_object: { - if (keep and (not callback or ((keep = callback(depth++, parse_event_t::object_start, result))))) + if (keep) { - // explicitly set result to object to cope with {} - result.m_type = value_t::object; - result.m_value = value_t::object; + if (callback) + { + keep = callback(depth++, parse_event_t::object_start, result); + } + + if (not callback or keep) + { + // explicitly set result to object to cope with {} + result.m_type = value_t::object; + result.m_value = value_t::object; + } } // read next token @@ -3065,7 +3047,7 @@ class parser { return; } - key = m_lexer.get_string(); + key = m_lexer.move_string(); bool keep_tag = false; if (keep) @@ -3130,11 +3112,19 @@ class parser case token_type::begin_array: { - if (keep and (not callback or ((keep = callback(depth++, parse_event_t::array_start, result))))) + if (keep) { - // explicitly set result to object to cope with [] - result.m_type = value_t::array; - result.m_value = value_t::array; + if (callback) + { + keep = callback(depth++, parse_event_t::array_start, result); + } + + if (not callback or keep) + { + // explicitly set result to array to cope with [] + result.m_type = value_t::array; + result.m_value = value_t::array; + } } // read next token @@ -3203,7 +3193,7 @@ class parser case token_type::value_string: { result.m_type = value_t::string; - result.m_value = m_lexer.get_string(); + result.m_value = m_lexer.move_string(); break; } @@ -5205,7 +5195,7 @@ class binary_reader @brief get next character from the input This function provides the interface to the used input adapter. It does - not throw in case the input reached EOF, but returns + not throw in case the input reached EOF, but returns a -'ve valued `std::char_traits::eof()` in that case. @return character read from the input @@ -5276,7 +5266,7 @@ class binary_reader { get(); check_eof(); - return current; + return static_cast(current); }); return result; } diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 4ccbaf06d..c72867839 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -66,6 +66,22 @@ set_target_properties(catch_main PROPERTIES ) target_include_directories(catch_main PRIVATE "thirdparty/catch") +# https://stackoverflow.com/questions/2368811/how-to-set-warning-level-in-cmake +if(MSVC) + # Force to always compile with W4 + if(CMAKE_CXX_FLAGS MATCHES "/W[0-4]") + string(REGEX REPLACE "/W[0-4]" "/W4" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + else() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W4") + endif() + + # Disable warning C4389: '==': signed/unsigned mismatch + # Disable warning C4309: 'static_cast': truncation of constant value + # Disable warning C4566: character represented by universal-character-name '\uFF01' cannot be represented in the current code page (1252) + # Disable warning C4996: 'nlohmann::basic_json::operator <<': was declared deprecated + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4389 /wd4309 /wd4566 /wd4996") +endif() + ############################################################################# # one executable for each unit test file ############################################################################# diff --git a/test/src/unit-class_parser.cpp b/test/src/unit-class_parser.cpp index 08d4d6efe..9afa7d269 100644 --- a/test/src/unit-class_parser.cpp +++ b/test/src/unit-class_parser.cpp @@ -215,7 +215,7 @@ TEST_CASE("parser class") std::string s = "\"1\""; s[1] = '\0'; CHECK_THROWS_AS(json::parse(s.begin(), s.end()), json::parse_error&); - CHECK_THROWS_WITH(json::parse(s.begin(), s.end()), "[json.exception.parse_error.101] parse error at 2: syntax error - invalid string: control character must be escaped; last read: '\"'"); + CHECK_THROWS_WITH(json::parse(s.begin(), s.end()), "[json.exception.parse_error.101] parse error at 2: syntax error - invalid string: control character must be escaped; last read: '\"'"); } } diff --git a/test/src/unit-constructor1.cpp b/test/src/unit-constructor1.cpp index a78b99f49..e45edd440 100644 --- a/test/src/unit-constructor1.cpp +++ b/test/src/unit-constructor1.cpp @@ -247,7 +247,7 @@ TEST_CASE("constructors") SECTION("std::pair") { - std::pair p{1.0, "string"}; + std::pair p{1.0f, "string"}; json j(p); CHECK(j.type() == json::value_t::array); diff --git a/test/src/unit-readme.cpp b/test/src/unit-readme.cpp index e921c4b68..5a442c601 100644 --- a/test/src/unit-readme.cpp +++ b/test/src/unit-readme.cpp @@ -38,6 +38,11 @@ using nlohmann::json; #include #include +#if defined(_MSC_VER) +#pragma warning (push) +#pragma warning (disable : 4189) // local variable is initialized but not referenced +#endif + TEST_CASE("README", "[hide]") { { @@ -298,3 +303,7 @@ TEST_CASE("README", "[hide]") std::cout.rdbuf(old_cout_buffer); } } + +#if defined(_MSC_VER) +#pragma warning (pop) +#endif diff --git a/test/src/unit-regression.cpp b/test/src/unit-regression.cpp index 0c0d148da..2d30a52c5 100644 --- a/test/src/unit-regression.cpp +++ b/test/src/unit-regression.cpp @@ -1233,4 +1233,24 @@ TEST_CASE("regression tests") "[json.exception.type_error.302] type must be array, but is null"); } } + + SECTION("issue #367 - Behavior of operator>> should more closely resemble that of built-in overloads.") + { + SECTION("example 1") + { + std::istringstream i1_2_3( "{\"first\": \"one\" }{\"second\": \"two\"}3" ); + json j1, j2, j3; + i1_2_3 >> j1; + i1_2_3 >> j2; + i1_2_3 >> j3; + + std::map m1 = j1; + std::map m2 = j2; + int i3 = j3; + + CHECK( m1 == ( std::map {{ "first", "one" }} )); + CHECK( m2 == ( std::map {{ "second", "two" }} )); + CHECK( i3 == 3 ); + } + } } diff --git a/test/src/unit-udt.cpp b/test/src/unit-udt.cpp index 6aa469fe4..409cac839 100644 --- a/test/src/unit-udt.cpp +++ b/test/src/unit-udt.cpp @@ -201,7 +201,7 @@ void from_json(const BasicJsonType& j, country& c) { {u8"中华人民共和国", country::china}, {"France", country::france}, - {"Российская Федерация", country::russia} + {u8"Российская Федерация", country::russia} }; const auto it = m.find(str);