More escaping

This commit is contained in:
Victor Zverovich 2021-08-21 09:35:05 -07:00
parent f69a572538
commit 6397095ca4
4 changed files with 91 additions and 28 deletions

View File

@ -2525,8 +2525,8 @@ template <> struct formatter<detail::bigint> {
};
FMT_FUNC detail::utf8_to_utf16::utf8_to_utf16(string_view s) {
for_each_codepoint(s, [this](uint32_t cp, int error) {
if (error != 0) FMT_THROW(std::runtime_error("invalid utf8"));
for_each_codepoint(s, [this](uint32_t cp, string_view) {
if (cp == invalid_code_point) FMT_THROW(std::runtime_error("invalid utf8"));
if (cp <= 0xFFFF) {
buffer_.push_back(static_cast<wchar_t>(cp));
} else {
@ -2534,6 +2534,7 @@ FMT_FUNC detail::utf8_to_utf16::utf8_to_utf16(string_view s) {
buffer_.push_back(static_cast<wchar_t>(0xD800 + (cp >> 10)));
buffer_.push_back(static_cast<wchar_t>(0xDC00 + (cp & 0x3FF)));
}
return true;
});
buffer_.push_back(0);
}

View File

@ -483,27 +483,38 @@ FMT_CONSTEXPR inline auto utf8_decode(const char* s, uint32_t* c, int* e)
return next;
}
// Sentinel code point: the bitwise complement of a zero uint32_t.
// for_each_codepoint hands it to the callback in place of the decoded value
// when utf8_decode reports an error.
enum { invalid_code_point = ~uint32_t() };
// Invokes f(cp, sv) for every code point cp in s with sv being the string view
// corresponding to the code point. cp is invalid_code_point on error.
// NOTE(review): this span looks like a rendered diff hunk whose +/- markers
// were dropped, so superseded and replacement lines sit side by side (two
// `auto decode = ...` lambdas, two loops over the leading blocks, two tail
// loops). Resolve to one version before compiling.
//
// Contract of the two-argument variant, as far as the code below shows: f is
// called as f(cp, sv) and returns a value usable as bool; a false result makes
// decode return nullptr, which stops the iteration immediately.
template <typename F>
FMT_CONSTEXPR void for_each_codepoint(string_view s, F f) {
auto decode = [f](const char* p) {
// buf_ptr is the address utf8_decode reads from; ptr is the address used to
// build the string_view passed to f. They differ only in the tail loop, where
// decoding happens in a local copy.
auto decode = [f](const char* buf_ptr, const char* ptr) {
auto cp = uint32_t();
auto error = 0;
p = utf8_decode(p, &cp, &error);
f(cp, error);
return p;
auto end = utf8_decode(buf_ptr, &cp, &error);
// A non-zero error is reported to f as invalid_code_point; the view length is
// the number of chars utf8_decode advanced over.
bool result = f(error ? invalid_code_point : cp,
string_view(ptr, to_unsigned(end - buf_ptr)));
return result ? end : nullptr;
};
auto p = s.data();
const size_t block_size = 4; // utf8_decode always reads blocks of 4 chars.
// Decode directly from s while at least block_size chars remain after p.
if (s.size() >= block_size) {
for (auto end = p + s.size() - block_size + 1; p < end;) p = decode(p);
for (auto end = p + s.size() - block_size + 1; p < end;) {
p = decode(p, p);
if (!p) return;
}
}
// Tail: the remaining chars are copied into a zero-initialized buffer of
// 2 * block_size - 1 chars and decoded from there — presumably so that
// utf8_decode's 4-char reads never run past the end of s.
if (auto num_chars_left = s.data() + s.size() - p) {
char buf[2 * block_size - 1] = {};
copy_str<char>(p, p + num_chars_left, buf);
p = buf;
const char* buf_ptr = buf;
do {
p = decode(p);
} while (p - buf < num_chars_left);
// Decode from the copy (buf_ptr) but report views into the original data (p),
// advancing both by the same number of chars.
auto end = decode(buf_ptr, p);
if (!end) return;
p += end - buf_ptr;
buf_ptr = end;
} while (buf_ptr - buf < num_chars_left);
}
}
@ -518,10 +529,10 @@ FMT_CONSTEXPR inline size_t compute_width(string_view s) {
// It is not a lambda for compatibility with C++14.
struct count_code_points {
size_t* count;
FMT_CONSTEXPR void operator()(uint32_t cp, int error) const {
FMT_CONSTEXPR auto operator()(uint32_t cp, string_view) const -> bool {
*count += detail::to_unsigned(
1 +
(error == 0 && cp >= 0x1100 &&
(cp >= 0x1100 &&
(cp <= 0x115f || // Hangul Jamo init. consonants
cp == 0x2329 || // LEFT-POINTING ANGLE BRACKET
cp == 0x232a || // RIGHT-POINTING ANGLE BRACKET
@ -539,6 +550,7 @@ FMT_CONSTEXPR inline size_t compute_width(string_view s) {
(cp >= 0x1f300 && cp <= 0x1f64f) ||
// Supplemental Symbols and Pictographs:
(cp >= 0x1f900 && cp <= 0x1f9ff))));
return true;
}
};
for_each_codepoint(s, count_code_points{&num_code_points});

View File

@ -227,17 +227,65 @@ template <typename OutputIt> OutputIt write_delimiter(OutputIt out) {
return out;
}
template <typename Char> inline bool is_printable_ascii(Char c) {
return c >= 0x20 && c < 0x7e;
// Returns true if the code point cp is considered printable.
//
// A value is reported as non-printable when it falls into one of the
// half-open ranges listed below or when it lies beyond U+10FFFF, the last
// code point of the Unicode code space. The latter also covers the all-ones
// invalid_code_point sentinel, so undecodable input is never classified as
// printable.
inline auto is_printable(uint32_t cp) -> bool {
  if (0x2a6de <= cp && cp < 0x2a700) return false;
  if (0x2b735 <= cp && cp < 0x2b740) return false;
  if (0x2b81e <= cp && cp < 0x2b820) return false;
  if (0x2cea2 <= cp && cp < 0x2ceb0) return false;
  if (0x2ebe1 <= cp && cp < 0x2f800) return false;
  if (0x2fa1e <= cp && cp < 0x30000) return false;
  if (0x3134b <= cp && cp < 0xe0100) return false;
  if (0xe01f0 <= cp && cp < 0x110000) return false;
  // Anything at or above 0x110000 is not a Unicode code point at all.
  return cp < 0x110000;
}
template <
typename Char, typename OutputIt, typename T,
FMT_ENABLE_IF(is_std_string_like<typename std::decay<T>::type>::value)>
OutputIt write_range_entry(OutputIt out, const T& str) {
// Tells whether the code point cp has to be written as an escape sequence
// inside a quoted string: control characters below 0x20, DEL (0x7f), the
// double quote, the backslash, and everything is_printable rejects.
inline auto needs_escape(uint32_t cp) -> bool {
  if (cp < 0x20 || cp == 0x7f) return true;
  if (cp == '"' || cp == '\\') return true;
  return !is_printable(cp);
}
// Outcome of a find_escape search. Members are brace-initialized positionally
// by find_escape, so their order matters.
template <typename Char> struct find_escape_result {
// Start of the sequence that needs escaping; the end of the searched range
// when nothing was found.
const Char* begin;
// One past the sequence that needs escaping; nullptr when nothing was found.
const Char* end;
// The value that triggered the escape (0 when nothing was found).
uint32_t cp;
};
// Generic search: examines [begin, end) one character at a time, treating each
// character (converted to its unsigned counterpart) as a code point. Returns
// the first one for which needs_escape holds as a one-character sequence, or
// {end, nullptr, 0} if there is none.
template <typename Char>
auto find_escape(const Char* begin, const Char* end)
    -> find_escape_result<Char> {
  using unsigned_char = typename std::make_unsigned<Char>::type;
  for (const Char* it = begin; it != end; ++it) {
    auto unit = static_cast<unsigned_char>(*it);
    if (!needs_escape(unit)) continue;
    return {it, it + 1, unit};
  }
  return {end, nullptr, 0};
}
// char overload of find_escape.
//
// When is_utf8() is false the input is scanned one char at a time via the
// generic template. Otherwise [begin, end) is walked code point by code point
// with for_each_codepoint, and the first code point for which needs_escape
// holds is reported together with the chars that encode it.
// Returns {end, nullptr, 0} when nothing needs escaping.
//
// Declared inline, like is_printable and needs_escape, because it is a
// non-template function defined at namespace scope: without inline, including
// this code from more than one translation unit yields duplicate definitions.
inline auto find_escape(const char* begin, const char* end)
    -> find_escape_result<char> {
  if (!is_utf8()) return find_escape<char>(begin, end);
  auto result = find_escape_result<char>{end, nullptr, 0};
  for_each_codepoint(string_view(begin, to_unsigned(end - begin)),
                     [&](uint32_t cp, string_view sv) {
                       if (needs_escape(cp)) {
                         result = {sv.begin(), sv.end(), cp};
                         // Returning false stops the iteration at the first
                         // hit.
                         return false;
                       }
                       return true;
                     });
  return result;
}
template <typename Char, typename OutputIt>
auto write_range_entry(OutputIt out, basic_string_view<Char> str) -> OutputIt {
*out++ = '"';
for (Char c : basic_string_view<Char>(str)) {
switch (c) {
auto begin = str.begin(), end = str.end();
do {
auto escape = find_escape(begin, end);
out = copy_str<Char>(begin, escape.begin, out);
begin = escape.end;
if (!begin) break;
auto c = static_cast<Char>(escape.cp);
switch (escape.cp) {
case '\n':
*out++ = '\\';
c = 'n';
@ -256,13 +304,14 @@ OutputIt write_range_entry(OutputIt out, const T& str) {
*out++ = '\\';
break;
default:
if (is_printable_ascii(c)) break;
if (sizeof(Char) != 1 && c >= 0x80) break;
out = format_to(out, "\\x{:02x}", c);
for (Char escape_char : basic_string_view<Char>(
escape.begin, to_unsigned(escape.end - escape.begin))) {
out = format_to(out, "\\x{:02x}", escape_char);
}
continue;
}
*out++ = c;
}
} while (begin != end);
*out++ = '"';
return out;
}

View File

@ -264,7 +264,8 @@ TEST(ranges_test, join_range) {
#endif // FMT_RANGES_TEST_ENABLE_JOIN
// Checks how strings that are elements of a formatted range are rendered: the
// element is wrapped in double quotes, \n \r \t " and \ become backslash
// escapes, and other control characters are written as \xNN.
// NOTE(review): the body below appears to contain both the superseded and the
// replacement assertions of a diff (note the extra closing brace).
TEST(ranges_test, escape_string) {
EXPECT_EQ(fmt::format("{}", std::vector<std::string>{"\n\r\t\"\\"}),
"[\"\\n\\r\\t\\\"\\\\\"]");
EXPECT_EQ(fmt::format("{}", std::vector<std::string>{"\x7"}), "[\"\\x07\"]");
}
using vec = std::vector<std::string>;
EXPECT_EQ(fmt::format("{}", vec{"\n\r\t\"\\"}), "[\"\\n\\r\\t\\\"\\\\\"]");
EXPECT_EQ(fmt::format("{}", vec{"\x07"}), "[\"\\x07\"]");
// DEL (0x7f) is escaped as well.
EXPECT_EQ(fmt::format("{}", vec{"\x7f"}), "[\"\\x7f\"]");
}