Fix bugs in utf8 decoder (#3056)

Signed-off-by: Vladislav Shchapov <vladislav@shchapov.ru>

Signed-off-by: Vladislav Shchapov <vladislav@shchapov.ru>
This commit is contained in:
Vladislav Shchapov 2022-08-27 03:37:15 +05:00 committed by GitHub
parent 4a8e2949bb
commit f98048b621
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 133 additions and 13 deletions

View File

@ -2297,12 +2297,16 @@ constexpr auto to_ascii(Char c) -> underlying_t<Char> {
return c;
}
// Returns the length in bytes of the UTF-8 sequence introduced by the lead
// byte `begin`, or 0 when `begin` cannot start a sequence.
//
// The top five bits of the byte index a 32-entry table stored as a string
// literal:
//   0xxxx (0x00-0x7f) -> 1
//   10xxx (0x80-0xbf) -> 0
//   110xx (0xc0-0xdf) -> 2
//   1110x (0xe0-0xef) -> 3
//   11110 (0xf0-0xf7) -> 4
//   11111 (0xf8-0xff) -> 0  (this entry is the literal's terminating NUL)
template <typename Char>
FMT_CONSTEXPR auto code_point_length_impl(Char begin) -> int {
  // Go through unsigned char so that a negative `char` cannot produce a
  // negative table index.
  const auto lead = static_cast<unsigned char>(begin);
  return "\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\0\0\0\0\0\0\0\0\2\2\2\2\3\3\4"
      [lead >> 3];
}
template <typename Char>
FMT_CONSTEXPR auto code_point_length(const Char* begin) -> int {
if (const_check(sizeof(Char) != 1)) return 1;
auto lengths =
"\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\0\0\0\0\0\0\0\0\2\2\2\2\3\3\4";
int len = lengths[static_cast<unsigned char>(*begin) >> 3];
int len = code_point_length_impl(*begin);
// Compute the pointer to the next character early so that the next
// iteration can start working on the next character. Neither Clang

View File

@ -602,25 +602,28 @@ FMT_CONSTEXPR FMT_NOINLINE auto copy_str_noinline(InputIt begin, InputIt end,
*/
FMT_CONSTEXPR inline auto utf8_decode(const char* s, uint32_t* c, int* e)
-> const char* {
constexpr const int prefix_masks[] = {0x00, 0x80, 0xe0, 0xf0, 0xf8};
constexpr const int masks[] = {0x00, 0x7f, 0x1f, 0x0f, 0x07};
constexpr const uint32_t mins[] = {4194304, 0, 128, 2048, 65536};
constexpr const int shiftc[] = {0, 18, 12, 6, 0};
constexpr const int shifte[] = {0, 6, 4, 2, 0};
int len = code_point_length(s);
const char* next = s + len;
int len = code_point_length_impl(*s);
// Compute the pointer to the next character early so that the next
// iteration can start working on the next character. Neither Clang
// nor GCC figure out this reordering on their own.
const char* next = s + len + !len;
using uchar = unsigned char;
// Assume a four-byte character and load four bytes. Unused bits are
// shifted out.
*c = uint32_t(s[0] & masks[len]) << 18;
*c |= uint32_t(s[1] & 0x3f) << 12;
*c |= uint32_t(s[2] & 0x3f) << 6;
*c |= uint32_t(s[3] & 0x3f) << 0;
*c = uint32_t(uchar(s[0]) & masks[len]) << 18;
*c |= uint32_t(uchar(s[1]) & 0x3f) << 12;
*c |= uint32_t(uchar(s[2]) & 0x3f) << 6;
*c |= uint32_t(uchar(s[3]) & 0x3f) << 0;
*c >>= shiftc[len];
// Accumulate the various error conditions.
using uchar = unsigned char;
*e = (*c < mins[len]) << 6; // non-canonical encoding
*e |= ((*c >> 11) == 0x1b) << 7; // surrogate half?
*e |= (*c > 0x10FFFF) << 8; // out of range?
@ -629,8 +632,6 @@ FMT_CONSTEXPR inline auto utf8_decode(const char* s, uint32_t* c, int* e)
*e |= uchar(s[3]) >> 6;
*e ^= 0x2a; // top two bits of each tail byte correct?
*e >>= shifte[len];
*e |= ((uchar(s[0]) & prefix_masks[len]) !=
uchar((prefix_masks[len] << 1) & 0xFF)); // first byte correct?
return next;
}

View File

@ -430,3 +430,118 @@ TEST(format_impl_test, write_console_signature) {
(void)p;
}
#endif
// A public domain branchless UTF-8 decoder by Christopher Wellons:
// https://github.com/skeeto/branchless-utf8
// Returns true when `c` lies in the UTF-16 surrogate range U+D800..U+DFFF.
// Every value in that range, and no other, has 0x1B as its bits above bit 10.
constexpr bool unicode_is_surrogate(uint32_t c) {
  return (c >> 11) == 0x1BU;
}
// Test helper: writes `c` to `s` as a 1- to 4-byte UTF-8 style sequence and
// returns a pointer one past the last byte written.
//
// `s` must have room for up to four bytes. No validation is performed:
// surrogates and values above U+10FFFF are encoded as-is, which the decoder
// tests use to manufacture invalid input.
FMT_CONSTEXPR char* utf8_encode(char* s, uint32_t c) {
  // One byte: 0xxxxxxx.
  if (c < 0x80U) {
    s[0] = static_cast<char>(c);
    return s + 1;
  }
  // Two bytes: 110xxxxx 10xxxxxx.
  if (c < 0x800U) {
    s[0] = static_cast<char>(0xc0 | (c >> 6));
    s[1] = static_cast<char>(0x80 | ((c >> 0) & 0x3f));
    return s + 2;
  }
  // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
  if (c < 0x10000U) {
    s[0] = static_cast<char>(0xe0 | (c >> 12));
    s[1] = static_cast<char>(0x80 | ((c >> 6) & 0x3f));
    s[2] = static_cast<char>(0x80 | ((c >> 0) & 0x3f));
    return s + 3;
  }
  // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
  s[0] = static_cast<char>(0xf0 | (c >> 18));
  s[1] = static_cast<char>(0x80 | ((c >> 12) & 0x3f));
  s[2] = static_cast<char>(0x80 | ((c >> 6) & 0x3f));
  s[3] = static_cast<char>(0x80 | ((c >> 0) & 0x3f));
  return s + 4;
}
// Make sure it can decode every character: each code point from U+0000
// through U+10FFFF inclusive, skipping the surrogate range, must round-trip
// through utf8_encode/utf8_decode with no error bits set.
TEST(format_impl_test, utf8_decode_decode_all) {
  // The bound is inclusive so that U+10FFFF, the largest valid code point,
  // is exercised as well.
  for (uint32_t i = 0; i <= 0x10ffff; i++) {
    if (unicode_is_surrogate(i)) continue;
    int e;
    uint32_t c;
    // Zero-filled and larger than four bytes: utf8_decode always reads
    // s[0]..s[3], whatever the sequence length.
    char buf[8] = {0};
    char* end = utf8_encode(buf, i);
    const char* res = fmt::detail::utf8_decode(buf, &c, &e);
    EXPECT_EQ(end, res);  // consumed exactly the encoded bytes
    EXPECT_EQ(c, i);      // decoded value round-trips
    EXPECT_EQ(e, 0);      // no error reported
  }
}
// Reject everything above U+10FFFF that a four-byte sequence can still
// carry, i.e. 0x110000 through 0x1FFFFF inclusive.
TEST(format_impl_test, utf8_decode_out_of_range) {
  // The bound is inclusive so that 0x1FFFFF (lead byte 0xf7), the largest
  // 21-bit payload, is checked too.
  for (uint32_t i = 0x110000; i <= 0x1fffff; i++) {
    int e;
    uint32_t c;
    // Zero-filled and larger than four bytes: utf8_decode always reads
    // s[0]..s[3].
    char buf[8] = {0};
    utf8_encode(buf, i);
    const char* end = fmt::detail::utf8_decode(buf, &c, &e);
    EXPECT_NE(e, 0);          // must be flagged as an error
    EXPECT_EQ(end - buf, 4);  // but still advances past all four bytes
  }
}
// Every surrogate half (U+D800..U+DFFF), once encoded as three bytes, must
// be reported as an error by the decoder.
TEST(format_impl_test, utf8_decode_surrogate_halves) {
  const uint32_t first_surrogate = 0xd800;
  const uint32_t last_surrogate = 0xdfff;
  for (uint32_t cp = first_surrogate; cp <= last_surrogate; ++cp) {
    char encoded[8] = {0};
    utf8_encode(encoded, cp);
    uint32_t decoded;
    int error;
    fmt::detail::utf8_decode(encoded, &decoded, &error);
    EXPECT_NE(error, 0);
  }
}
// Overlong (non-canonical) encodings: U+0024 padded out to two, three and
// four bytes. Each must be flagged as an error while the decoder still
// advances past the whole sequence.
TEST(format_impl_test, utf8_decode_non_canonical_encodings) {
  int error;
  uint32_t code_point;

  {  // two-byte overlong form
    char overlong[8] = {char(0xc0), char(0xA4)};
    const char* next = fmt::detail::utf8_decode(overlong, &code_point, &error);
    EXPECT_NE(error, 0);
    EXPECT_EQ(next, overlong + 2);
  }

  {  // three-byte overlong form
    char overlong[8] = {char(0xe0), char(0x80), char(0xA4)};
    const char* next = fmt::detail::utf8_decode(overlong, &code_point, &error);
    EXPECT_NE(error, 0);
    EXPECT_EQ(next, overlong + 3);
  }

  {  // four-byte overlong form
    char overlong[8] = {char(0xf0), char(0x80), char(0x80), char(0xA4)};
    const char* next = fmt::detail::utf8_decode(overlong, &code_point, &error);
    EXPECT_NE(error, 0);
    EXPECT_EQ(next, overlong + 4);
  }
}
// Malformed input: the decoder must report an error and still make forward
// progress so that a caller can resynchronise.
TEST(format_impl_test, utf8_decode_bogus_byte_sequences) {
  int error;
  uint32_t code_point;

  {  // 0xff cannot start a sequence; expect an error and a one-byte skip
    char bytes[4] = {char(0xff)};
    auto advanced =
        fmt::detail::utf8_decode(bytes, &code_point, &error) - bytes;
    EXPECT_NE(error, 0);
    EXPECT_EQ(advanced, 1);
  }

  {  // 0x80 as a first byte; expect an error and a one-byte skip
    char bytes[4] = {char(0x80)};
    auto advanced =
        fmt::detail::utf8_decode(bytes, &code_point, &error) - bytes;
    EXPECT_NE(error, 0);
    EXPECT_EQ(advanced, 1);
  }

  {  // two-byte lead followed by a byte lacking the 10xxxxxx tail pattern
    char bytes[4] = {char(0xc0), char(0x0a)};
    auto advanced =
        fmt::detail::utf8_decode(bytes, &code_point, &error) - bytes;
    EXPECT_NE(error, 0);
    EXPECT_EQ(advanced, 2);
  }
}