mirror of
https://github.com/fmtlib/fmt.git
synced 2024-11-12 22:20:05 +00:00
Fix bugs in utf8 decoder (#3056)
Signed-off-by: Vladislav Shchapov <vladislav@shchapov.ru> Signed-off-by: Vladislav Shchapov <vladislav@shchapov.ru>
This commit is contained in:
parent
4a8e2949bb
commit
f98048b621
@ -2297,12 +2297,16 @@ constexpr auto to_ascii(Char c) -> underlying_t<Char> {
|
||||
return c;
|
||||
}
|
||||
|
||||
template <typename Char>
|
||||
FMT_CONSTEXPR auto code_point_length_impl(Char begin) -> int {
|
||||
return "\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\0\0\0\0\0\0\0\0\2\2\2\2\3\3\4"
|
||||
[static_cast<unsigned char>(begin) >> 3];
|
||||
}
|
||||
|
||||
template <typename Char>
|
||||
FMT_CONSTEXPR auto code_point_length(const Char* begin) -> int {
|
||||
if (const_check(sizeof(Char) != 1)) return 1;
|
||||
auto lengths =
|
||||
"\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\0\0\0\0\0\0\0\0\2\2\2\2\3\3\4";
|
||||
int len = lengths[static_cast<unsigned char>(*begin) >> 3];
|
||||
int len = code_point_length_impl(*begin);
|
||||
|
||||
// Compute the pointer to the next character early so that the next
|
||||
// iteration can start working on the next character. Neither Clang
|
||||
|
@ -602,25 +602,28 @@ FMT_CONSTEXPR FMT_NOINLINE auto copy_str_noinline(InputIt begin, InputIt end,
|
||||
*/
|
||||
FMT_CONSTEXPR inline auto utf8_decode(const char* s, uint32_t* c, int* e)
|
||||
-> const char* {
|
||||
constexpr const int prefix_masks[] = {0x00, 0x80, 0xe0, 0xf0, 0xf8};
|
||||
constexpr const int masks[] = {0x00, 0x7f, 0x1f, 0x0f, 0x07};
|
||||
constexpr const uint32_t mins[] = {4194304, 0, 128, 2048, 65536};
|
||||
constexpr const int shiftc[] = {0, 18, 12, 6, 0};
|
||||
constexpr const int shifte[] = {0, 6, 4, 2, 0};
|
||||
|
||||
int len = code_point_length(s);
|
||||
const char* next = s + len;
|
||||
int len = code_point_length_impl(*s);
|
||||
// Compute the pointer to the next character early so that the next
|
||||
// iteration can start working on the next character. Neither Clang
|
||||
// nor GCC figure out this reordering on their own.
|
||||
const char* next = s + len + !len;
|
||||
|
||||
using uchar = unsigned char;
|
||||
|
||||
// Assume a four-byte character and load four bytes. Unused bits are
|
||||
// shifted out.
|
||||
*c = uint32_t(s[0] & masks[len]) << 18;
|
||||
*c |= uint32_t(s[1] & 0x3f) << 12;
|
||||
*c |= uint32_t(s[2] & 0x3f) << 6;
|
||||
*c |= uint32_t(s[3] & 0x3f) << 0;
|
||||
*c = uint32_t(uchar(s[0]) & masks[len]) << 18;
|
||||
*c |= uint32_t(uchar(s[1]) & 0x3f) << 12;
|
||||
*c |= uint32_t(uchar(s[2]) & 0x3f) << 6;
|
||||
*c |= uint32_t(uchar(s[3]) & 0x3f) << 0;
|
||||
*c >>= shiftc[len];
|
||||
|
||||
// Accumulate the various error conditions.
|
||||
using uchar = unsigned char;
|
||||
*e = (*c < mins[len]) << 6; // non-canonical encoding
|
||||
*e |= ((*c >> 11) == 0x1b) << 7; // surrogate half?
|
||||
*e |= (*c > 0x10FFFF) << 8; // out of range?
|
||||
@ -629,8 +632,6 @@ FMT_CONSTEXPR inline auto utf8_decode(const char* s, uint32_t* c, int* e)
|
||||
*e |= uchar(s[3]) >> 6;
|
||||
*e ^= 0x2a; // top two bits of each tail byte correct?
|
||||
*e >>= shifte[len];
|
||||
*e |= ((uchar(s[0]) & prefix_masks[len]) !=
|
||||
uchar((prefix_masks[len] << 1) & 0xFF)); // first byte correct?
|
||||
|
||||
return next;
|
||||
}
|
||||
|
@ -430,3 +430,118 @@ TEST(format_impl_test, write_console_signature) {
|
||||
(void)p;
|
||||
}
|
||||
#endif
|
||||
|
||||
// A public domain branchless UTF-8 decoder by Christopher Wellons:
|
||||
// https://github.com/skeeto/branchless-utf8
|
||||
constexpr bool unicode_is_surrogate(uint32_t c) {
|
||||
return c >= 0xD800U && c <= 0xDFFFU;
|
||||
}
|
||||
|
||||
FMT_CONSTEXPR char* utf8_encode(char* s, uint32_t c) {
|
||||
if (c >= (1UL << 16)) {
|
||||
s[0] = static_cast<char>(0xf0 | (c >> 18));
|
||||
s[1] = static_cast<char>(0x80 | ((c >> 12) & 0x3f));
|
||||
s[2] = static_cast<char>(0x80 | ((c >> 6) & 0x3f));
|
||||
s[3] = static_cast<char>(0x80 | ((c >> 0) & 0x3f));
|
||||
return s + 4;
|
||||
} else if (c >= (1UL << 11)) {
|
||||
s[0] = static_cast<char>(0xe0 | (c >> 12));
|
||||
s[1] = static_cast<char>(0x80 | ((c >> 6) & 0x3f));
|
||||
s[2] = static_cast<char>(0x80 | ((c >> 0) & 0x3f));
|
||||
return s + 3;
|
||||
} else if (c >= (1UL << 7)) {
|
||||
s[0] = static_cast<char>(0xc0 | (c >> 6));
|
||||
s[1] = static_cast<char>(0x80 | ((c >> 0) & 0x3f));
|
||||
return s + 2;
|
||||
} else {
|
||||
s[0] = static_cast<char>(c);
|
||||
return s + 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Make sure it can decode every character
|
||||
TEST(format_impl_test, utf8_decode_decode_all) {
|
||||
for (uint32_t i = 0; i < 0x10ffff; i++) {
|
||||
if (!unicode_is_surrogate(i)) {
|
||||
int e;
|
||||
uint32_t c;
|
||||
char buf[8] = {0};
|
||||
char* end = utf8_encode(buf, i);
|
||||
const char* res = fmt::detail::utf8_decode(buf, &c, &e);
|
||||
EXPECT_EQ(end, res);
|
||||
EXPECT_EQ(c, i);
|
||||
EXPECT_EQ(e, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Reject everything outside of U+0000..U+10FFFF
|
||||
TEST(format_impl_test, utf8_decode_out_of_range) {
|
||||
for (uint32_t i = 0x110000; i < 0x1fffff; i++) {
|
||||
int e;
|
||||
uint32_t c;
|
||||
char buf[8] = {0};
|
||||
utf8_encode(buf, i);
|
||||
const char* end = fmt::detail::utf8_decode(buf, &c, &e);
|
||||
EXPECT_NE(e, 0);
|
||||
EXPECT_EQ(end - buf, 4);
|
||||
}
|
||||
}
|
||||
|
||||
// Does it reject all surrogate halves?
|
||||
TEST(format_impl_test, utf8_decode_surrogate_halves) {
|
||||
for (uint32_t i = 0xd800; i <= 0xdfff; i++) {
|
||||
int e;
|
||||
uint32_t c;
|
||||
char buf[8] = {0};
|
||||
utf8_encode(buf, i);
|
||||
fmt::detail::utf8_decode(buf, &c, &e);
|
||||
EXPECT_NE(e, 0);
|
||||
}
|
||||
}
|
||||
|
||||
// How about non-canonical encodings?
|
||||
TEST(format_impl_test, utf8_decode_non_canonical_encodings) {
|
||||
int e;
|
||||
uint32_t c;
|
||||
const char* end;
|
||||
|
||||
char buf2[8] = {char(0xc0), char(0xA4)};
|
||||
end = fmt::detail::utf8_decode(buf2, &c, &e);
|
||||
EXPECT_NE(e, 0); // non-canonical len 2
|
||||
EXPECT_EQ(end, buf2 + 2); // non-canonical recover 2
|
||||
|
||||
char buf3[8] = {char(0xe0), char(0x80), char(0xA4)};
|
||||
end = fmt::detail::utf8_decode(buf3, &c, &e);
|
||||
EXPECT_NE(e, 0); // non-canonical len 3
|
||||
EXPECT_EQ(end, buf3 + 3); // non-canonical recover 3
|
||||
|
||||
char buf4[8] = {char(0xf0), char(0x80), char(0x80), char(0xA4)};
|
||||
end = fmt::detail::utf8_decode(buf4, &c, &e);
|
||||
EXPECT_NE(e, 0); // non-canonical encoding len 4
|
||||
EXPECT_EQ(end, buf4 + 4); // non-canonical recover 4
|
||||
}
|
||||
|
||||
// Let's try some bogus byte sequences
|
||||
TEST(format_impl_test, utf8_decode_bogus_byte_sequences) {
|
||||
int e;
|
||||
uint32_t c;
|
||||
|
||||
// Invalid first byte
|
||||
char buf0[4] = {char(0xff)};
|
||||
auto len = fmt::detail::utf8_decode(buf0, &c, &e) - buf0;
|
||||
EXPECT_NE(e, 0); // "bogus [ff] 0x%02x U+%04lx", e, (unsigned long)c);
|
||||
EXPECT_EQ(len, 1); // "bogus [ff] recovery %d", len);
|
||||
|
||||
// Invalid first byte
|
||||
char buf1[4] = {char(0x80)};
|
||||
len = fmt::detail::utf8_decode(buf1, &c, &e) - buf1;
|
||||
EXPECT_NE(e, 0); // "bogus [80] 0x%02x U+%04lx", e, (unsigned long)c);
|
||||
EXPECT_EQ(len, 1); // "bogus [80] recovery %d", len);
|
||||
|
||||
// Looks like a two-byte sequence but second byte is wrong
|
||||
char buf2[4] = {char(0xc0), char(0x0a)};
|
||||
len = fmt::detail::utf8_decode(buf2, &c, &e) - buf2;
|
||||
EXPECT_NE(e, 0); // "bogus [c0 0a] 0x%02x U+%04lx", e, (unsigned long)c
|
||||
EXPECT_EQ(len, 2); // "bogus [c0 0a] recovery %d", len);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user