Improve code point computation

This commit is contained in:
Victor Zverovich 2021-02-13 07:52:39 -08:00
parent ee0fed639c
commit 13b117b5bc
4 changed files with 139 additions and 129 deletions

View File

@ -2589,54 +2589,6 @@ int snprintf_float(T value, int precision, float_specs specs,
}
}
// A public domain branchless UTF-8 decoder by Christopher Wellons:
// https://github.com/skeeto/branchless-utf8
/* Decode the next character, c, from buf, reporting errors in e.
*
* Since this is a branchless decoder, four bytes will be read from the
* buffer regardless of the actual length of the next character. This
* means the buffer _must_ have at least three bytes of zero padding
* following the end of the data stream.
*
* Errors are reported in e, which will be non-zero if the parsed
* character was somehow invalid: invalid byte sequence, non-canonical
* encoding, or a surrogate half.
*
* The function returns a pointer to the next character. When an error
* occurs, this pointer will be a guess that depends on the particular
* error, but it will always advance at least one byte.
*/
inline const char* utf8_decode(const char* buf, uint32_t* c, int* e) {
static const int masks[] = {0x00, 0x7f, 0x1f, 0x0f, 0x07};
static const uint32_t mins[] = {4194304, 0, 128, 2048, 65536};
static const int shiftc[] = {0, 18, 12, 6, 0};
static const int shifte[] = {0, 6, 4, 2, 0};
int len = code_point_length(buf);
const char* next = buf + len;
// Assume a four-byte character and load four bytes. Unused bits are
// shifted out.
auto s = reinterpret_cast<const unsigned char*>(buf);
*c = uint32_t(s[0] & masks[len]) << 18;
*c |= uint32_t(s[1] & 0x3f) << 12;
*c |= uint32_t(s[2] & 0x3f) << 6;
*c |= uint32_t(s[3] & 0x3f) << 0;
*c >>= shiftc[len];
// Accumulate the various error conditions.
*e = (*c < mins[len]) << 6; // non-canonical encoding
*e |= ((*c >> 11) == 0x1b) << 7; // surrogate half?
*e |= (*c > 0x10FFFF) << 8; // out of range?
*e |= (s[1] & 0xc0) >> 2;
*e |= (s[2] & 0xc0) >> 4;
*e |= (s[3]) >> 6;
*e ^= 0x2a; // top two bits of each tail byte correct?
*e >>= shifte[len];
return next;
}
struct stringifier {
template <typename T> FMT_INLINE std::string operator()(T value) const {
return to_string(value);
@ -2678,10 +2630,7 @@ template <> struct formatter<detail::bigint> {
};
FMT_FUNC detail::utf8_to_utf16::utf8_to_utf16(string_view s) {
auto transcode = [this](const char* p) {
auto cp = uint32_t();
auto error = 0;
p = utf8_decode(p, &cp, &error);
for_each_codepoint(s, [this](uint32_t cp, int error) {
if (error != 0) FMT_THROW(std::runtime_error("invalid utf8"));
if (cp <= 0xFFFF) {
buffer_.push_back(static_cast<wchar_t>(cp));
@ -2690,21 +2639,7 @@ FMT_FUNC detail::utf8_to_utf16::utf8_to_utf16(string_view s) {
buffer_.push_back(static_cast<wchar_t>(0xD800 + (cp >> 10)));
buffer_.push_back(static_cast<wchar_t>(0xDC00 + (cp & 0x3FF)));
}
return p;
};
auto p = s.data();
const size_t block_size = 4; // utf8_decode always reads blocks of 4 chars.
if (s.size() >= block_size) {
for (auto end = p + s.size() - block_size + 1; p < end;) p = transcode(p);
}
if (auto num_chars_left = s.data() + s.size() - p) {
char buf[2 * block_size - 1] = {};
memcpy(buf, p, to_unsigned(num_chars_left));
p = buf;
do {
p = transcode(p);
} while (p - buf < num_chars_left);
}
});
buffer_.push_back(0);
}

View File

@ -539,42 +539,6 @@ class truncating_iterator<OutputIt, std::true_type>
truncating_iterator& operator*() { return *this; }
};
template <typename Char>
inline size_t count_code_points(basic_string_view<Char> s) {
return s.size();
}
// Counts the number of code points in a UTF-8 string.
FMT_CONSTEXPR inline size_t count_code_points(basic_string_view<char> s) {
const char* data = s.data();
size_t num_code_points = 0;
for (size_t i = 0, size = s.size(); i != size; ++i) {
if ((data[i] & 0xc0) != 0x80) ++num_code_points;
}
return num_code_points;
}
inline size_t count_code_points(basic_string_view<char8_type> s) {
return count_code_points(basic_string_view<char>(
reinterpret_cast<const char*>(s.data()), s.size()));
}
template <typename Char>
inline size_t code_point_index(basic_string_view<Char> s, size_t n) {
size_t size = s.size();
return n < size ? n : size;
}
// Calculates the index of the nth code point in a UTF-8 string.
inline size_t code_point_index(basic_string_view<char8_type> s, size_t n) {
const char8_type* data = s.data();
size_t num_code_points = 0;
for (size_t i = 0, size = s.size(); i != size; ++i) {
if ((data[i] & 0xc0) != 0x80 && ++num_code_points > n) return i;
}
return s.size();
}
// <algorithm> is spectacularly slow to compile in C++20 so use a simple fill_n
// instead (#1998).
template <typename OutputIt, typename Size, typename T>
@ -634,6 +598,130 @@ inline counting_iterator copy_str(InputIt begin, InputIt end,
return it + (end - begin);
}
template <typename Char>
FMT_CONSTEXPR int code_point_length(const Char* begin) {
if (const_check(sizeof(Char) != 1)) return 1;
constexpr char lengths[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0};
int len = lengths[static_cast<unsigned char>(*begin) >> 3];
// Compute the pointer to the next character early so that the next
// iteration can start working on the next character. Neither Clang
// nor GCC figure out this reordering on their own.
return len + !len;
}
// A public domain branchless UTF-8 decoder by Christopher Wellons:
// https://github.com/skeeto/branchless-utf8
/* Decode the next character, c, from s, reporting errors in e.
*
* Since this is a branchless decoder, four bytes will be read from the
* buffer regardless of the actual length of the next character. This
* means the buffer _must_ have at least three bytes of zero padding
* following the end of the data stream.
*
* Errors are reported in e, which will be non-zero if the parsed
* character was somehow invalid: invalid byte sequence, non-canonical
* encoding, or a surrogate half.
*
* The function returns a pointer to the next character. When an error
* occurs, this pointer will be a guess that depends on the particular
* error, but it will always advance at least one byte.
*/
FMT_CONSTEXPR inline const char* utf8_decode(const char* s, uint32_t* c,
int* e) {
constexpr const int masks[] = {0x00, 0x7f, 0x1f, 0x0f, 0x07};
constexpr const uint32_t mins[] = {4194304, 0, 128, 2048, 65536};
constexpr const int shiftc[] = {0, 18, 12, 6, 0};
constexpr const int shifte[] = {0, 6, 4, 2, 0};
int len = code_point_length(s);
const char* next = s + len;
// Assume a four-byte character and load four bytes. Unused bits are
// shifted out.
*c = uint32_t(s[0] & masks[len]) << 18;
*c |= uint32_t(s[1] & 0x3f) << 12;
*c |= uint32_t(s[2] & 0x3f) << 6;
*c |= uint32_t(s[3] & 0x3f) << 0;
*c >>= shiftc[len];
// Accumulate the various error conditions.
using uchar = unsigned char;
*e = (*c < mins[len]) << 6; // non-canonical encoding
*e |= ((*c >> 11) == 0x1b) << 7; // surrogate half?
*e |= (*c > 0x10FFFF) << 8; // out of range?
*e |= (uchar(s[1]) & 0xc0) >> 2;
*e |= (uchar(s[2]) & 0xc0) >> 4;
*e |= uchar(s[3]) >> 6;
*e ^= 0x2a; // top two bits of each tail byte correct?
*e >>= shifte[len];
return next;
}
template <typename F>
FMT_CONSTEXPR void for_each_codepoint(string_view s, F f) {
auto decode = [f](const char* p) {
auto cp = uint32_t();
auto error = 0;
p = utf8_decode(p, &cp, &error);
f(cp, error);
return p;
};
auto p = s.data();
const size_t block_size = 4; // utf8_decode always reads blocks of 4 chars.
if (s.size() >= block_size) {
for (auto end = p + s.size() - block_size + 1; p < end;) p = decode(p);
}
if (auto num_chars_left = s.data() + s.size() - p) {
char buf[2 * block_size - 1] = {};
copy_str<char>(p, p + num_chars_left, buf);
p = buf;
do {
p = decode(p);
} while (p - buf < num_chars_left);
}
}
template <typename Char>
inline size_t compute_width(basic_string_view<Char> s) {
return s.size();
}
// Computes approximate display width of a UTF-8 string.
FMT_CONSTEXPR inline size_t compute_width(string_view s) {
size_t num_code_points = 0;
// It is not a lambda for compatibility with C++14.
struct count_code_points {
size_t* count;
FMT_CONSTEXPR void operator()(uint32_t, int) const { ++*count; }
};
for_each_codepoint(s, count_code_points{&num_code_points});
return num_code_points;
}
inline size_t compute_width(basic_string_view<char8_type> s) {
return compute_width(basic_string_view<char>(
reinterpret_cast<const char*>(s.data()), s.size()));
}
template <typename Char>
inline size_t code_point_index(basic_string_view<Char> s, size_t n) {
size_t size = s.size();
return n < size ? n : size;
}
// Calculates the index of the nth code point in a UTF-8 string.
inline size_t code_point_index(basic_string_view<char8_type> s, size_t n) {
const char8_type* data = s.data();
size_t num_code_points = 0;
for (size_t i = 0, size = s.size(); i != size; ++i) {
if ((data[i] & 0xc0) != 0x80 && ++num_code_points > n) return i;
}
return s.size();
}
template <typename T>
using is_fast_float = bool_constant<std::numeric_limits<T>::is_iec559 &&
sizeof(T) <= sizeof(double)>;
@ -1674,7 +1762,7 @@ FMT_CONSTEXPR OutputIt write(OutputIt out, basic_string_view<StrChar> s,
if (specs.precision >= 0 && to_unsigned(specs.precision) < size)
size = code_point_index(s, to_unsigned(specs.precision));
auto width = specs.width != 0
? count_code_points(basic_string_view<StrChar>(data, size))
? compute_width(basic_string_view<StrChar>(data, size))
: 0;
using iterator = remove_reference_t<decltype(reserve(out, 0))>;
return write_padded(out, specs, size, width, [=](iterator it) {
@ -2274,9 +2362,8 @@ class arg_formatter_base {
template <typename Ch>
void write(const Ch* s, size_t size, const format_specs& specs) {
auto width = specs.width != 0
? count_code_points(basic_string_view<Ch>(s, size))
: 0;
auto width =
specs.width != 0 ? compute_width(basic_string_view<Ch>(s, size)) : 0;
out_ = write_padded(out_, specs, size, width, [=](reserve_iterator it) {
return copy_str<Char>(s, s + size, it);
});
@ -2879,19 +2966,6 @@ template <typename SpecHandler, typename Char> struct precision_adapter {
SpecHandler& handler;
};
template <typename Char>
FMT_CONSTEXPR int code_point_length(const Char* begin) {
if (const_check(sizeof(Char) != 1)) return 1;
constexpr char lengths[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0};
int len = lengths[static_cast<unsigned char>(*begin) >> 3];
// Compute the pointer to the next character early so that the next
// iteration can start working on the next character. Neither Clang
// nor GCC figure out this reordering on their own.
return len + !len;
}
template <typename Char> constexpr bool is_ascii_letter(Char c) {
return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
}
@ -3742,7 +3816,8 @@ template <typename T> inline const void* ptr(const std::shared_ptr<T>& p) {
#if !FMT_MSC_VER
// MSVC lets function pointers decay to void pointers, so this
// overload is unnecessary.
template <typename T, typename... Args> inline const void* ptr(T (*fn)(Args...)) {
template <typename T, typename... Args>
inline const void* ptr(T (*fn)(Args...)) {
return detail::bit_cast<const void*>(fn);
}
#endif

View File

@ -408,9 +408,9 @@ TEST(FormatTest, FormatErrorCode) {
}
}
TEST(FormatTest, CountCodePoints) {
TEST(FormatTest, ComputeWidth) {
EXPECT_EQ(4,
fmt::detail::count_code_points(
fmt::detail::compute_width(
fmt::basic_string_view<fmt::detail::char8_type>(
reinterpret_cast<const fmt::detail::char8_type*>("ёжик"))));
}

View File

@ -1467,8 +1467,7 @@ TEST(FormatterTest, FormatUCharString) {
EXPECT_EQ("test", format("{0:s}", ptr));
}
void function_pointer_test(int, double, std::string) {
}
void function_pointer_test(int, double, std::string) {}
TEST(FormatterTest, FormatPointer) {
check_unknown_types(reinterpret_cast<void*>(0x1234), "p", "pointer");
@ -1482,7 +1481,8 @@ TEST(FormatterTest, FormatPointer) {
EXPECT_EQ(format("{}", fmt::ptr(up.get())), format("{}", fmt::ptr(up)));
std::shared_ptr<int> sp(new int(1));
EXPECT_EQ(format("{}", fmt::ptr(sp.get())), format("{}", fmt::ptr(sp)));
EXPECT_EQ(format("{}", fmt::detail::bit_cast<const void *>(&function_pointer_test)),
EXPECT_EQ(
format("{}", fmt::detail::bit_cast<const void*>(&function_pointer_test)),
format("{}", fmt::ptr(function_pointer_test)));
EXPECT_EQ("0x0", format("{}", nullptr));
}
@ -2565,7 +2565,7 @@ TEST(FormatTest, FormatUTF8Precision) {
str_type str(reinterpret_cast<const fmt::detail::char8_type*>(
u8"caf\u00e9s")); // cafés
auto result = fmt::format(format, str);
EXPECT_EQ(fmt::detail::count_code_points(result), 4);
EXPECT_EQ(fmt::detail::compute_width(result), 4);
EXPECT_EQ(result.size(), 5);
EXPECT_EQ(from_u8str(result), from_u8str(str.substr(0, 5)));
}