Improve code point computation
This commit is contained in:
parent
ee0fed639c
commit
13b117b5bc
@ -2589,54 +2589,6 @@ int snprintf_float(T value, int precision, float_specs specs,
|
||||
}
|
||||
}
|
||||
|
||||
// A public domain branchless UTF-8 decoder by Christopher Wellons:
|
||||
// https://github.com/skeeto/branchless-utf8
|
||||
/* Decode the next character, c, from buf, reporting errors in e.
|
||||
*
|
||||
* Since this is a branchless decoder, four bytes will be read from the
|
||||
* buffer regardless of the actual length of the next character. This
|
||||
* means the buffer _must_ have at least three bytes of zero padding
|
||||
* following the end of the data stream.
|
||||
*
|
||||
* Errors are reported in e, which will be non-zero if the parsed
|
||||
* character was somehow invalid: invalid byte sequence, non-canonical
|
||||
* encoding, or a surrogate half.
|
||||
*
|
||||
* The function returns a pointer to the next character. When an error
|
||||
* occurs, this pointer will be a guess that depends on the particular
|
||||
* error, but it will always advance at least one byte.
|
||||
*/
|
||||
inline const char* utf8_decode(const char* buf, uint32_t* c, int* e) {
|
||||
static const int masks[] = {0x00, 0x7f, 0x1f, 0x0f, 0x07};
|
||||
static const uint32_t mins[] = {4194304, 0, 128, 2048, 65536};
|
||||
static const int shiftc[] = {0, 18, 12, 6, 0};
|
||||
static const int shifte[] = {0, 6, 4, 2, 0};
|
||||
|
||||
int len = code_point_length(buf);
|
||||
const char* next = buf + len;
|
||||
|
||||
// Assume a four-byte character and load four bytes. Unused bits are
|
||||
// shifted out.
|
||||
auto s = reinterpret_cast<const unsigned char*>(buf);
|
||||
*c = uint32_t(s[0] & masks[len]) << 18;
|
||||
*c |= uint32_t(s[1] & 0x3f) << 12;
|
||||
*c |= uint32_t(s[2] & 0x3f) << 6;
|
||||
*c |= uint32_t(s[3] & 0x3f) << 0;
|
||||
*c >>= shiftc[len];
|
||||
|
||||
// Accumulate the various error conditions.
|
||||
*e = (*c < mins[len]) << 6; // non-canonical encoding
|
||||
*e |= ((*c >> 11) == 0x1b) << 7; // surrogate half?
|
||||
*e |= (*c > 0x10FFFF) << 8; // out of range?
|
||||
*e |= (s[1] & 0xc0) >> 2;
|
||||
*e |= (s[2] & 0xc0) >> 4;
|
||||
*e |= (s[3]) >> 6;
|
||||
*e ^= 0x2a; // top two bits of each tail byte correct?
|
||||
*e >>= shifte[len];
|
||||
|
||||
return next;
|
||||
}
|
||||
|
||||
struct stringifier {
|
||||
template <typename T> FMT_INLINE std::string operator()(T value) const {
|
||||
return to_string(value);
|
||||
@ -2678,10 +2630,7 @@ template <> struct formatter<detail::bigint> {
|
||||
};
|
||||
|
||||
FMT_FUNC detail::utf8_to_utf16::utf8_to_utf16(string_view s) {
|
||||
auto transcode = [this](const char* p) {
|
||||
auto cp = uint32_t();
|
||||
auto error = 0;
|
||||
p = utf8_decode(p, &cp, &error);
|
||||
for_each_codepoint(s, [this](uint32_t cp, int error) {
|
||||
if (error != 0) FMT_THROW(std::runtime_error("invalid utf8"));
|
||||
if (cp <= 0xFFFF) {
|
||||
buffer_.push_back(static_cast<wchar_t>(cp));
|
||||
@ -2690,21 +2639,7 @@ FMT_FUNC detail::utf8_to_utf16::utf8_to_utf16(string_view s) {
|
||||
buffer_.push_back(static_cast<wchar_t>(0xD800 + (cp >> 10)));
|
||||
buffer_.push_back(static_cast<wchar_t>(0xDC00 + (cp & 0x3FF)));
|
||||
}
|
||||
return p;
|
||||
};
|
||||
auto p = s.data();
|
||||
const size_t block_size = 4; // utf8_decode always reads blocks of 4 chars.
|
||||
if (s.size() >= block_size) {
|
||||
for (auto end = p + s.size() - block_size + 1; p < end;) p = transcode(p);
|
||||
}
|
||||
if (auto num_chars_left = s.data() + s.size() - p) {
|
||||
char buf[2 * block_size - 1] = {};
|
||||
memcpy(buf, p, to_unsigned(num_chars_left));
|
||||
p = buf;
|
||||
do {
|
||||
p = transcode(p);
|
||||
} while (p - buf < num_chars_left);
|
||||
}
|
||||
});
|
||||
buffer_.push_back(0);
|
||||
}
|
||||
|
||||
|
@ -539,42 +539,6 @@ class truncating_iterator<OutputIt, std::true_type>
|
||||
truncating_iterator& operator*() { return *this; }
|
||||
};
|
||||
|
||||
template <typename Char>
|
||||
inline size_t count_code_points(basic_string_view<Char> s) {
|
||||
return s.size();
|
||||
}
|
||||
|
||||
// Counts the number of code points in a UTF-8 string.
|
||||
FMT_CONSTEXPR inline size_t count_code_points(basic_string_view<char> s) {
|
||||
const char* data = s.data();
|
||||
size_t num_code_points = 0;
|
||||
for (size_t i = 0, size = s.size(); i != size; ++i) {
|
||||
if ((data[i] & 0xc0) != 0x80) ++num_code_points;
|
||||
}
|
||||
return num_code_points;
|
||||
}
|
||||
|
||||
inline size_t count_code_points(basic_string_view<char8_type> s) {
|
||||
return count_code_points(basic_string_view<char>(
|
||||
reinterpret_cast<const char*>(s.data()), s.size()));
|
||||
}
|
||||
|
||||
template <typename Char>
|
||||
inline size_t code_point_index(basic_string_view<Char> s, size_t n) {
|
||||
size_t size = s.size();
|
||||
return n < size ? n : size;
|
||||
}
|
||||
|
||||
// Calculates the index of the nth code point in a UTF-8 string.
|
||||
inline size_t code_point_index(basic_string_view<char8_type> s, size_t n) {
|
||||
const char8_type* data = s.data();
|
||||
size_t num_code_points = 0;
|
||||
for (size_t i = 0, size = s.size(); i != size; ++i) {
|
||||
if ((data[i] & 0xc0) != 0x80 && ++num_code_points > n) return i;
|
||||
}
|
||||
return s.size();
|
||||
}
|
||||
|
||||
// <algorithm> is spectacularly slow to compile in C++20 so use a simple fill_n
|
||||
// instead (#1998).
|
||||
template <typename OutputIt, typename Size, typename T>
|
||||
@ -634,6 +598,130 @@ inline counting_iterator copy_str(InputIt begin, InputIt end,
|
||||
return it + (end - begin);
|
||||
}
|
||||
|
||||
template <typename Char>
|
||||
FMT_CONSTEXPR int code_point_length(const Char* begin) {
|
||||
if (const_check(sizeof(Char) != 1)) return 1;
|
||||
constexpr char lengths[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0};
|
||||
int len = lengths[static_cast<unsigned char>(*begin) >> 3];
|
||||
|
||||
// Compute the pointer to the next character early so that the next
|
||||
// iteration can start working on the next character. Neither Clang
|
||||
// nor GCC figure out this reordering on their own.
|
||||
return len + !len;
|
||||
}
|
||||
|
||||
// A public domain branchless UTF-8 decoder by Christopher Wellons:
|
||||
// https://github.com/skeeto/branchless-utf8
|
||||
/* Decode the next character, c, from s, reporting errors in e.
|
||||
*
|
||||
* Since this is a branchless decoder, four bytes will be read from the
|
||||
* buffer regardless of the actual length of the next character. This
|
||||
* means the buffer _must_ have at least three bytes of zero padding
|
||||
* following the end of the data stream.
|
||||
*
|
||||
* Errors are reported in e, which will be non-zero if the parsed
|
||||
* character was somehow invalid: invalid byte sequence, non-canonical
|
||||
* encoding, or a surrogate half.
|
||||
*
|
||||
* The function returns a pointer to the next character. When an error
|
||||
* occurs, this pointer will be a guess that depends on the particular
|
||||
* error, but it will always advance at least one byte.
|
||||
*/
|
||||
FMT_CONSTEXPR inline const char* utf8_decode(const char* s, uint32_t* c,
|
||||
int* e) {
|
||||
constexpr const int masks[] = {0x00, 0x7f, 0x1f, 0x0f, 0x07};
|
||||
constexpr const uint32_t mins[] = {4194304, 0, 128, 2048, 65536};
|
||||
constexpr const int shiftc[] = {0, 18, 12, 6, 0};
|
||||
constexpr const int shifte[] = {0, 6, 4, 2, 0};
|
||||
|
||||
int len = code_point_length(s);
|
||||
const char* next = s + len;
|
||||
|
||||
// Assume a four-byte character and load four bytes. Unused bits are
|
||||
// shifted out.
|
||||
*c = uint32_t(s[0] & masks[len]) << 18;
|
||||
*c |= uint32_t(s[1] & 0x3f) << 12;
|
||||
*c |= uint32_t(s[2] & 0x3f) << 6;
|
||||
*c |= uint32_t(s[3] & 0x3f) << 0;
|
||||
*c >>= shiftc[len];
|
||||
|
||||
// Accumulate the various error conditions.
|
||||
using uchar = unsigned char;
|
||||
*e = (*c < mins[len]) << 6; // non-canonical encoding
|
||||
*e |= ((*c >> 11) == 0x1b) << 7; // surrogate half?
|
||||
*e |= (*c > 0x10FFFF) << 8; // out of range?
|
||||
*e |= (uchar(s[1]) & 0xc0) >> 2;
|
||||
*e |= (uchar(s[2]) & 0xc0) >> 4;
|
||||
*e |= uchar(s[3]) >> 6;
|
||||
*e ^= 0x2a; // top two bits of each tail byte correct?
|
||||
*e >>= shifte[len];
|
||||
|
||||
return next;
|
||||
}
|
||||
|
||||
template <typename F>
|
||||
FMT_CONSTEXPR void for_each_codepoint(string_view s, F f) {
|
||||
auto decode = [f](const char* p) {
|
||||
auto cp = uint32_t();
|
||||
auto error = 0;
|
||||
p = utf8_decode(p, &cp, &error);
|
||||
f(cp, error);
|
||||
return p;
|
||||
};
|
||||
auto p = s.data();
|
||||
const size_t block_size = 4; // utf8_decode always reads blocks of 4 chars.
|
||||
if (s.size() >= block_size) {
|
||||
for (auto end = p + s.size() - block_size + 1; p < end;) p = decode(p);
|
||||
}
|
||||
if (auto num_chars_left = s.data() + s.size() - p) {
|
||||
char buf[2 * block_size - 1] = {};
|
||||
copy_str<char>(p, p + num_chars_left, buf);
|
||||
p = buf;
|
||||
do {
|
||||
p = decode(p);
|
||||
} while (p - buf < num_chars_left);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Char>
|
||||
inline size_t compute_width(basic_string_view<Char> s) {
|
||||
return s.size();
|
||||
}
|
||||
|
||||
// Computes approximate display width of a UTF-8 string.
|
||||
FMT_CONSTEXPR inline size_t compute_width(string_view s) {
|
||||
size_t num_code_points = 0;
|
||||
// It is not a lambda for compatibility with C++14.
|
||||
struct count_code_points {
|
||||
size_t* count;
|
||||
FMT_CONSTEXPR void operator()(uint32_t, int) const { ++*count; }
|
||||
};
|
||||
for_each_codepoint(s, count_code_points{&num_code_points});
|
||||
return num_code_points;
|
||||
}
|
||||
|
||||
inline size_t compute_width(basic_string_view<char8_type> s) {
|
||||
return compute_width(basic_string_view<char>(
|
||||
reinterpret_cast<const char*>(s.data()), s.size()));
|
||||
}
|
||||
|
||||
template <typename Char>
|
||||
inline size_t code_point_index(basic_string_view<Char> s, size_t n) {
|
||||
size_t size = s.size();
|
||||
return n < size ? n : size;
|
||||
}
|
||||
|
||||
// Calculates the index of the nth code point in a UTF-8 string.
|
||||
inline size_t code_point_index(basic_string_view<char8_type> s, size_t n) {
|
||||
const char8_type* data = s.data();
|
||||
size_t num_code_points = 0;
|
||||
for (size_t i = 0, size = s.size(); i != size; ++i) {
|
||||
if ((data[i] & 0xc0) != 0x80 && ++num_code_points > n) return i;
|
||||
}
|
||||
return s.size();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
using is_fast_float = bool_constant<std::numeric_limits<T>::is_iec559 &&
|
||||
sizeof(T) <= sizeof(double)>;
|
||||
@ -1674,7 +1762,7 @@ FMT_CONSTEXPR OutputIt write(OutputIt out, basic_string_view<StrChar> s,
|
||||
if (specs.precision >= 0 && to_unsigned(specs.precision) < size)
|
||||
size = code_point_index(s, to_unsigned(specs.precision));
|
||||
auto width = specs.width != 0
|
||||
? count_code_points(basic_string_view<StrChar>(data, size))
|
||||
? compute_width(basic_string_view<StrChar>(data, size))
|
||||
: 0;
|
||||
using iterator = remove_reference_t<decltype(reserve(out, 0))>;
|
||||
return write_padded(out, specs, size, width, [=](iterator it) {
|
||||
@ -2274,9 +2362,8 @@ class arg_formatter_base {
|
||||
|
||||
template <typename Ch>
|
||||
void write(const Ch* s, size_t size, const format_specs& specs) {
|
||||
auto width = specs.width != 0
|
||||
? count_code_points(basic_string_view<Ch>(s, size))
|
||||
: 0;
|
||||
auto width =
|
||||
specs.width != 0 ? compute_width(basic_string_view<Ch>(s, size)) : 0;
|
||||
out_ = write_padded(out_, specs, size, width, [=](reserve_iterator it) {
|
||||
return copy_str<Char>(s, s + size, it);
|
||||
});
|
||||
@ -2879,19 +2966,6 @@ template <typename SpecHandler, typename Char> struct precision_adapter {
|
||||
SpecHandler& handler;
|
||||
};
|
||||
|
||||
template <typename Char>
|
||||
FMT_CONSTEXPR int code_point_length(const Char* begin) {
|
||||
if (const_check(sizeof(Char) != 1)) return 1;
|
||||
constexpr char lengths[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0};
|
||||
int len = lengths[static_cast<unsigned char>(*begin) >> 3];
|
||||
|
||||
// Compute the pointer to the next character early so that the next
|
||||
// iteration can start working on the next character. Neither Clang
|
||||
// nor GCC figure out this reordering on their own.
|
||||
return len + !len;
|
||||
}
|
||||
|
||||
template <typename Char> constexpr bool is_ascii_letter(Char c) {
|
||||
return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
|
||||
}
|
||||
@ -3742,7 +3816,8 @@ template <typename T> inline const void* ptr(const std::shared_ptr<T>& p) {
|
||||
#if !FMT_MSC_VER
|
||||
// MSVC lets function pointers decay to void pointers, so this
|
||||
// overload is unnecessary.
|
||||
template <typename T, typename... Args> inline const void* ptr(T (*fn)(Args...)) {
|
||||
template <typename T, typename... Args>
|
||||
inline const void* ptr(T (*fn)(Args...)) {
|
||||
return detail::bit_cast<const void*>(fn);
|
||||
}
|
||||
#endif
|
||||
|
@ -408,9 +408,9 @@ TEST(FormatTest, FormatErrorCode) {
|
||||
}
|
||||
}
|
||||
|
||||
TEST(FormatTest, CountCodePoints) {
|
||||
TEST(FormatTest, ComputeWidth) {
|
||||
EXPECT_EQ(4,
|
||||
fmt::detail::count_code_points(
|
||||
fmt::detail::compute_width(
|
||||
fmt::basic_string_view<fmt::detail::char8_type>(
|
||||
reinterpret_cast<const fmt::detail::char8_type*>("ёжик"))));
|
||||
}
|
||||
|
@ -1467,8 +1467,7 @@ TEST(FormatterTest, FormatUCharString) {
|
||||
EXPECT_EQ("test", format("{0:s}", ptr));
|
||||
}
|
||||
|
||||
void function_pointer_test(int, double, std::string) {
|
||||
}
|
||||
void function_pointer_test(int, double, std::string) {}
|
||||
|
||||
TEST(FormatterTest, FormatPointer) {
|
||||
check_unknown_types(reinterpret_cast<void*>(0x1234), "p", "pointer");
|
||||
@ -1482,7 +1481,8 @@ TEST(FormatterTest, FormatPointer) {
|
||||
EXPECT_EQ(format("{}", fmt::ptr(up.get())), format("{}", fmt::ptr(up)));
|
||||
std::shared_ptr<int> sp(new int(1));
|
||||
EXPECT_EQ(format("{}", fmt::ptr(sp.get())), format("{}", fmt::ptr(sp)));
|
||||
EXPECT_EQ(format("{}", fmt::detail::bit_cast<const void *>(&function_pointer_test)),
|
||||
EXPECT_EQ(
|
||||
format("{}", fmt::detail::bit_cast<const void*>(&function_pointer_test)),
|
||||
format("{}", fmt::ptr(function_pointer_test)));
|
||||
EXPECT_EQ("0x0", format("{}", nullptr));
|
||||
}
|
||||
@ -2565,7 +2565,7 @@ TEST(FormatTest, FormatUTF8Precision) {
|
||||
str_type str(reinterpret_cast<const fmt::detail::char8_type*>(
|
||||
u8"caf\u00e9s")); // cafés
|
||||
auto result = fmt::format(format, str);
|
||||
EXPECT_EQ(fmt::detail::count_code_points(result), 4);
|
||||
EXPECT_EQ(fmt::detail::compute_width(result), 4);
|
||||
EXPECT_EQ(result.size(), 5);
|
||||
EXPECT_EQ(from_u8str(result), from_u8str(str.substr(0, 5)));
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user