Improve code point computation

2021-02-13 07:52:39 -08:00 · 2021-02-13 07:52:39 -08:00 · 13b117b5bc
commit 13b117b5bc
parent ee0fed639c
4 changed files with 139 additions and 129 deletions
--- a/include/fmt/format-inl.h
+++ b/include/fmt/format-inl.h
@ -2589,54 +2589,6 @@ int snprintf_float(T value, int precision, float_specs specs,
  }
 }

-// A public domain branchless UTF-8 decoder by Christopher Wellons:
-// https://github.com/skeeto/branchless-utf8
-/* Decode the next character, c, from buf, reporting errors in e.
- *
- * Since this is a branchless decoder, four bytes will be read from the
- * buffer regardless of the actual length of the next character. This
- * means the buffer _must_ have at least three bytes of zero padding
- * following the end of the data stream.
- *
- * Errors are reported in e, which will be non-zero if the parsed
- * character was somehow invalid: invalid byte sequence, non-canonical
- * encoding, or a surrogate half.
- *
- * The function returns a pointer to the next character. When an error
- * occurs, this pointer will be a guess that depends on the particular
- * error, but it will always advance at least one byte.
- */
-inline const char* utf8_decode(const char* buf, uint32_t* c, int* e) {
-  static const int masks[] = {0x00, 0x7f, 0x1f, 0x0f, 0x07};
-  static const uint32_t mins[] = {4194304, 0, 128, 2048, 65536};
-  static const int shiftc[] = {0, 18, 12, 6, 0};
-  static const int shifte[] = {0, 6, 4, 2, 0};
-
-  int len = code_point_length(buf);
-  const char* next = buf + len;
-
-  // Assume a four-byte character and load four bytes. Unused bits are
-  // shifted out.
-  auto s = reinterpret_cast<const unsigned char*>(buf);
-  *c = uint32_t(s[0] & masks[len]) << 18;
-  *c |= uint32_t(s[1] & 0x3f) << 12;
-  *c |= uint32_t(s[2] & 0x3f) << 6;
-  *c |= uint32_t(s[3] & 0x3f) << 0;
-  *c >>= shiftc[len];
-
-  // Accumulate the various error conditions.
-  *e = (*c < mins[len]) << 6;       // non-canonical encoding
-  *e |= ((*c >> 11) == 0x1b) << 7;  // surrogate half?
-  *e |= (*c > 0x10FFFF) << 8;       // out of range?
-  *e |= (s[1] & 0xc0) >> 2;
-  *e |= (s[2] & 0xc0) >> 4;
-  *e |= (s[3]) >> 6;
-  *e ^= 0x2a;  // top two bits of each tail byte correct?
-  *e >>= shifte[len];
-
-  return next;
-}
-
 struct stringifier {
  template <typename T> FMT_INLINE std::string operator()(T value) const {
    return to_string(value);
@ -2678,10 +2630,7 @@ template <> struct formatter<detail::bigint> {
 };

 FMT_FUNC detail::utf8_to_utf16::utf8_to_utf16(string_view s) {
-  auto transcode = [this](const char* p) {
-    auto cp = uint32_t();
-    auto error = 0;
-    p = utf8_decode(p, &cp, &error);
+  for_each_codepoint(s, [this](uint32_t cp, int error) {
    if (error != 0) FMT_THROW(std::runtime_error("invalid utf8"));
    if (cp <= 0xFFFF) {
      buffer_.push_back(static_cast<wchar_t>(cp));
@ -2690,21 +2639,7 @@ FMT_FUNC detail::utf8_to_utf16::utf8_to_utf16(string_view s) {
      buffer_.push_back(static_cast<wchar_t>(0xD800 + (cp >> 10)));
      buffer_.push_back(static_cast<wchar_t>(0xDC00 + (cp & 0x3FF)));
    }
-    return p;
-  };
-  auto p = s.data();
-  const size_t block_size = 4;  // utf8_decode always reads blocks of 4 chars.
-  if (s.size() >= block_size) {
-    for (auto end = p + s.size() - block_size + 1; p < end;) p = transcode(p);
-  }
-  if (auto num_chars_left = s.data() + s.size() - p) {
-    char buf[2 * block_size - 1] = {};
-    memcpy(buf, p, to_unsigned(num_chars_left));
-    p = buf;
-    do {
-      p = transcode(p);
-    } while (p - buf < num_chars_left);
-  }
+  });
  buffer_.push_back(0);
 }

--- a/include/fmt/format.h
+++ b/include/fmt/format.h
@ -539,42 +539,6 @@ class truncating_iterator<OutputIt, std::true_type>
  truncating_iterator& operator*() { return *this; }
 };

-template <typename Char>
-inline size_t count_code_points(basic_string_view<Char> s) {
-  return s.size();
-}
-
-// Counts the number of code points in a UTF-8 string.
-FMT_CONSTEXPR inline size_t count_code_points(basic_string_view<char> s) {
-  const char* data = s.data();
-  size_t num_code_points = 0;
-  for (size_t i = 0, size = s.size(); i != size; ++i) {
-    if ((data[i] & 0xc0) != 0x80) ++num_code_points;
-  }
-  return num_code_points;
-}
-
-inline size_t count_code_points(basic_string_view<char8_type> s) {
-  return count_code_points(basic_string_view<char>(
-      reinterpret_cast<const char*>(s.data()), s.size()));
-}
-
-template <typename Char>
-inline size_t code_point_index(basic_string_view<Char> s, size_t n) {
-  size_t size = s.size();
-  return n < size ? n : size;
-}
-
-// Calculates the index of the nth code point in a UTF-8 string.
-inline size_t code_point_index(basic_string_view<char8_type> s, size_t n) {
-  const char8_type* data = s.data();
-  size_t num_code_points = 0;
-  for (size_t i = 0, size = s.size(); i != size; ++i) {
-    if ((data[i] & 0xc0) != 0x80 && ++num_code_points > n) return i;
-  }
-  return s.size();
-}
-
 // <algorithm> is spectacularly slow to compile in C++20 so use a simple fill_n
 // instead (#1998).
 template <typename OutputIt, typename Size, typename T>
@ -634,6 +598,130 @@ inline counting_iterator copy_str(InputIt begin, InputIt end,
  return it + (end - begin);
 }

+template <typename Char>
+FMT_CONSTEXPR int code_point_length(const Char* begin) {
+  if (const_check(sizeof(Char) != 1)) return 1;
+  constexpr char lengths[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+                              0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0};
+  int len = lengths[static_cast<unsigned char>(*begin) >> 3];
+
+  // Compute the pointer to the next character early so that the next
+  // iteration can start working on the next character. Neither Clang
+  // nor GCC figure out this reordering on their own.
+  return len + !len;
+}
+
+// A public domain branchless UTF-8 decoder by Christopher Wellons:
+// https://github.com/skeeto/branchless-utf8
+/* Decode the next character, c, from s, reporting errors in e.
+ *
+ * Since this is a branchless decoder, four bytes will be read from the
+ * buffer regardless of the actual length of the next character. This
+ * means the buffer _must_ have at least three bytes of zero padding
+ * following the end of the data stream.
+ *
+ * Errors are reported in e, which will be non-zero if the parsed
+ * character was somehow invalid: invalid byte sequence, non-canonical
+ * encoding, or a surrogate half.
+ *
+ * The function returns a pointer to the next character. When an error
+ * occurs, this pointer will be a guess that depends on the particular
+ * error, but it will always advance at least one byte.
+ */
+FMT_CONSTEXPR inline const char* utf8_decode(const char* s, uint32_t* c,
+                                             int* e) {
+  constexpr const int masks[] = {0x00, 0x7f, 0x1f, 0x0f, 0x07};
+  constexpr const uint32_t mins[] = {4194304, 0, 128, 2048, 65536};
+  constexpr const int shiftc[] = {0, 18, 12, 6, 0};
+  constexpr const int shifte[] = {0, 6, 4, 2, 0};
+
+  int len = code_point_length(s);
+  const char* next = s + len;
+
+  // Assume a four-byte character and load four bytes. Unused bits are
+  // shifted out.
+  *c = uint32_t(s[0] & masks[len]) << 18;
+  *c |= uint32_t(s[1] & 0x3f) << 12;
+  *c |= uint32_t(s[2] & 0x3f) << 6;
+  *c |= uint32_t(s[3] & 0x3f) << 0;
+  *c >>= shiftc[len];
+
+  // Accumulate the various error conditions.
+  using uchar = unsigned char;
+  *e = (*c < mins[len]) << 6;       // non-canonical encoding
+  *e |= ((*c >> 11) == 0x1b) << 7;  // surrogate half?
+  *e |= (*c > 0x10FFFF) << 8;       // out of range?
+  *e |= (uchar(s[1]) & 0xc0) >> 2;
+  *e |= (uchar(s[2]) & 0xc0) >> 4;
+  *e |= uchar(s[3]) >> 6;
+  *e ^= 0x2a;  // top two bits of each tail byte correct?
+  *e >>= shifte[len];
+
+  return next;
+}
+
+template <typename F>
+FMT_CONSTEXPR void for_each_codepoint(string_view s, F f) {
+  auto decode = [f](const char* p) {
+    auto cp = uint32_t();
+    auto error = 0;
+    p = utf8_decode(p, &cp, &error);
+    f(cp, error);
+    return p;
+  };
+  auto p = s.data();
+  const size_t block_size = 4;  // utf8_decode always reads blocks of 4 chars.
+  if (s.size() >= block_size) {
+    for (auto end = p + s.size() - block_size + 1; p < end;) p = decode(p);
+  }
+  if (auto num_chars_left = s.data() + s.size() - p) {
+    char buf[2 * block_size - 1] = {};
+    copy_str<char>(p, p + num_chars_left, buf);
+    p = buf;
+    do {
+      p = decode(p);
+    } while (p - buf < num_chars_left);
+  }
+}
+
+template <typename Char>
+inline size_t compute_width(basic_string_view<Char> s) {
+  return s.size();
+}
+
+// Computes approximate display width of a UTF-8 string.
+FMT_CONSTEXPR inline size_t compute_width(string_view s) {
+  size_t num_code_points = 0;
+  // It is not a lambda for compatibility with C++14.
+  struct count_code_points {
+    size_t* count;
+    FMT_CONSTEXPR void operator()(uint32_t, int) const { ++*count; }
+  };
+  for_each_codepoint(s, count_code_points{&num_code_points});
+  return num_code_points;
+}
+
+inline size_t compute_width(basic_string_view<char8_type> s) {
+  return compute_width(basic_string_view<char>(
+      reinterpret_cast<const char*>(s.data()), s.size()));
+}
+
+template <typename Char>
+inline size_t code_point_index(basic_string_view<Char> s, size_t n) {
+  size_t size = s.size();
+  return n < size ? n : size;
+}
+
+// Calculates the index of the nth code point in a UTF-8 string.
+inline size_t code_point_index(basic_string_view<char8_type> s, size_t n) {
+  const char8_type* data = s.data();
+  size_t num_code_points = 0;
+  for (size_t i = 0, size = s.size(); i != size; ++i) {
+    if ((data[i] & 0xc0) != 0x80 && ++num_code_points > n) return i;
+  }
+  return s.size();
+}
+
 template <typename T>
 using is_fast_float = bool_constant<std::numeric_limits<T>::is_iec559 &&
                                    sizeof(T) <= sizeof(double)>;
@ -1674,7 +1762,7 @@ FMT_CONSTEXPR OutputIt write(OutputIt out, basic_string_view<StrChar> s,
  if (specs.precision >= 0 && to_unsigned(specs.precision) < size)
    size = code_point_index(s, to_unsigned(specs.precision));
  auto width = specs.width != 0
-                   ? count_code_points(basic_string_view<StrChar>(data, size))
+                   ? compute_width(basic_string_view<StrChar>(data, size))
                   : 0;
  using iterator = remove_reference_t<decltype(reserve(out, 0))>;
  return write_padded(out, specs, size, width, [=](iterator it) {
@ -2274,9 +2362,8 @@ class arg_formatter_base {

  template <typename Ch>
  void write(const Ch* s, size_t size, const format_specs& specs) {
-    auto width = specs.width != 0
-                     ? count_code_points(basic_string_view<Ch>(s, size))
-                     : 0;
+    auto width =
+        specs.width != 0 ? compute_width(basic_string_view<Ch>(s, size)) : 0;
    out_ = write_padded(out_, specs, size, width, [=](reserve_iterator it) {
      return copy_str<Char>(s, s + size, it);
    });
@ -2879,19 +2966,6 @@ template <typename SpecHandler, typename Char> struct precision_adapter {
  SpecHandler& handler;
 };

-template <typename Char>
-FMT_CONSTEXPR int code_point_length(const Char* begin) {
-  if (const_check(sizeof(Char) != 1)) return 1;
-  constexpr char lengths[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-                              0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0};
-  int len = lengths[static_cast<unsigned char>(*begin) >> 3];
-
-  // Compute the pointer to the next character early so that the next
-  // iteration can start working on the next character. Neither Clang
-  // nor GCC figure out this reordering on their own.
-  return len + !len;
-}
-
 template <typename Char> constexpr bool is_ascii_letter(Char c) {
  return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
 }
@ -3742,7 +3816,8 @@ template <typename T> inline const void* ptr(const std::shared_ptr<T>& p) {
 #if !FMT_MSC_VER
 // MSVC lets function pointers decay to void pointers, so this
 // overload is unnecessary.
-template <typename T, typename... Args> inline const void* ptr(T (*fn)(Args...)) {
+template <typename T, typename... Args>
+inline const void* ptr(T (*fn)(Args...)) {
  return detail::bit_cast<const void*>(fn);
 }
 #endif
--- a/test/format-impl-test.cc
+++ b/test/format-impl-test.cc
@ -408,9 +408,9 @@ TEST(FormatTest, FormatErrorCode) {
  }
 }

-TEST(FormatTest, CountCodePoints) {
+TEST(FormatTest, ComputeWidth) {
  EXPECT_EQ(4,
-            fmt::detail::count_code_points(
+            fmt::detail::compute_width(
                fmt::basic_string_view<fmt::detail::char8_type>(
                    reinterpret_cast<const fmt::detail::char8_type*>("ёжик"))));
 }
--- a/test/format-test.cc
+++ b/test/format-test.cc
@ -1467,8 +1467,7 @@ TEST(FormatterTest, FormatUCharString) {
  EXPECT_EQ("test", format("{0:s}", ptr));
 }

-void function_pointer_test(int, double, std::string) {
-}
+void function_pointer_test(int, double, std::string) {}

 TEST(FormatterTest, FormatPointer) {
  check_unknown_types(reinterpret_cast<void*>(0x1234), "p", "pointer");
@ -1482,7 +1481,8 @@ TEST(FormatterTest, FormatPointer) {
  EXPECT_EQ(format("{}", fmt::ptr(up.get())), format("{}", fmt::ptr(up)));
  std::shared_ptr<int> sp(new int(1));
  EXPECT_EQ(format("{}", fmt::ptr(sp.get())), format("{}", fmt::ptr(sp)));
-  EXPECT_EQ(format("{}", fmt::detail::bit_cast<const void *>(&function_pointer_test)),
+  EXPECT_EQ(
+      format("{}", fmt::detail::bit_cast<const void*>(&function_pointer_test)),
      format("{}", fmt::ptr(function_pointer_test)));
  EXPECT_EQ("0x0", format("{}", nullptr));
 }
@ -2565,7 +2565,7 @@ TEST(FormatTest, FormatUTF8Precision) {
  str_type str(reinterpret_cast<const fmt::detail::char8_type*>(
      u8"caf\u00e9s"));  // cafés
  auto result = fmt::format(format, str);
-  EXPECT_EQ(fmt::detail::count_code_points(result), 4);
+  EXPECT_EQ(fmt::detail::compute_width(result), 4);
  EXPECT_EQ(result.size(), 5);
  EXPECT_EQ(from_u8str(result), from_u8str(str.substr(0, 5)));
 }