Fix bugs in utf8 decoder (#3056)

Signed-off-by: Vladislav Shchapov <vladislav@shchapov.ru> Signed-off-by: Vladislav Shchapov <vladislav@shchapov.ru>
2024-11-12 22:20:05 +00:00 · 2022-08-27 03:37:15 +05:00 · 2022-08-27 03:37:15 +05:00 · f98048b621
commit f98048b621
parent 4a8e2949bb
3 changed files with 133 additions and 13 deletions
--- a/include/fmt/core.h
+++ b/include/fmt/core.h
@ -2297,12 +2297,16 @@ constexpr auto to_ascii(Char c) -> underlying_t<Char> {
  return c;
 }

+// Maps the first byte of a UTF-8 sequence to the sequence length in bytes.
+// The table is indexed by the top five bits of the byte: 1 for 0x00-0x7f,
+// 2 for 0xc0-0xdf, 3 for 0xe0-0xef and 4 for 0xf0-0xf7. Bytes that cannot
+// start a sequence yield 0: 0x80-0xbf hit the explicit \0 entries, and
+// 0xf8-0xff (index 31) land on the literal's terminating null.
+template <typename Char>
+FMT_CONSTEXPR auto code_point_length_impl(Char begin) -> int {
+  const char* const lengths =
+      "\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\0\0\0\0\0\0\0\0\2\2\2\2\3\3\4";
+  const int top_five_bits = static_cast<unsigned char>(begin) >> 3;
+  return lengths[top_five_bits];
+}
+
 template <typename Char>
 FMT_CONSTEXPR auto code_point_length(const Char* begin) -> int {
  if (const_check(sizeof(Char) != 1)) return 1;
-  auto lengths =
-      "\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\0\0\0\0\0\0\0\0\2\2\2\2\3\3\4";
-  int len = lengths[static_cast<unsigned char>(*begin) >> 3];
+  int len = code_point_length_impl(*begin);

  // Compute the pointer to the next character early so that the next
  // iteration can start working on the next character. Neither Clang
--- a/include/fmt/format.h
+++ b/include/fmt/format.h
@ -602,25 +602,28 @@ FMT_CONSTEXPR FMT_NOINLINE auto copy_str_noinline(InputIt begin, InputIt end,
 */
 FMT_CONSTEXPR inline auto utf8_decode(const char* s, uint32_t* c, int* e)
    -> const char* {
-  constexpr const int prefix_masks[] = {0x00, 0x80, 0xe0, 0xf0, 0xf8};
  constexpr const int masks[] = {0x00, 0x7f, 0x1f, 0x0f, 0x07};
  constexpr const uint32_t mins[] = {4194304, 0, 128, 2048, 65536};
  constexpr const int shiftc[] = {0, 18, 12, 6, 0};
  constexpr const int shifte[] = {0, 6, 4, 2, 0};

-  int len = code_point_length(s);
-  const char* next = s + len;
+  int len = code_point_length_impl(*s);
+  // Compute the pointer to the next character early so that the next
+  // iteration can start working on the next character. Neither Clang
+  // nor GCC figure out this reordering on their own.
+  const char* next = s + len + !len;
+
+  using uchar = unsigned char;

  // Assume a four-byte character and load four bytes. Unused bits are
  // shifted out.
-  *c = uint32_t(s[0] & masks[len]) << 18;
-  *c |= uint32_t(s[1] & 0x3f) << 12;
-  *c |= uint32_t(s[2] & 0x3f) << 6;
-  *c |= uint32_t(s[3] & 0x3f) << 0;
+  *c = uint32_t(uchar(s[0]) & masks[len]) << 18;
+  *c |= uint32_t(uchar(s[1]) & 0x3f) << 12;
+  *c |= uint32_t(uchar(s[2]) & 0x3f) << 6;
+  *c |= uint32_t(uchar(s[3]) & 0x3f) << 0;
  *c >>= shiftc[len];

  // Accumulate the various error conditions.
-  using uchar = unsigned char;
  *e = (*c < mins[len]) << 6;       // non-canonical encoding
  *e |= ((*c >> 11) == 0x1b) << 7;  // surrogate half?
  *e |= (*c > 0x10FFFF) << 8;       // out of range?
@ -629,8 +632,6 @@ FMT_CONSTEXPR inline auto utf8_decode(const char* s, uint32_t* c, int* e)
  *e |= uchar(s[3]) >> 6;
  *e ^= 0x2a;  // top two bits of each tail byte correct?
  *e >>= shifte[len];
-  *e |= ((uchar(s[0]) & prefix_masks[len]) !=
-         uchar((prefix_masks[len] << 1) & 0xFF));  // first byte correct?

  return next;
 }
--- a/test/format-impl-test.cc
+++ b/test/format-impl-test.cc
@ -430,3 +430,118 @@ TEST(format_impl_test, write_console_signature) {
  (void)p;
 }
 #endif
+
+// A public domain branchless UTF-8 decoder by Christopher Wellons:
+// https://github.com/skeeto/branchless-utf8
+// Tells whether `c` lies in the surrogate range U+D800..U+DFFF.
+constexpr bool unicode_is_surrogate(uint32_t c) {
+  // Every value in that range, and no other, has exactly these bits set
+  // above the low eleven.
+  return (c & 0xFFFFF800U) == 0xD800U;
+}
+
+// Writes the UTF-8 encoding of `c` into `s` and returns a pointer one past
+// the last byte written (one to four bytes). The value is not validated:
+// anything that needs more than 16 bits is emitted as a four-byte sequence.
+FMT_CONSTEXPR char* utf8_encode(char* s, uint32_t c) {
+  // Up to 7 bits: a single byte holding the value itself.
+  if (c < (1UL << 7)) {
+    s[0] = static_cast<char>(c);
+    return s + 1;
+  }
+
+  // Up to 11 bits: 110xxxxx 10xxxxxx.
+  if (c < (1UL << 11)) {
+    s[0] = static_cast<char>(0xc0 | (c >> 6));
+    s[1] = static_cast<char>(0x80 | ((c >> 0) & 0x3f));
+    return s + 2;
+  }
+
+  // Up to 16 bits: 1110xxxx 10xxxxxx 10xxxxxx.
+  if (c < (1UL << 16)) {
+    s[0] = static_cast<char>(0xe0 | (c >> 12));
+    s[1] = static_cast<char>(0x80 | ((c >> 6) & 0x3f));
+    s[2] = static_cast<char>(0x80 | ((c >> 0) & 0x3f));
+    return s + 3;
+  }
+
+  // Everything else: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
+  s[0] = static_cast<char>(0xf0 | (c >> 18));
+  s[1] = static_cast<char>(0x80 | ((c >> 12) & 0x3f));
+  s[2] = static_cast<char>(0x80 | ((c >> 6) & 0x3f));
+  s[3] = static_cast<char>(0x80 | ((c >> 0) & 0x3f));
+  return s + 4;
+}
+
+// Make sure it can decode every character: each non-surrogate code point in
+// U+0000..U+10FFFF is encoded, decoded again, and must round-trip unchanged
+// with no error bits set.
+TEST(format_impl_test, utf8_decode_decode_all) {
+  // The upper bound is inclusive so that U+10FFFF itself is exercised.
+  for (uint32_t i = 0; i <= 0x10ffff; i++) {
+    if (!unicode_is_surrogate(i)) {
+      int e;
+      uint32_t c;
+      // Zero-filled and larger than four bytes, since the decoder always
+      // reads s[0]..s[3] regardless of the sequence length.
+      char buf[8] = {0};
+      char* end = utf8_encode(buf, i);
+      const char* res = fmt::detail::utf8_decode(buf, &c, &e);
+      EXPECT_EQ(end, res);  // decoder consumed exactly the encoded bytes
+      EXPECT_EQ(c, i);
+      EXPECT_EQ(e, 0);
+    }
+  }
+}
+
+// Reject everything outside of U+0000..U+10FFFF: each value above the valid
+// range that still fits a four-byte sequence must be flagged as an error,
+// and the decoder must still advance past all four bytes.
+TEST(format_impl_test, utf8_decode_out_of_range) {
+  // 0x1fffff is the largest value a four-byte sequence can carry (3 + 3 * 6
+  // payload bits), so the upper bound is inclusive.
+  for (uint32_t i = 0x110000; i <= 0x1fffff; i++) {
+    int e;
+    uint32_t c;
+    // Zero-filled and large enough for the decoder's unconditional
+    // four-byte read.
+    char buf[8] = {0};
+    utf8_encode(buf, i);
+    const char* end = fmt::detail::utf8_decode(buf, &c, &e);
+    EXPECT_NE(e, 0);
+    EXPECT_EQ(end - buf, 4);
+  }
+}
+
+// Every surrogate half (U+D800..U+DFFF), once encoded, must decode with a
+// non-zero error value.
+TEST(format_impl_test, utf8_decode_surrogate_halves) {
+  for (uint32_t half = 0xd800; half <= 0xdfff; ++half) {
+    char encoded[8] = {0};
+    utf8_encode(encoded, half);
+
+    uint32_t decoded;
+    int error;
+    fmt::detail::utf8_decode(encoded, &decoded, &error);
+    EXPECT_NE(error, 0);
+  }
+}
+
+// Overlong (non-canonical) encodings: the payload bits of each sequence
+// below spell 0x24, padded out to two, three and four bytes. Each must be
+// flagged as an error while the decoder still skips the whole sequence.
+TEST(format_impl_test, utf8_decode_non_canonical_encodings) {
+  uint32_t decoded;
+  int error;
+
+  {
+    char two_bytes[8] = {char(0xc0), char(0xA4)};
+    const char* next = fmt::detail::utf8_decode(two_bytes, &decoded, &error);
+    EXPECT_NE(error, 0);            // rejected
+    EXPECT_EQ(next, two_bytes + 2);  // resumes after the 2-byte sequence
+  }
+
+  {
+    char three_bytes[8] = {char(0xe0), char(0x80), char(0xA4)};
+    const char* next = fmt::detail::utf8_decode(three_bytes, &decoded, &error);
+    EXPECT_NE(error, 0);              // rejected
+    EXPECT_EQ(next, three_bytes + 3);  // resumes after the 3-byte sequence
+  }
+
+  {
+    char four_bytes[8] = {char(0xf0), char(0x80), char(0x80), char(0xA4)};
+    const char* next = fmt::detail::utf8_decode(four_bytes, &decoded, &error);
+    EXPECT_NE(error, 0);             // rejected
+    EXPECT_EQ(next, four_bytes + 4);  // resumes after the 4-byte sequence
+  }
+}
+
+// Malformed input: each case must report an error and the decoder must still
+// advance by the expected number of bytes so that a caller can resynchronize.
+TEST(format_impl_test, utf8_decode_bogus_byte_sequences) {
+  uint32_t decoded;
+  int error;
+
+  {
+    // 0xff can never start a sequence; one byte is skipped.
+    char bytes[4] = {char(0xff)};
+    const auto consumed =
+        fmt::detail::utf8_decode(bytes, &decoded, &error) - bytes;
+    EXPECT_NE(error, 0);
+    EXPECT_EQ(consumed, 1);
+  }
+
+  {
+    // 0x80 is a continuation byte, not a lead byte; one byte is skipped.
+    char bytes[4] = {char(0x80)};
+    const auto consumed =
+        fmt::detail::utf8_decode(bytes, &decoded, &error) - bytes;
+    EXPECT_NE(error, 0);
+    EXPECT_EQ(consumed, 1);
+  }
+
+  {
+    // The lead byte announces two bytes, but 0x0a lacks the 10xxxxxx
+    // continuation prefix; both bytes are skipped.
+    char bytes[4] = {char(0xc0), char(0x0a)};
+    const auto consumed =
+        fmt::detail::utf8_decode(bytes, &decoded, &error) - bytes;
+    EXPECT_NE(error, 0);
+    EXPECT_EQ(consumed, 2);
+  }
+}