A new instance of the utf-8 conversion changelist, this time against
bleeding_edge.

git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@170 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
2008-09-05 13:39:14 +00:00 · 2008-09-05 13:39:14 +00:00 · 1d8c4a9d94
commit 1d8c4a9d94
parent 388c1094b7
5 changed files with 122 additions and 8 deletions
--- a/include/v8.h
+++ b/include/v8.h
@ -26,8 +26,8 @@
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 /** \mainpage V8 API Reference Guide
- *	
- * V8 is Google's open source JavaScript engine. 
+ *
+ * V8 is Google's open source JavaScript engine.
 *
 * This set of documents provides reference material generated from the
 * V8 header file, include/v8.h.
@ -485,7 +485,7 @@ class EXPORT Data {
 /**
 * Pre-compilation data that can be associated with a script.  This
 * data can be calculated for a script in advance of actually
- * compiling it, and can bestored between compilations.  When script 
+ * compiling it, and can be stored between compilations.  When script
 * data is given to the compile method compilation will be faster.
 */
 class EXPORT ScriptData {  // NOLINT
@ -631,7 +631,7 @@ class EXPORT Value : public Data {
   * Returns true if this value is boolean.
   */
  bool IsBoolean();
-  
+
  /**
   * Returns true if this value is a number.
   */
@ -696,8 +696,18 @@ class EXPORT Boolean : public Primitive {
 */
 class EXPORT String : public Primitive {
 public:
+
+  /**
+   * Returns the number of characters in this string.
+   */
  int Length();

+  /**
+   * Returns the number of bytes in the UTF-8 encoded
+   * representation of this string.
+   */
+  int Utf8Length();
+
  /**
   * Write the contents of the string to an external buffer.
   * If no arguments are given, expects the buffer to be large
@ -716,9 +726,8 @@ class EXPORT String : public Primitive {
   * excluding the NULL terminator.
   */
  int Write(uint16_t* buffer, int start = 0, int length = -1);  // UTF-16
-  int WriteAscii(char* buffer,
-                 int start = 0,
-                 int length = -1);  // literally ascii
+  int WriteAscii(char* buffer, int start = 0, int length = -1);  // ASCII
+  int WriteUtf8(char* buffer, int length = -1); // UTF-8

  /**
   * Returns true if the string is external
@ -1755,7 +1764,7 @@ class EXPORT V8 {
  static void IgnoreOutOfMemoryException();

  /**
-   * Check if V8 is dead and therefore unusable.  This is the case after 
+   * Check if V8 is dead and therefore unusable.  This is the case after
   * fatal errors such as out-of-memory situations.
   */
  static bool IsDead();
--- a/src/api.cc
+++ b/src/api.cc
@ -1925,6 +1925,53 @@ int String::Length() {
 }


+// API entry point: reports the size in bytes of this string's UTF-8
+// encoding by delegating to the internal string object.
+int String::Utf8Length() {
+  // IsDeadCheck guards the call; when it reports true we answer 0.
+  if (IsDeadCheck("v8::String::Utf8Length()")) return 0;
+  i::Handle<i::String> str = Utils::OpenHandle(this);
+  return str->Utf8Length();
+}
+
+
+// Encodes this string as UTF-8 into |buffer|.
+//
+// |capacity| is the number of bytes available in |buffer|; -1 means
+// "no limit", in which case the caller must supply a buffer that is
+// large enough for the whole string plus a terminator.
+//
+// A character is never split: if its encoding does not fit in the
+// remaining space, writing stops before it.  A '\0' terminator is
+// appended only when every character was written and there is still
+// room for it.  Returns the number of bytes stored, counting the
+// terminator when one was written.
+int String::WriteUtf8(char* buffer, int capacity) {
+  if (IsDeadCheck("v8::String::WriteUtf8()")) return 0;
+  LOG_API("String::WriteUtf8");
+  i::Handle<i::String> str = Utils::OpenHandle(this);
+  write_input_buffer.Reset(0, *str);
+  const int char_count = str->length();
+  const bool unbounded = (capacity == -1);
+  // While the write offset stays below this limit, even a character
+  // with the longest possible encoding still fits, so no per-character
+  // bounds check is needed.
+  const int unchecked_limit =
+      capacity - (unibrow::Utf8::kMaxEncodedSize - 1);
+  int index = 0;   // Number of characters consumed so far.
+  int offset = 0;  // Number of bytes stored so far.
+
+  // Fast phase: encode straight into the destination.  With no
+  // capacity limit the whole string is handled here.
+  while (index < char_count && (unbounded || offset < unchecked_limit)) {
+    i::uc32 c = write_input_buffer.GetNext();
+    int encoded = unibrow::Utf8::Encode(buffer + offset, c);
+    offset += encoded;
+    index++;
+  }
+
+  // Careful phase: close to the end of the buffer each character is
+  // encoded into a scratch area first and copied over only if all of
+  // its bytes fit.
+  char scratch[unibrow::Utf8::kMaxEncodedSize];
+  while (index < char_count && offset < capacity) {
+    i::uc32 c = write_input_buffer.GetNext();
+    int encoded = unibrow::Utf8::Encode(scratch, c);
+    // We've reached the end of the buffer.
+    if (offset + encoded > capacity) break;
+    for (int k = 0; k < encoded; k++) {
+      buffer[offset + k] = scratch[k];
+    }
+    offset += encoded;
+    index++;
+  }
+
+  // Terminate only a completely written string, and only if a byte is
+  // left for the terminator.
+  const bool wrote_everything = (index == char_count);
+  if (wrote_everything && (unbounded || offset < capacity)) {
+    buffer[offset++] = '\0';
+  }
+  return offset;
+}
+
+
 int String::WriteAscii(char* buffer, int start, int length) {
  if (IsDeadCheck("v8::String::WriteAscii()")) return 0;
  LOG_API("String::WriteAscii");
--- a/src/objects.cc
+++ b/src/objects.cc
@ -2915,6 +2915,22 @@ bool String::LooksValid() {
 }


+// Computes how many bytes the UTF-8 encoding of this string occupies.
+int String::Utf8Length() {
+  // For an ascii string the answer is simply the character count.
+  if (is_ascii()) {
+    return length();
+  }
+  // Flattening is unlikely to speed up this scan, but callers tend to
+  // read the string again right afterwards (WriteUtf8, for example),
+  // so attempting it here is still worthwhile.
+  TryFlatten();
+  Access<StringInputBuffer> input(&string_input_buffer);
+  input->Reset(0, this);
+  // Add up the encoded size of each character in turn.
+  int total_bytes = 0;
+  while (input->has_more()) {
+    total_bytes += unibrow::Utf8::Length(input->GetNext());
+  }
+  return total_bytes;
+}
+
+
 SmartPointer<char> String::ToCString(AllowNullsFlag allow_nulls,
                                     RobustnessFlag robust_flag,
                                     int offset,
--- a/src/objects.h
+++ b/src/objects.h
@ -2842,6 +2842,8 @@ class String: public HeapObject {
      RobustnessFlag robustness_flag = FAST_STRING_TRAVERSAL,
      int* length_output = 0);

+  int Utf8Length();
+
  // Return a 16 bit Unicode representation of the string.
  // The string should be nearly flat, otherwise the performance of
  // of this method may be very bad.  Setting robustness_flag to
--- a/test/cctest/test-strings.cc
+++ b/test/cctest/test-strings.cc
@ -333,3 +333,43 @@ TEST(DeepAscii) {
    TraverseFirst(flat_string, string, DEEP_ASCII_DEPTH);
  }
 }
+
+
+TEST(Utf8Conversion) {
+  // Smoke test for converting strings to utf-8.
+  InitializeVM();
+  v8::HandleScope handle_scope;
+  // A simple ascii string
+  const char* ascii_string = "abcdef12345";
+  int len = v8::String::New(ascii_string, strlen(ascii_string))->Utf8Length();
+  // Compare as int so both operands have the same signedness.
+  CHECK_EQ(static_cast<int>(strlen(ascii_string)), len);
+  // A mixed ascii and non-ascii string
+  // U+02E4 -> CB A4
+  // U+0064 -> 64
+  // U+12E4 -> E1 8B A4
+  // U+0030 -> 30
+  // U+3045 -> E3 81 85
+  const uint16_t mixed_string[] = {0x02E4, 0x0064, 0x12E4, 0x0030, 0x3045};
+  // The characters we expect to be output.  Character literals are used
+  // because integer constants above 0x7F do not fit a signed char, which
+  // makes them narrowing conversions inside a braced initializer.
+  const char as_utf8[11] = {'\xCB', '\xA4', '\x64', '\xE1', '\x8B', '\xA4',
+      '\x30', '\xE3', '\x81', '\x85', '\x00'};
+  // The number of bytes expected to be written for each capacity.  Only
+  // whole characters are written, and the terminator is written (and
+  // counted) only when there is room for it after the last character.
+  const int lengths[12] = {0, 0, 2, 3, 3, 3, 6, 7, 7, 7, 10, 11};
+  // Marker for bytes WriteUtf8 must leave alone.  Keeping it as a char
+  // makes the comparisons below hold whether plain char is signed or
+  // unsigned on the target platform.
+  const char kNoChar = static_cast<char>(-1);
+  v8::Handle<v8::String> mixed = v8::String::New(mixed_string, 5);
+  CHECK_EQ(10, mixed->Utf8Length());
+  // Try encoding the string with all capacities
+  char buffer[11];
+  for (int i = 0; i <= 11; i++) {
+    // Clear the buffer before reusing it
+    for (int j = 0; j < 11; j++)
+      buffer[j] = kNoChar;
+    int written = mixed->WriteUtf8(buffer, i);
+    CHECK_EQ(lengths[i], written);
+    // Check that the contents are correct
+    for (int j = 0; j < lengths[i]; j++)
+      CHECK_EQ(as_utf8[j], buffer[j]);
+    // Check that the rest of the buffer hasn't been touched
+    for (int j = lengths[i]; j < 11; j++)
+      CHECK_EQ(kNoChar, buffer[j]);
+  }
+}