From 1d8c4a9d94b920f3754825836eec48c3f6af3a17 Mon Sep 17 00:00:00 2001
From: "christian.plesner.hansen@gmail.com"
Date: Fri, 5 Sep 2008 13:39:14 +0000
Subject: [PATCH] A new instance of the utf-8 conversion changelist, this time against bleeding_edge.

git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@170 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
---
 include/v8.h                | 25 +++++++++++++-------
 src/api.cc                  | 47 +++++++++++++++++++++++++++++++++++++
 src/objects.cc              | 16 +++++++++++++
 src/objects.h               |  2 ++
 test/cctest/test-strings.cc | 40 +++++++++++++++++++++++++++++++
 5 files changed, 122 insertions(+), 8 deletions(-)

diff --git a/include/v8.h b/include/v8.h
index ddd423e0e4..683c55a52f 100644
--- a/include/v8.h
+++ b/include/v8.h
@@ -26,8 +26,8 @@
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 /** \mainpage V8 API Reference Guide
- * 
- * V8 is Google's open source JavaScript engine. 
+ *
+ * V8 is Google's open source JavaScript engine.
  *
  * This set of documents provides reference material generated from the
  * V8 header file, include/v8.h.
@@ -485,7 +485,7 @@ class EXPORT Data {
 /**
  * Pre-compilation data that can be associated with a script. This
  * data can be calculated for a script in advance of actually
- * compiling it, and can bestored between compilations. When script
+ * compiling it, and can be stored between compilations. When script
  * data is given to the compile method compilation will be faster.
  */
 class EXPORT ScriptData {  // NOLINT
@@ -631,7 +631,7 @@ class EXPORT Value : public Data {
    * Returns true if this value is boolean.
    */
   bool IsBoolean();
-  
+
   /**
    * Returns true if this value is a number.
    */
@@ -696,8 +696,18 @@ class EXPORT Boolean : public Primitive {
  */
 class EXPORT String : public Primitive {
  public:
+
+  /**
+   * Returns the number of characters in this string.
+   */
   int Length();
 
+  /**
+   * Returns the number of bytes in the UTF-8 encoded
+   * representation of this string.
+   */
+  int Utf8Length();
+
   /**
    * Write the contents of the string to an external buffer.
    * If no arguments are given, expects the buffer to be large
@@ -716,9 +726,8 @@ class EXPORT String : public Primitive {
    * excluding the NULL terminator.
    */
   int Write(uint16_t* buffer, int start = 0, int length = -1);  // UTF-16
-  int WriteAscii(char* buffer,
-                 int start = 0,
-                 int length = -1);  // literally ascii
+  int WriteAscii(char* buffer, int start = 0, int length = -1);  // ASCII
+  int WriteUtf8(char* buffer, int length = -1); // UTF-8
 
   /**
    * Returns true if the string is external
@@ -1755,7 +1764,7 @@ class EXPORT V8 {
   static void IgnoreOutOfMemoryException();
 
   /**
-   * Check if V8 is dead and therefore unusable. This is the case after 
+   * Check if V8 is dead and therefore unusable. This is the case after
    * fatal errors such as out-of-memory situations.
    */
   static bool IsDead();
diff --git a/src/api.cc b/src/api.cc
index 38d10cec33..ac11277d00 100644
--- a/src/api.cc
+++ b/src/api.cc
@@ -1925,6 +1925,53 @@ int String::Length() {
 }
 
 
+int String::Utf8Length() {
+  if (IsDeadCheck("v8::String::Utf8Length()")) return 0;
+  return Utils::OpenHandle(this)->Utf8Length();
+}
+
+
+int String::WriteUtf8(char* buffer, int capacity) {
+  if (IsDeadCheck("v8::String::WriteUtf8()")) return 0;
+  LOG_API("String::WriteUtf8");
+  i::Handle<i::String> str = Utils::OpenHandle(this);
+  write_input_buffer.Reset(0, *str);
+  int len = str->length();
+  // Encode straight into the buffer while at least kMaxEncodedSize
+  // bytes remain, since any character is then guaranteed to fit. If
+  // no capacity is given (-1) all characters are encoded here.
+  int fast_end = capacity - (unibrow::Utf8::kMaxEncodedSize - 1);
+  int i;
+  int pos = 0;
+  for (i = 0; i < len && (capacity == -1 || pos < fast_end); i++) {
+    i::uc32 c = write_input_buffer.GetNext();
+    int written = unibrow::Utf8::Encode(buffer + pos, c);
+    pos += written;
+  }
+  if (i < len) {
+    // For the last characters we need to check the length for each one
+    // because they may be longer than the remaining space in the
+    // buffer.
+    char intermediate[unibrow::Utf8::kMaxEncodedSize];
+    for (; i < len && pos < capacity; i++) {
+      i::uc32 c = write_input_buffer.GetNext();
+      int written = unibrow::Utf8::Encode(intermediate, c);
+      if (pos + written <= capacity) {
+        for (int j = 0; j < written; j++)
+          buffer[pos + j] = intermediate[j];
+        pos += written;
+      } else {
+        // We've reached the end of the buffer
+        break;
+      }
+    }
+  }
+  if (i == len && (capacity == -1 || pos < capacity))
+    buffer[pos++] = '\0';
+  return pos;
+}
+
+
 int String::WriteAscii(char* buffer, int start, int length) {
   if (IsDeadCheck("v8::String::WriteAscii()")) return 0;
   LOG_API("String::WriteAscii");
diff --git a/src/objects.cc b/src/objects.cc
index 5927b86ec7..f0cefb3ee9 100644
--- a/src/objects.cc
+++ b/src/objects.cc
@@ -2915,6 +2915,22 @@ bool String::LooksValid() {
 }
 
 
+int String::Utf8Length() {
+  if (is_ascii()) return length();
+  // Attempt to flatten before accessing the string. It probably
+  // doesn't make Utf8Length faster, but it is very likely that
+  // the string will be accessed later (for example by WriteUtf8)
+  // so it's still a good idea.
+  TryFlatten();
+  Access<StringInputBuffer> buffer(&string_input_buffer);
+  buffer->Reset(0, this);
+  int result = 0;
+  while (buffer->has_more())
+    result += unibrow::Utf8::Length(buffer->GetNext());
+  return result;
+}
+
+
 SmartPointer<char> String::ToCString(AllowNullsFlag allow_nulls,
                                      RobustnessFlag robust_flag,
                                      int offset,
diff --git a/src/objects.h b/src/objects.h
index ce0678ff0d..ba32260e80 100644
--- a/src/objects.h
+++ b/src/objects.h
@@ -2842,6 +2842,8 @@ class String: public HeapObject {
       RobustnessFlag robustness_flag = FAST_STRING_TRAVERSAL,
       int* length_output = 0);
 
+  int Utf8Length();
+
   // Return a 16 bit Unicode representation of the string.
   // The string should be nearly flat, otherwise the performance of
   // of this method may be very bad. Setting robustness_flag to
diff --git a/test/cctest/test-strings.cc b/test/cctest/test-strings.cc
index 46c0536d57..f41e3e12ca 100644
--- a/test/cctest/test-strings.cc
+++ b/test/cctest/test-strings.cc
@@ -333,3 +333,43 @@ TEST(DeepAscii) {
     TraverseFirst(flat_string, string, DEEP_ASCII_DEPTH);
   }
 }
+
+
+TEST(Utf8Conversion) {
+  // Smoke test for converting strings to utf-8.
+  InitializeVM();
+  v8::HandleScope handle_scope;
+  // A simple ascii string
+  const char* ascii_string = "abcdef12345";
+  int len = v8::String::New(ascii_string, strlen(ascii_string))->Utf8Length();
+  CHECK_EQ(strlen(ascii_string), len);
+  // A mixed ascii and non-ascii string
+  // U+02E4 -> CB A4
+  // U+0064 -> 64
+  // U+12E4 -> E1 8B A4
+  // U+0030 -> 30
+  // U+3045 -> E3 81 85
+  const uint16_t mixed_string[] = {0x02E4, 0x0064, 0x12E4, 0x0030, 0x3045};
+  // The characters we expect to be output
+  const char as_utf8[11] = {'\xCB', '\xA4', '\x64', '\xE1', '\x8B', '\xA4',
+                            '\x30', '\xE3', '\x81', '\x85', '\x00'};
+  // The number of bytes expected to be written for each length
+  const int lengths[12] = {0, 0, 2, 3, 3, 3, 6, 7, 7, 7, 10, 11};
+  v8::Handle<v8::String> mixed = v8::String::New(mixed_string, 5);
+  CHECK_EQ(10, mixed->Utf8Length());
+  // Try encoding the string with all capacities
+  char buffer[11];
+  for (int i = 0; i <= 11; i++) {
+    // Clear the buffer before reusing it
+    for (int j = 0; j < 11; j++)
+      buffer[j] = -1;
+    int written = mixed->WriteUtf8(buffer, i);
+    CHECK_EQ(lengths[i], written);
+    // Check that the contents are correct
+    for (int j = 0; j < lengths[i]; j++)
+      CHECK_EQ(as_utf8[j], buffer[j]);
+    // Check that the rest of the buffer hasn't been touched
+    for (int j = lengths[i]; j < 11; j++)
+      CHECK_EQ(-1, buffer[j]);
+  }
+}