// Copyright 2013 Google Inc. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // // Heuristics for deciding about the UTF8-ness of strings. #include "./utf8_util.h" #include "./types.h" namespace brotli { namespace { int ParseAsUTF8(int* symbol, const uint8_t* input, int size) { // ASCII if ((input[0] & 0x80) == 0) { *symbol = input[0]; if (*symbol > 0) { return 1; } } // 2-byte UTF8 if (size > 1 && (input[0] & 0xe0) == 0xc0 && (input[1] & 0xc0) == 0x80) { *symbol = (((input[0] & 0x1f) << 6) | (input[1] & 0x3f)); if (*symbol > 0x7f) { return 2; } } // 3-byte UFT8 if (size > 2 && (input[0] & 0xf0) == 0xe0 && (input[1] & 0xc0) == 0x80 && (input[2] & 0xc0) == 0x80) { *symbol = (((input[0] & 0x0f) << 12) | ((input[1] & 0x3f) << 6) | (input[2] & 0x3f)); if (*symbol > 0x7ff) { return 3; } } // 4-byte UFT8 if (size > 3 && (input[0] & 0xf8) == 0xf0 && (input[1] & 0xc0) == 0x80 && (input[2] & 0xc0) == 0x80 && (input[3] & 0xc0) == 0x80) { *symbol = (((input[0] & 0x07) << 18) | ((input[1] & 0x3f) << 12) | ((input[2] & 0x3f) << 6) | (input[3] & 0x3f)); if (*symbol > 0xffff && *symbol <= 0x10ffff) { return 4; } } // Not UTF8, emit a special symbol above the UTF8-code space *symbol = 0x110000 | input[0]; return 1; } } // namespace // Returns true if at least min_fraction of the data is UTF8-encoded. 
// |data| is addressed as a ring buffer: the sequence at logical offset i
// starts at data[(pos + i) & mask], and |length| bytes are examined. A
// sequence counts towards the UTF8 total only when ParseAsUTF8 reports a
// symbol below 0x110000. The final comparison is strict, so the result is
// false for an empty input.
// NOTE(review): each multi-byte sequence is read contiguously from its start
// address, so the buffer presumably has readable slack past the wrap point --
// confirm against callers.
bool IsMostlyUTF8(const uint8_t* data, const size_t pos, const size_t mask,
                  const size_t length, const double min_fraction) {
  // ParseAsUTF8 takes its size as an int and only needs to know whether up to
  // four bytes are available. Cap the remaining count before converting:
  // passing |length - i| directly would be narrowed for inputs larger than
  // INT_MAX and make valid multi-byte sequences look truncated.
  const size_t kMaxSequenceBytes = 4;

  size_t size_utf8 = 0;
  size_t i = 0;
  while (i < length) {
    const size_t remaining = length - i;
    const int available = static_cast<int>(
        remaining < kMaxSequenceBytes ? remaining : kMaxSequenceBytes);
    int symbol;
    const int bytes_read =
        ParseAsUTF8(&symbol, &data[(pos + i) & mask], available);
    // bytes_read is always in [1, 4], so the loop makes progress.
    i += bytes_read;
    if (symbol < 0x110000) size_utf8 += bytes_read;
  }
  return size_utf8 > min_fraction * length;
}

}  // namespace brotli