mirror of
https://github.com/google/brotli.git
synced 2024-12-29 03:01:16 +00:00
4c37566f4b
Move utf8 heuristics functions to their own file.
91 lines
2.5 KiB
C++
91 lines
2.5 KiB
C++
// Copyright 2013 Google Inc. All Rights Reserved.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
//
|
|
// Heuristics for deciding about the UTF8-ness of strings.
|
|
|
|
#include "./utf8_util.h"
|
|
|
|
#include "./types.h"
|
|
|
|
namespace brotli {
|
|
|
|
namespace {
|
|
|
|
// Decodes one symbol from the front of |input| and stores it in |*symbol|.
// Returns the number of input bytes consumed, which is always between 1 and 4.
//
// |size| is the number of bytes that may be examined; it must be at least 1
// because input[0] is read unconditionally. It is a size_t so that a caller
// passing a size_t remaining length is not silently narrowed to a negative
// int, which would disable every multi-byte branch below.
//
// A sequence is accepted only when its lead byte and continuation bytes have
// the expected bit patterns, it fits within |size|, and the decoded value is
// too large for any shorter form:
//   1 byte  -> 0x01..0x7f   (a NUL byte is deliberately not accepted)
//   2 bytes -> above 0x7f
//   3 bytes -> above 0x7ff  (no check excludes the range 0xd800..0xdfff)
//   4 bytes -> above 0xffff and at most 0x10ffff
// Anything else consumes exactly one byte and yields 0x110000 | input[0], a
// marker that lies above the UTF-8 code space, so callers can detect failure
// with `symbol >= 0x110000`.
int ParseAsUTF8(int* symbol, const uint8_t* input, size_t size) {
  // ASCII
  if ((input[0] & 0x80) == 0) {
    *symbol = input[0];
    if (*symbol > 0) {
      return 1;
    }
  }
  // 2-byte UTF8
  if (size > 1 &&
      (input[0] & 0xe0) == 0xc0 &&
      (input[1] & 0xc0) == 0x80) {
    *symbol = (((input[0] & 0x1f) << 6) |
               (input[1] & 0x3f));
    // Values up to 0x7f would be overlong encodings of ASCII.
    if (*symbol > 0x7f) {
      return 2;
    }
  }
  // 3-byte UTF8
  if (size > 2 &&
      (input[0] & 0xf0) == 0xe0 &&
      (input[1] & 0xc0) == 0x80 &&
      (input[2] & 0xc0) == 0x80) {
    *symbol = (((input[0] & 0x0f) << 12) |
               ((input[1] & 0x3f) << 6) |
               (input[2] & 0x3f));
    // Values up to 0x7ff fit in two bytes, so they are overlong here.
    if (*symbol > 0x7ff) {
      return 3;
    }
  }
  // 4-byte UTF8
  if (size > 3 &&
      (input[0] & 0xf8) == 0xf0 &&
      (input[1] & 0xc0) == 0x80 &&
      (input[2] & 0xc0) == 0x80 &&
      (input[3] & 0xc0) == 0x80) {
    *symbol = (((input[0] & 0x07) << 18) |
               ((input[1] & 0x3f) << 12) |
               ((input[2] & 0x3f) << 6) |
               (input[3] & 0x3f));
    // Reject overlong forms and values beyond the last code point.
    if (*symbol > 0xffff && *symbol <= 0x10ffff) {
      return 4;
    }
  }
  // Not UTF8, emit a special symbol above the UTF8-code space. This also
  // overwrites any value stored by a branch above that decoded but rejected.
  *symbol = 0x110000 | input[0];
  return 1;
}
|
|
|
|
} // namespace
|
|
|
|
// Returns true if at least min_fraction of the data is UTF8-encoded.
|
|
bool IsMostlyUTF8(const uint8_t* data, const size_t pos, const size_t mask,
|
|
const size_t length, const double min_fraction) {
|
|
size_t size_utf8 = 0;
|
|
size_t i = 0;
|
|
while (i < length) {
|
|
int symbol;
|
|
int bytes_read = ParseAsUTF8(&symbol, &data[(pos + i) & mask], length - i);
|
|
i += bytes_read;
|
|
if (symbol < 0x110000) size_utf8 += bytes_read;
|
|
}
|
|
return size_utf8 > min_fraction * length;
|
|
}
|
|
|
|
} // namespace brotli
|