2015-11-27 10:27:11 +00:00
|
|
|
/* Copyright 2013 Google Inc. All Rights Reserved.
|
|
|
|
|
2015-12-11 10:11:51 +00:00
|
|
|
Distributed under MIT license.
|
2015-11-27 10:27:11 +00:00
|
|
|
See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
|
|
|
|
*/
|
|
|
|
|
2016-06-03 09:19:23 +00:00
|
|
|
/* Heuristics for deciding about the UTF8-ness of strings. */
|
2015-10-01 13:10:42 +00:00
|
|
|
|
|
|
|
#include "./utf8_util.h"
|
|
|
|
|
2016-06-03 08:51:04 +00:00
|
|
|
#include "../common/types.h"
|
2015-10-01 13:10:42 +00:00
|
|
|
|
2016-06-13 09:01:04 +00:00
|
|
|
#if defined(__cplusplus) || defined(c_plusplus)
|
|
|
|
extern "C" {
|
|
|
|
#endif
|
2015-10-01 13:10:42 +00:00
|
|
|
|
2016-06-13 09:01:04 +00:00
|
|
|
/* Decodes one symbol from the start of |input|, of which |size| bytes may be
   inspected; input[0] is always read.

   On a well-formed, shortest-form UTF8 sequence of 1 to 4 bytes the decoded
   code point is stored in |*symbol| and the sequence length is returned.
   Otherwise |*symbol| is set to 0x110000 | input[0], a value above the UTF8
   code space, and 1 is returned so that the caller can step over the byte. */
static size_t BrotliParseAsUTF8(
    int* symbol, const uint8_t* input, size_t size) {
  const int lead = input[0];
  int code;

  if (lead < 0x80) {
    /* 1-byte UTF8 (ASCII). A zero byte is not accepted here and ends up in
       the fallback at the bottom. */
    if (lead != 0) {
      *symbol = lead;
      return 1;
    }
  } else if ((lead & 0xe0) == 0xc0) {
    /* 2-byte UTF8: 110xxxxx 10xxxxxx */
    if (size >= 2 && (input[1] & 0xc0) == 0x80) {
      code = ((lead & 0x1f) << 6) | (input[1] & 0x3f);
      /* Values that fit in one byte are overlong encodings. */
      if (code >= 0x80) {
        *symbol = code;
        return 2;
      }
    }
  } else if ((lead & 0xf0) == 0xe0) {
    /* 3-byte UTF8: 1110xxxx 10xxxxxx 10xxxxxx */
    if (size >= 3 &&
        (input[1] & 0xc0) == 0x80 &&
        (input[2] & 0xc0) == 0x80) {
      code = ((lead & 0x0f) << 12) |
             ((input[1] & 0x3f) << 6) |
             (input[2] & 0x3f);
      /* Values that fit in two bytes are overlong encodings. */
      if (code >= 0x800) {
        *symbol = code;
        return 3;
      }
    }
  } else if ((lead & 0xf8) == 0xf0) {
    /* 4-byte UTF8: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
    if (size >= 4 &&
        (input[1] & 0xc0) == 0x80 &&
        (input[2] & 0xc0) == 0x80 &&
        (input[3] & 0xc0) == 0x80) {
      code = ((lead & 0x07) << 18) |
             ((input[1] & 0x3f) << 12) |
             ((input[2] & 0x3f) << 6) |
             (input[3] & 0x3f);
      /* Must need four bytes and must not exceed the last code point. */
      if (code >= 0x10000 && code < 0x110000) {
        *symbol = code;
        return 4;
      }
    }
  }

  /* Not UTF8: emit a special symbol above the UTF8 code space. */
  *symbol = 0x110000 | lead;
  return 1;
}
|
|
|
|
|
2016-06-03 09:19:23 +00:00
|
|
|
/* Returns 1 if more than |min_fraction| of the |length| bytes that start at
   position |pos| of |data| belong to well-formed UTF8 sequences, and 0
   otherwise (the comparison is strict, so an empty range yields 0 for any
   non-negative |min_fraction|).

   |data| is addressed as a ring: every byte index is reduced with |mask|
   before it is used, including the continuation bytes of a sequence that
   starts just before the wrap point. */
int BrotliIsMostlyUTF8(const uint8_t* data, const size_t pos,
    const size_t mask, const size_t length, const double min_fraction) {
  size_t size_utf8 = 0;
  size_t i = 0;
  while (i < length) {
    const size_t start = (pos + i) & mask;
    /* Number of bytes that follow data[start] before the index wraps. */
    const size_t after_start = mask - start;
    const size_t remaining = length - i;
    /* The parser never inspects more than 4 bytes of its input. */
    const size_t window = (remaining < 4) ? remaining : 4;
    const uint8_t* input = &data[start];
    uint8_t straddle[4];
    int symbol;
    size_t bytes_read;
    if (after_start < window - 1) {
      /* The window crosses the wrap point. Reading it linearly from
         &data[start] would run past data[mask], so gather the bytes through
         masked indices into a small contiguous buffer instead. */
      size_t k;
      for (k = 0; k < window; ++k) {
        straddle[k] = data[(start + k) & mask];
      }
      input = straddle;
    }
    bytes_read = BrotliParseAsUTF8(&symbol, input, window);
    i += bytes_read;
    /* Symbols at or above 0x110000 mark bytes that did not parse as UTF8. */
    if (symbol < 0x110000) size_utf8 += bytes_read;
  }
  return (size_utf8 > min_fraction * (double)length) ? 1 : 0;
}
|
|
|
|
|
2016-06-13 09:01:04 +00:00
|
|
|
#if defined(__cplusplus) || defined(c_plusplus)
|
|
|
|
} /* extern "C" */
|
|
|
|
#endif
|