2019-07-30 13:58:45 +00:00
|
|
|
// Copyright 2019 Google LLC.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be found in the LICENSE file.
|
|
|
|
|
|
|
|
#include "experimental/editor/word_boundaries.h"
|
|
|
|
|
|
|
|
#include <unicode/brkiter.h>
|
|
|
|
#include <unicode/unistr.h>
|
|
|
|
|
|
|
|
#include <memory>
|
|
|
|
|
2019-08-08 20:16:24 +00:00
|
|
|
std::vector<bool> GetUtf8WordBoundaries(const char* begin, size_t byteCount, const char* locale) {
|
2019-07-30 13:58:45 +00:00
|
|
|
static constexpr UBreakIteratorType kIteratorType = UBRK_WORD;
|
|
|
|
struct UTextCloser {
|
|
|
|
void operator()(UText* p) { (void)utext_close(p); }
|
|
|
|
};
|
|
|
|
struct UBreakCloser {
|
|
|
|
void operator()(UBreakIterator* p) { (void)ubrk_close(p); }
|
|
|
|
};
|
|
|
|
|
|
|
|
std::vector<bool> result;
|
2019-08-08 20:16:24 +00:00
|
|
|
if (0 == byteCount) {
|
2019-07-30 13:58:45 +00:00
|
|
|
return result;
|
|
|
|
}
|
|
|
|
result.resize(byteCount);
|
|
|
|
|
|
|
|
UText utf8UText = UTEXT_INITIALIZER;
|
|
|
|
UErrorCode errorCode = U_ZERO_ERROR;
|
|
|
|
(void)utext_openUTF8(&utf8UText, begin, byteCount, &errorCode);
|
|
|
|
std::unique_ptr<UText, UTextCloser> autoclose1(&utf8UText);
|
|
|
|
if (U_FAILURE(errorCode)) {
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
UBreakIterator* iter = ubrk_open(kIteratorType, locale, nullptr, 0, &errorCode);
|
|
|
|
std::unique_ptr<UBreakIterator, UBreakCloser> autoclose2(iter);
|
|
|
|
if (U_FAILURE(errorCode)) {
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
ubrk_setUText(iter, &utf8UText, &errorCode);
|
|
|
|
if (U_FAILURE(errorCode)) {
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
int pos = ubrk_first(iter);
|
|
|
|
while (pos != icu::BreakIterator::DONE) {
|
|
|
|
if ((unsigned)pos < (unsigned)byteCount) {
|
|
|
|
result[pos] = true;
|
|
|
|
}
|
|
|
|
pos = ubrk_next(iter);
|
|
|
|
}
|
|
|
|
return result;
|
|
|
|
}
|