v8/test/intl/segmenter/segment-word.js
Frank Tang 64da43ef36 Reland "[intl] Clean up intl_segmenter flag"
This is a reland of c9c3ec4c14


Original change's description:
> [intl] Clean up intl_segmenter flag
>
> Intl.Segmenter shipped in m87 and launched.
>
> Bug: v8:11225
> Change-Id: I4213e261e1aea717c1281f19785a8c29ff1bbd8b
> Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2570461
> Commit-Queue: Frank Tang <ftang@chromium.org>
> Reviewed-by: Shu-yu Guo <syg@chromium.org>
> Cr-Commit-Position: refs/heads/master@{#71653}

Bug: v8:11225, v8:11240
Change-Id: Ibded9038671862d90206d328f8a12db51c40e63c
Cq-Include-Trybots: luci.v8.try:v8_linux64_gc_stress_custom_snapshot_dbg_ng,v8_linux_arm64_gc_stress_dbg_ng,v8_linux_gc_stress_dbg_ng,v8_mac64_gc_stress_dbg_ng
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2579043
Commit-Queue: Frank Tang <ftang@chromium.org>
Reviewed-by: Shu-yu Guo <syg@chromium.org>
Cr-Commit-Position: refs/heads/master@{#71691}
2020-12-10 10:25:48 +00:00

43 lines
2.7 KiB
JavaScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// Copyright 2018 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
const seg = new Intl.Segmenter([], {granularity: "word"})
for (const text of [
"Hello world!", // English
" Hello world! ", // English with space before/after
" Hello world? Foo bar!", // English
"Jedovatou mambu objevila žena v zahrádkářské kolonii.", // Czech
"Việt Nam: Nhất thể hóa sẽ khác Trung Quốc?", // Vietnamese
"Σοβαρές ενστάσεις Κομισιόν για τον προϋπολογισμό της Ιταλίας", // Greek
"Решение Индии о покупке российских С-400 расценили как вызов США", // Russian
"הרופא שהציל נשים והנערה ששועבדה ע", // Hebrew,
"ترامب للملك سلمان: أنا جاد للغاية.. عليك دفع المزيد", // Arabic
"भारत की एस 400 मिसाइल के मुकाबले पाक की थाड, जानें कौन कितना ताकतवर", // Hindi
"ரெட் அலர்ட் எச்சரிக்கை; புதுச்சேரியில் நாளை அரசு விடுமுறை!", // Tamil
"'ఉత్తర్వులు అందే వరకు ఓటర్ల తుది జాబితాను వెబ్‌సైట్లో పెట్టవద్దు'", // Telugu
"台北》抹黑柯P失敗朱學恒酸姚文智氣pupu嗆大老闆", // Chinese
"วัดไทรตีระฆังเบาลงช่วงเข้าพรรษา เจ้าอาวาสเผยคนร้องเรียนรับผลกรรมแล้ว", // Thai
"九州北部の一部が暴風域に入りました(日直予報士 2018年10月06日) - 日本気象協会 tenki.jp", // Japanese
"법원 “다스 지분 처분권·수익권 모두 MB가 보유”", // Korean
]) {
const segments = seg.segment(text);
let results = [];
var pos = -1;
for (let s of segments) {
assertEquals(["segment", "index", "input", "isWordLike"], Object.keys(s));
assertEquals(typeof s.isWordLike, "boolean");
assertEquals(typeof s.index, "number");
assertEquals(typeof s.segment, "string");
assertEquals(typeof s.input, "string");
assertEquals(text, s.input);
assertEquals(text.substring(s.index, s.index + s.segment.length),
s.segment);
assertTrue(pos < s.index);
pos = s.index;
results.push(s.segment);
}
assertTrue(pos < text.length);
assertEquals(text, results.join(""));
}