v8/test/intl/segmenter/segment-sentence.js
Frank Tang 4f87e1a045 Reland "[Intl] Sync Intl.Segmenter to latest version"
This is a reland of 482c3bbf1e

Original change's description:
> [Intl] Sync Intl.Segmenter to latest version
> 
> https://tc39.es/proposal-intl-segmenter/
> 
> TC39 passed Intl.Segmenter to stage 3 in Jul 21.
> This CL move our earlier prototype to the current spec.
> 
> Bug: v8:6891
> Change-Id: I07234beed54f671c26bdbfb3983c5bc2fa5a29b0
> Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2219413
> Reviewed-by: Tobias Tebbi <tebbi@chromium.org>
> Reviewed-by: Frank Tang <ftang@chromium.org>
> Reviewed-by: Jakob Kummerow <jkummerow@chromium.org>
> Commit-Queue: Frank Tang <ftang@chromium.org>
> Cr-Commit-Position: refs/heads/master@{#69080}

Bug: v8:6891
Change-Id: Ie3a02d8ddf6f95f0632f97b38b613b185abeb592
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2321118
Reviewed-by: Frank Tang <ftang@chromium.org>
Reviewed-by: Tobias Tebbi <tebbi@chromium.org>
Commit-Queue: Frank Tang <ftang@chromium.org>
Cr-Commit-Position: refs/heads/master@{#69153}
2020-07-30 17:32:20 +00:00

44 lines
2.6 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// Copyright 2018 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Flags: --harmony-intl-segmenter
const seg = new Intl.Segmenter([], {granularity: "sentence"})
for (const text of [
"Hello world!", // English
" Hello world! ", // English with space before/after
" Hello world? Foo bar!", // English
"Jedovatou mambu objevila žena v zahrádkářské kolonii.", // Czech
"Việt Nam: Nhất thể hóa sẽ khác Trung Quốc?", // Vietnamese
"Σοβαρές ενστάσεις Κομισιόν για τον προϋπολογισμό της Ιταλίας", // Greek
"Решение Индии о покупке российских С-400 расценили как вызов США", // Russian
"הרופא שהציל נשים והנערה ששועבדה ע", // Hebrew,
"ترامب للملك سلمان: أنا جاد للغاية.. عليك دفع المزيد", // Arabic
"भारत की एस 400 मिसाइल के मुकाबले पाक की थाड, जानें कौन कितना ताकतवर", // Hindi
"ரெட் அலர்ட் எச்சரிக்கை; புதுச்சேரியில் நாளை அரசு விடுமுறை!", // Tamil
"'ఉత్తర్వులు అందే వరకు ఓటర్ల తుది జాబితాను వెబ్‌సైట్లో పెట్టవద్దు'", // Telugu
"台北》抹黑柯P失敗朱學恒酸姚文智氣pupu嗆大老闆", // Chinese
"วัดไทรตีระฆังเบาลงช่วงเข้าพรรษา เจ้าอาวาสเผยคนร้องเรียนรับผลกรรมแล้ว", // Thai
"九州北部の一部が暴風域に入りました(日直予報士 2018年10月06日) - 日本気象協会 tenki.jp", // Japanese
"법원 “다스 지분 처분권·수익권 모두 MB가 보유”", // Korean
]) {
const segments = seg.segment(text);
let results = [];
var pos = -1;
for (let s of segments) {
assertEquals(["segment", "index", "input"], Object.keys(s));
assertEquals(typeof s.index, "number");
assertEquals(typeof s.segment, "string");
assertEquals(typeof s.input, "string");
assertEquals(text, s.input);
assertEquals(text.substring(s.index, s.index + s.segment.length),
s.segment);
assertTrue(pos < s.index);
pos = s.index;
results.push(s.segment);
}
assertTrue(pos < text.length);
assertEquals(text, results.join(""));
}