diff --git a/icu4c/source/data/brkitr/line.txt b/icu4c/source/data/brkitr/line.txt index 515c52a36d..538e0a0b77 100644 --- a/icu4c/source/data/brkitr/line.txt +++ b/icu4c/source/data/brkitr/line.txt @@ -93,10 +93,11 @@ $ZW = [:LineBreak = ZWSpace:]; # # Rule LB1. By default, treat AI (characters with ambiguous east Asian width), # SA (South East Asian: Thai, Lao, Khmer) +# SG (Unpaired Surrogates) # XX (Unknown, unassigned) # as $AL (Alphabetic) # -$ALPlus = [$AL $AI $SA $XX]; +$ALPlus = [$AL $AI $SA $SG $XX]; # # Combining Marks. X $CM* behaves as if it were X. Rule LB6. diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp index 43d82bdc2c..9c6a794a18 100644 --- a/icu4c/source/test/intltest/rbbitst.cpp +++ b/icu4c/source/test/intltest/rbbitst.cpp @@ -2662,6 +2662,7 @@ RBBILineMonkey::RBBILineMonkey() fAL = new UnicodeSet("[\\p{Line_break=AL}]", status); fID = new UnicodeSet("[\\p{Line_break=ID}]", status); fSA = new UnicodeSet("[\\p{Line_break=SA}]", status); + fSG = new UnicodeSet("[\\ud800-\\udfff]", status); fXX = new UnicodeSet("[\\p{Line_break=XX}]", status); if (U_FAILURE(status)) { @@ -2674,6 +2675,7 @@ RBBILineMonkey::RBBILineMonkey() fAL->addAll(*fXX); // Default behavior for XX is identical to AL fAL->addAll(*fAI); // Default behavior for AI is identical to AL fAL->addAll(*fSA); // Default behavior for SA is XX, which defaults to AL + fAL->addAll(*fSG); // Default behavior for SG is identical to AL. fSets->addElement(fBK, status); fSets->addElement(fCR, status); @@ -2710,6 +2712,7 @@ RBBILineMonkey::RBBILineMonkey() fSets->addElement(fID, status); fSets->addElement(fWJ, status); fSets->addElement(fSA, status); + fSets->addElement(fSG, status); fNumberMatcher = new RegexMatcher( "(\\p{Line_Break=PR}\\p{Line_Break=CM}*)?" @@ -3159,6 +3162,7 @@ RBBILineMonkey::~RBBILineMonkey() { delete fAL; delete fID; delete fSA; + delete fSG; delete fXX; delete fCharBI;