ICU-9601 from-UTF-8 m:n conversion: properly revert to pivoting for m:n matching

X-SVN-Rev: 32529
This commit is contained in:
Markus Scherer 2012-10-05 20:12:49 +00:00
parent 5786467fb0
commit a2eca4547b
6 changed files with 93 additions and 10 deletions

View File

@ -1,7 +1,7 @@
/*
******************************************************************************
*
* Copyright (C) 1998-2011, International Business Machines
* Copyright (C) 1998-2012, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
@ -2851,14 +2851,12 @@ ucnv_fromUCountPending(const UConverter* cnv, UErrorCode* status)
return -1;
}
if(cnv->preFromULength > 0){
if(cnv->preFromUFirstCP >= 0){
return U16_LENGTH(cnv->preFromUFirstCP)+cnv->preFromULength ;
}else if(cnv->preFromULength < 0){
return -cnv->preFromULength ;
}else if(cnv->fromUChar32 > 0){
return 1;
}else if(cnv->preFromUFirstCP >0){
return U16_LENGTH(cnv->preFromUFirstCP);
}
return 0;

View File

@ -5122,6 +5122,7 @@ moreBytes:
* but then exit the loop because the extension match would
* have consumed the source.
*/
*pErrorCode=U_USING_DEFAULT_WARNING;
break;
} else {
/* a mapping was written to the target, continue */
@ -5142,7 +5143,9 @@ moreBytes:
* to stop before a truncated sequence.
* If so, then collect the truncated sequence now.
*/
if(U_SUCCESS(*pErrorCode) && source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
if(U_SUCCESS(*pErrorCode) &&
cnv->preFromUFirstCP<0 &&
source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
c=utf8->toUBytes[0]=b=*source++;
toULength=1;
toULimit=utf8_countTrailBytes[b]+1;
@ -5428,6 +5431,7 @@ unassigned:
* but then exit the loop because the extension match would
* have consumed the source.
*/
*pErrorCode=U_USING_DEFAULT_WARNING;
break;
} else {
/* a mapping was written to the target, continue */
@ -5449,7 +5453,9 @@ unassigned:
* to stop before a truncated sequence.
* If so, then collect the truncated sequence now.
*/
if(U_SUCCESS(*pErrorCode) && source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
if(U_SUCCESS(*pErrorCode) &&
cnv->preFromUFirstCP<0 &&
source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
c=utf8->toUBytes[0]=b=*source++;
toULength=1;
toULimit=utf8_countTrailBytes[b]+1;

View File

@ -1,6 +1,6 @@
#******************************************************************************
#
# Copyright (C) 1998-2011, International Business Machines
# Copyright (C) 1998-2012, International Business Machines
# Corporation and others. All Rights Reserved.
#
#******************************************************************************
@ -129,7 +129,7 @@ TESTDT=$(TESTDATA)
TEST_DAT_FILES=$(TESTBUILDDIR)/test.icu
TEST_SPP_FILES=$(TESTBUILDDIR)/nfscsi.spp $(TESTBUILDDIR)/nfscss.spp $(TESTBUILDDIR)/nfscis.spp $(TESTBUILDDIR)/nfsmxs.spp $(TESTBUILDDIR)/nfsmxp.spp
TEST_UCM_SOURCE= test1.ucm test1bmp.ucm test3.ucm test4.ucm test4x.ucm test5.ucm ibm9027.ucm
TEST_UCM_SOURCE= test1.ucm test1bmp.ucm test2.ucm test3.ucm test4.ucm test4x.ucm test5.ucm ibm9027.ucm
TEST_UCM_FILES=$(TEST_UCM_SOURCE:%=$(TESTSRCDATADIR)/data/%)
TEST_CNV_FILES=$(TEST_UCM_SOURCE:%.ucm=$(TESTBUILDDIR)/%.cnv)

View File

@ -1808,6 +1808,21 @@ conversion:table(nofallback) {
:intvector{ 0,0,0,0,0,1,2,2,2,3,3,3,3,3 },
:int{1}, :int{0}, "", "0", ""
}
// Bug #9601 direct-from-UTF-8 m:n Unicode:charset conversion.
{
"*test1bmp",
"uv",
:bin{ 08 },
:intvector{ 0 },
:int{1}, :int{0}, "", "?", ""
}
{
"*test2",
"\U00101234\U00050005",
:bin{ 0700010e05 },
:intvector{ 0,0,0,0,0 },
:int{1}, :int{0}, "", "?", ""
}
}
}

59
icu4c/source/test/testdata/test2.ucm vendored Normal file
View File

@ -0,0 +1,59 @@
# *******************************************************************************
# * Copyright (C) 2012, International Business Machines
# * Corporation and others. All Rights Reserved.
# *******************************************************************************
#
# test2.ucm
#
# Test file for MBCS conversion with two-byte (DBCS) codepage data.
# Also contains extension mappings (m:n), i.e. mappings between a sequence of
# code points and a sequence of codepage characters.
#
# NOTE(review): the trailing |n on each mapping is the UCM precision flag.
# Judging from the comments in this file, |3 marks toUnicode-only fallbacks;
# |0 is presumably a round-trip mapping and |1 a fromUnicode-only fallback --
# confirm against the ICU conversion data documentation.
<code_set_name> "test2"
<mb_cur_max> 2
<mb_cur_min> 1
<uconv_class> "MBCS"
<subchar> \x1A
# NOTE(review): state table. Byte 01 appears to lead into a second state whose
# valid trail bytes are 0a-0f; the other listed bytes (00, 05-09, 1a, ff)
# appear to be single-byte characters. This matches the mappings below, but
# confirm the exact meaning (including the ".p" action) against the ICU .ucm
# state-table syntax.
<icu:state> 0, 1:1, 5-9, 1a, ff
<icu:state> a-f.p
CHARMAP
# fromUnicode result is a zero byte for a code point other than U+0000
<U0040> \x00 |0
# nothing special
<U0065> \x05 |0
# extensions: one code point mapped to a sequence of codepage characters
<U00c0> \x05+\x01\x0d |0
<U00c0> \x05+\x01\x0e |3
<U00c0> \x05+\xff |3
# toUnicode result is fallback direct
<U0066> \x06 |3
# toUnicode result is direct non-BMP code point
<U101234> \x07 |0
<Ufebcd> \x08 |3
# extensions: m:n mappings that all start with U+101234 / \x07+\x00, so the
# match for U+101234 can only be decided after looking at the following code
# point. The U+101234 U+50005 entry is the one exercised by the Bug #9601
# test case for "*test2" (expected bytes 07 00 01 0e 05).
<U101234>+<U50005>+<U60006> \x07+\x00+\x01\x0f+\x09 |0
<U101234>+<U50005> \x07+\x00+\x01\x0e+\x05 |0
<U101234>+<U60006> \x07+\x00+\x01\x0f+\x06 |0
<U101234>+<U70007> \x07+\x00+\x01\x0f |1
#unassigned \x09
# extensions where the first code point is unassigned, for replay testing
# (the first mapping below is intentionally left commented out)
#<U00c4><U0300> \x09+\x09 |0
# NOTE(review): this entry lists its code points without '+' separators,
# unlike the entries above -- presumably an equivalent syntax; verify.
<U00c4><U00c4><U101234><U0005> \x05+\x01\x0c |0
# toUnicode result is surrogate pair: test real pair, single unit, unassigned
<U23456> \x01\x0a |0
<U000b> \x01\x0b |0
#unassigned \x01\x0c
<U34567> \x01\x0d |3
<U000e> \x01\x0e |3
#unassigned \x01\x0f
END CHARMAP

View File

@ -1,5 +1,5 @@
#**********************************************************************
#* Copyright (C) 1999-2010, International Business Machines Corporation
#* Copyright (C) 1999-2012, International Business Machines Corporation
#* and others. All Rights Reserved.
#**********************************************************************
#
@ -28,7 +28,7 @@ ALL : "$(TESTDATAOUT)\testdata.dat"
TEST_RES_FILES = $(TEST_RES_SOURCE:.txt=.res)
"$(TESTDATAOUT)\testdata.dat" : $(TEST_RES_FILES) "$(TESTDATABLD)\casing.res" "$(TESTDATABLD)\conversion.res" "$(TESTDATABLD)\icuio.res" "$(TESTDATABLD)\mc.res" "$(TESTDATABLD)\structLocale.res" "$(TESTDATABLD)\root.res" "$(TESTDATABLD)\sh.res" "$(TESTDATABLD)\sh_YU.res" "$(TESTDATABLD)\te.res" "$(TESTDATABLD)\te_IN.res" "$(TESTDATABLD)\te_IN_REVISED.res" "$(TESTDATABLD)\testaliases.res" "$(TESTDATABLD)\testtypes.res" "$(TESTDATABLD)\testempty.res" "$(TESTDATABLD)\iscii.res" "$(TESTDATABLD)\idna_rules.res" "$(TESTDATABLD)\DataDrivenCollationTest.res" "$(TESTDATABLD)\test.icu" "$(TESTDATABLD)\testtable32.res" "$(TESTDATABLD)\test1.cnv" "$(TESTDATABLD)\test1bmp.cnv" "$(TESTDATABLD)\test3.cnv" "$(TESTDATABLD)\test4.cnv" "$(TESTDATABLD)\test4x.cnv" "$(TESTDATABLD)\test5.cnv" "$(TESTDATABLD)\ibm9027.cnv" "$(TESTDATABLD)\nfscsi.spp" "$(TESTDATABLD)\nfscss.spp" "$(TESTDATABLD)\nfscis.spp" "$(TESTDATABLD)\nfsmxs.spp" "$(TESTDATABLD)\nfsmxp.spp" "$(TESTDATABLD)\testnorm.nrm"
"$(TESTDATAOUT)\testdata.dat" : $(TEST_RES_FILES) "$(TESTDATABLD)\casing.res" "$(TESTDATABLD)\conversion.res" "$(TESTDATABLD)\icuio.res" "$(TESTDATABLD)\mc.res" "$(TESTDATABLD)\structLocale.res" "$(TESTDATABLD)\root.res" "$(TESTDATABLD)\sh.res" "$(TESTDATABLD)\sh_YU.res" "$(TESTDATABLD)\te.res" "$(TESTDATABLD)\te_IN.res" "$(TESTDATABLD)\te_IN_REVISED.res" "$(TESTDATABLD)\testaliases.res" "$(TESTDATABLD)\testtypes.res" "$(TESTDATABLD)\testempty.res" "$(TESTDATABLD)\iscii.res" "$(TESTDATABLD)\idna_rules.res" "$(TESTDATABLD)\DataDrivenCollationTest.res" "$(TESTDATABLD)\test.icu" "$(TESTDATABLD)\testtable32.res" "$(TESTDATABLD)\test1.cnv" "$(TESTDATABLD)\test1bmp.cnv" "$(TESTDATABLD)\test2.cnv" "$(TESTDATABLD)\test3.cnv" "$(TESTDATABLD)\test4.cnv" "$(TESTDATABLD)\test4x.cnv" "$(TESTDATABLD)\test5.cnv" "$(TESTDATABLD)\ibm9027.cnv" "$(TESTDATABLD)\nfscsi.spp" "$(TESTDATABLD)\nfscss.spp" "$(TESTDATABLD)\nfscis.spp" "$(TESTDATABLD)\nfsmxs.spp" "$(TESTDATABLD)\nfsmxp.spp" "$(TESTDATABLD)\testnorm.nrm"
@echo Building test data
@copy "$(TESTDATABLD)\te.res" "$(TESTDATAOUT)\$(TESTDT)\nam.typ"
@copy "$(TESTDATA)\old_l_testtypes.res" "$(TESTDATABLD)"
@ -55,6 +55,7 @@ iscii.res
test.icu
test1.cnv
test1bmp.cnv
test2.cnv
test3.cnv
test4.cnv
test4x.cnv
@ -133,6 +134,10 @@ $(TEST_RES_FILES:.res =.res
@echo Building $@
@"$(ICUTOOLS)\makeconv\$(CFG)\makeconv" --small -d"$(TESTDATABLD)" $**
"$(TESTDATABLD)\test2.cnv": "$(TESTDATA)\test2.ucm"
@echo Building $@
@"$(ICUTOOLS)\makeconv\$(CFG)\makeconv" --small -d"$(TESTDATABLD)" $**
"$(TESTDATABLD)\test3.cnv": "$(TESTDATA)\test3.ucm"
@echo Building $@
@"$(ICUTOOLS)\makeconv\$(CFG)\makeconv" --small -d"$(TESTDATABLD)" $**